Skip to content

Commit b00f378

Browse files
authored
feature/add-video-trimming-via-transcription-ranges-to-session (#221)
* Add transcription range fields to database and ingestion models, add validator for time duration, add range filter to ffmpeg audio split, update tests * Add tests for edge case transcription ranges * Fix end_time=start_time typo, update tests to be more concise, add ffmpeg error logging * Updated transcription range to video range, updated video handling to host when limiting video to a range, updated mp4 conversion to allow a range, connected mp4 to clip functionality, updated tests and tried to make testing slightly more consistent, added Session ingestion verification to test out * Update session hash to reflect trimmed video * Bypass hash task * Remove unnecessary logging, duration validation comments, elif typo fix in cdp_will_host control structure * Reverted function parameter doc for split audio * Improved documentation for video_start_time in ingestion_models
1 parent 2d080b4 commit b00f378

9 files changed

+240
-60
lines changed

cdp_backend/database/models.py

+4
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,8 @@ class Session(Model):
494494
session_index = fields.NumberField(required=True)
495495
session_content_hash = fields.TextField(required=True)
496496
video_uri = fields.TextField(required=True, validator=validators.resource_exists)
497+
# Optional trim start; validator passed by keyword, as for video_uri and caption_uri.
video_start_time = fields.TextField(validator=validators.time_duration_is_valid)
498+
# Optional trim end; validator passed by keyword, as for video_uri and caption_uri.
video_end_time = fields.TextField(validator=validators.time_duration_is_valid)
497499
caption_uri = fields.TextField(validator=validators.resource_exists)
498500
external_source_id = fields.TextField()
499501

@@ -515,6 +517,8 @@ def Example(cls) -> Model:
515517
session.video_uri = (
516518
"https://video.seattle.gov/media/council/brief_072219_2011957V.mp4"
517519
)
520+
session.video_start_time = "01:00:00"
521+
session.video_end_time = "99:59:59"
518522
session.session_content_hash = (
519523
"05bd857af7f70bf51b6aac1144046973bf3325c9101a554bc27dc9607dbbd8f5"
520524
)

cdp_backend/database/validators.py

+28
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,34 @@ def router_string_is_valid(router_string: Optional[str]) -> bool:
7272
return False
7373

7474

75+
def time_duration_is_valid(time_duration: Optional[str]) -> bool:
    """
    Validate that the provided time duration string is acceptable to FFmpeg.

    Only a subset of the FFmpeg time duration syntax is accepted; the spec
    itself is a little more flexible. The accepted forms are:

    * ``S`` or ``SS`` (seconds, 0-59)
    * ``M:SS`` or ``MM:SS`` (minutes 0-59, two-digit seconds)
    * ``H:MM:SS`` or ``HH:MM:SS`` (hours 0-99, two-digit minutes and seconds)

    None is a valid option.

    Parameters
    ----------
    time_duration: Optional[str]
        The time duration to validate.

    Returns
    -------
    status: bool
        The validation status.
    """
    if time_duration is None:
        return True

    # [[H]H:][[M]M:][S]S
    # fullmatch is used instead of match + "$" because "$" also matches just
    # before a trailing newline, which would let "00:10\n" through.
    pattern = r"((((\d{1,2}:)?[0-5])?\d:)?[0-5])?\d"
    return re.fullmatch(pattern, time_duration) is not None
101+
102+
75103
def email_is_valid(email: Optional[str]) -> bool:
76104
"""
77105
Validate that a valid email was provided.

cdp_backend/pipeline/event_gather_pipeline.py

+32-13
Original file line numberDiff line numberDiff line change
@@ -130,18 +130,13 @@ def create_event_gather_flow(
130130
# Download video to local copy
131131
resource_copy_filepath = resource_copy_task(uri=session.video_uri)
132132

133-
# Get unique session identifier
134-
session_content_hash = get_session_content_hash(
135-
tmp_video_filepath=resource_copy_filepath,
136-
)
137-
138133
# Handle video conversion or non-secure resource
139134
# hosting
140135
(
141136
tmp_video_filepath,
142137
session_video_hosted_url,
138+
session_content_hash,
143139
) = convert_video_and_handle_host(
144-
session_content_hash=session_content_hash,
145140
video_filepath=resource_copy_filepath,
146141
session=session,
147142
credentials_file=config.google_credentials_file,
@@ -293,14 +288,13 @@ def get_session_content_hash(
293288
return file_utils.hash_file_contents(uri=tmp_video_filepath)
294289

295290

296-
@task(nout=2)
291+
@task(nout=3)
297292
def convert_video_and_handle_host(
298-
session_content_hash: str,
299293
video_filepath: str,
300294
session: Session,
301295
credentials_file: str,
302296
bucket: str,
303-
) -> Tuple[str, str]:
297+
) -> Tuple[str, str, str]:
304298
"""
305299
Convert a video to MP4 (if necessary), upload it to the file store, and remove
306300
the original non-MP4 file that was resource copied.
@@ -330,19 +324,41 @@ def convert_video_and_handle_host(
330324
# Get file extension
331325
ext = Path(video_filepath).suffix.lower()
332326

327+
trim_video = bool(session.video_start_time or session.video_end_time)
328+
333329
# Convert to mp4 if file isn't of approved web format
334330
cdp_will_host = False
335331
if ext not in [".mp4", ".webm"]:
336332
cdp_will_host = True
337333

338334
# Convert video to mp4
339-
mp4_filepath = file_utils.convert_video_to_mp4(video_filepath)
335+
mp4_filepath = file_utils.convert_video_to_mp4(
336+
video_filepath=Path(video_filepath),
337+
start_time=session.video_start_time,
338+
end_time=session.video_end_time,
339+
)
340340

341-
# Remove old mkv file
342341
fs_functions.remove_local_file(video_filepath)
343342

344343
# Update variable name for easier downstream typing
345-
video_filepath = mp4_filepath
344+
video_filepath = str(mp4_filepath)
345+
346+
# host trimmed videos because it's simpler than setting
347+
# up transcription and playback ranges
348+
elif trim_video:
349+
cdp_will_host = True
350+
351+
# Trim video
352+
trimmed_filepath = file_utils.clip_and_reformat_video(
353+
video_filepath=Path(video_filepath),
354+
start_time=session.video_start_time,
355+
end_time=session.video_end_time,
356+
)
357+
358+
fs_functions.remove_local_file(video_filepath)
359+
360+
# Update variable name for easier downstream typing
361+
video_filepath = str(trimmed_filepath)
346362

347363
# Check if original session video uri is a m3u8
348364
# We can't follow the normal convert video process from above
@@ -370,6 +386,9 @@ def convert_video_and_handle_host(
370386
else:
371387
hosted_video_media_url = session.video_uri
372388

389+
# Get unique session identifier
390+
session_content_hash = file_utils.hash_file_contents(uri=video_filepath)
391+
373392
# Upload and swap if cdp is hosting
374393
if cdp_will_host:
375394
# Upload to gcsfs
@@ -387,7 +406,7 @@ def convert_video_and_handle_host(
387406
uri=hosted_video_uri,
388407
)
389408

390-
return video_filepath, hosted_video_media_url
409+
return video_filepath, hosted_video_media_url, session_content_hash
391410

392411

393412
@task

cdp_backend/pipeline/ingestion_models.py

+27
Original file line numberDiff line numberDiff line change
@@ -134,14 +134,39 @@ class Session(IngestionModel, DataClassJsonMixin):
134134
"""
135135
A session is a working period for an event.
136136
For example, an event could have a morning and afternoon session.
137+
138+
Notes
139+
-----
140+
video_start_time is a duration relative to the beginning of the video in
141+
HH:MM:SS format. It does not affect nor is relative to session_datetime
142+
or any other datetime. If the portion of the video relevant to the session
143+
begins 37m50s into the full video, video_start_time will be "37:50".
144+
An absent start time is equivalent to the beginning of the video, and an
145+
absent end time is equivalent to the end of the video, so either can be omitted.
137146
"""
138147

139148
session_datetime: datetime
140149
video_uri: str
141150
session_index: int
151+
video_start_time: Optional[str] = None
152+
video_end_time: Optional[str] = None
142153
caption_uri: Optional[str] = None
143154
external_source_id: Optional[str] = None
144155

156+
def __post_init__(self) -> None:
    """
    Check that the trim range is ordered when both ends are supplied.

    Nothing is checked when either video_start_time or video_end_time is
    missing (or empty).

    Raises
    ------
    ValueError
        If both times are set and video_start_time is not strictly before
        video_end_time, or if a component of either time is not an integer.
    """
    begin_text, finish_text = self.video_start_time, self.video_end_time

    # Only a complete start/end pair can be compared
    if not (begin_text and finish_text):
        return

    def _as_seconds(duration: str) -> int:
        # Prefixing "00:00:0" guarantees at least three ":"-separated
        # components, so the flexible input format [h[h:[m[m:[s]]]]]s can
        # always be read from the right as seconds, minutes, hours.
        parts = [int(piece) for piece in ("00:00:0" + duration).split(":")]
        secs, mins, hours = parts[-1], parts[-2], parts[-3]
        return secs + mins * 60 + hours * 3600

    begin_seconds = _as_seconds(begin_text)
    finish_seconds = _as_seconds(finish_text)

    if begin_seconds >= finish_seconds:
        raise ValueError("start_time must be less than end_time if both exist")
169+
145170

146171
@dataclass
147172
class Body(IngestionModel, DataClassJsonMixin):
@@ -263,6 +288,8 @@ class EventIngestionModel(IngestionModel, DataClassJsonMixin):
263288
video_uri=(
264289
"https://video.seattle.gov/media/council/council_113020_2022091V.mp4"
265290
),
291+
video_start_time=("00:00:00"),
292+
video_end_time=("99:59:59"),
266293
caption_uri=(
267294
"https://www.seattlechannel.org/documents/seattlechannel/closedcaption/2020/council_113020_2022091.vtt" # noqa: E501
268295
),

cdp_backend/pipeline/mock_get_events.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -54,20 +54,33 @@
5454
(
5555
"https://video.seattle.gov/media/council/council_010421_2022101V.mp4",
5656
"https://www.seattlechannel.org/documents/seattlechannel/closedcaption/2021/council_010421_2022101.vtt", # noqa
57+
None,
58+
None,
5759
),
5860
(
5961
"https://video.seattle.gov/media/council/council_113020_2022091V.mp4",
6062
"https://www.seattlechannel.org/documents/seattlechannel/closedcaption/2020/council_113020_2022091.vtt", # noqa
63+
"1",
64+
"25:25",
6165
),
6266
(
6367
"https://video.seattle.gov/media/council/council_112320_2022089V.mp4",
6468
"https://www.seattlechannel.org/documents/seattlechannel/closedcaption/2020/brief_112320_2012089.vtt", # noqa
69+
None,
70+
"2:58:14",
6571
),
6672
(
6773
"https://video.seattle.gov/media/council/council_110920_2022085V.mp4",
6874
"https://www.seattlechannel.org/documents/seattlechannel/closedcaption/2020/council_110920_2022085.vtt", # noqa
75+
"1",
76+
None,
77+
),
78+
(
79+
"https://video.seattle.gov/media/council/council_101220_2022077V.mp4",
80+
None,
81+
None,
82+
None,
6983
),
70-
("https://video.seattle.gov/media/council/council_101220_2022077V.mp4", None),
7184
]
7285

7386

@@ -121,6 +134,8 @@ def _get_example_event() -> EventIngestionModel:
121134
session_datetime=datetime.utcnow() + (i * timedelta(hours=3)),
122135
session_index=i,
123136
video_uri=session[0],
137+
video_start_time=session[2],
138+
video_end_time=session[3],
124139
caption_uri=session[1],
125140
)
126141
for i, session in enumerate(random.sample(SESSIONS, random.randint(1, 3)))

cdp_backend/tests/database/test_validators.py

+33
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,39 @@ def test_router_string_is_valid(router_string: str, expected_result: bool) -> No
4848
assert actual_result == expected_result
4949

5050

51+
@pytest.mark.parametrize(
    "time_duration, expected_result",
    [
        # No value at all is accepted
        (None, True),
        # Well-formed durations
        ("1", True),
        ("11", True),
        ("1:11", True),
        ("11:11", True),
        ("1:11:11", True),
        ("99:59:59", True),
        # Zero in every accepted shape
        ("0", True),
        ("00", True),
        ("0:00", True),
        ("00:00", True),
        ("0:00:00", True),
        ("00:00:00", True),
        # Wrong number of digits in a component
        ("111", False),
        ("11:1", False),
        ("111:11", False),
        ("11:1:11", False),
        ("11:11:1", False),
        ("111:11:11", False),
        # Minutes or seconds out of range
        ("60", False),
        ("60:00", False),
        ("1:60:00", False),
        ("1:00:60", False),
    ],
)
def test_time_duration_is_valid(time_duration: str, expected_result: bool) -> None:
    """Check the validator's verdict for each sample duration."""
    assert validators.time_duration_is_valid(time_duration) == expected_result
82+
83+
5184
@pytest.mark.parametrize(
5285
"email, expected_result",
5386
[

cdp_backend/tests/pipeline/test_event_gather_pipeline.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,7 @@ def test_store_event_processing_results(
580580
@mock.patch(f"{PIPELINE_PATH}.fs_functions.upload_file")
581581
@mock.patch(f"{PIPELINE_PATH}.fs_functions.get_open_url_for_gcs_file")
582582
@mock.patch(f"{PIPELINE_PATH}.fs_functions.remove_local_file")
583+
@mock.patch(f"{PIPELINE_PATH}.file_utils.hash_file_contents")
583584
@mock.patch(f"{PIPELINE_PATH}.file_utils.convert_video_to_mp4")
584585
@pytest.mark.parametrize(
585586
"video_filepath, session, expected_filepath, expected_hosted_video_url",
@@ -618,6 +619,7 @@ def test_store_event_processing_results(
618619
)
619620
def test_convert_video_and_handle_host(
620621
mock_convert_video_to_mp4: MagicMock,
622+
mock_hash_file_contents: MagicMock,
621623
mock_remove_local_file: MagicMock,
622624
mock_generate_url: MagicMock,
623625
mock_upload_file: MagicMock,
@@ -629,12 +631,13 @@ def test_convert_video_and_handle_host(
629631
mock_upload_file.return_value = "file_store_uri"
630632
mock_generate_url.return_value = "hosted-video.mp4"
631633
mock_convert_video_to_mp4.return_value = expected_filepath
634+
mock_hash_file_contents.return_value = "abc123"
632635

633636
(
634637
mp4_filepath,
635638
session_video_hosted_url,
639+
session_content_hash,
636640
) = pipeline.convert_video_and_handle_host.run(
637-
session_content_hash="abc123",
638641
video_filepath=video_filepath,
639642
session=session,
640643
credentials_file="fake/credentials.json",

0 commit comments

Comments
 (0)