Skip to content

Commit aa8ff15

Browse files
authored
bugfix/unique-temp-filenames (#225)
* Add transcription range fields to database and ingestion models, add validator for time duration, add range filter to ffmpeg audio split, update tests * Add tests for edge case transcription ranges * Fix end_time=start_time typo, update tests to be more concise, add ffmpeg error logging * Updated transcription range to video range, updated video handling to host when limiting video to a range, updated mp4 conversion to allow a range, connected mp4 to clip functionality, updated tests and tried to make testing slightly more consistent, added Session ingestion verification to test out * Update session hash to reflect trimmed video * Bypass hash task * Remove unnecessary logging, duration validation comments, elif typo fix in cdp_will_host control structure * Reverted function parameter doc for split audio * Improved documentation for video_start_time in ingestion_models * Use content hash to name videos and prevent collisions across sessions * Use pathlib functions from 3.8, add type annotations for mock function * Add return type annotations for mock function * Lint updates * Move file renaming to earliest point, return unique names from file conversion functions * UUID for original file resource copy task so to prevent collisions earlier in the process * Minor comment change to force CI reprocessing * Stop renaming resource within task * Lint * Flag for adding source suffix in resource copy * Log file status after copy for debugging * Logging to debug file copy * Logging to debug file copy * Logging to debug file copy * Logging to debug file copy * Remove rename
1 parent b00f378 commit aa8ff15

File tree

4 files changed

+171
-24
lines changed

4 files changed

+171
-24
lines changed

cdp_backend/pipeline/event_gather_pipeline.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from operator import attrgetter
88
from pathlib import Path
99
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple, Union
10+
from uuid import uuid4
1011

1112
from aiohttp.client_exceptions import ClientResponseError
1213
from fireo.fields.errors import FieldValidationFailed, InvalidFieldType, RequiredField
@@ -127,8 +128,13 @@ def create_event_gather_flow(
127128
for event in events:
128129
session_processing_results: List[SessionProcessingResult] = []
129130
for session in event.sessions:
130-
# Download video to local copy
131-
resource_copy_filepath = resource_copy_task(uri=session.video_uri)
131+
# Download video to local copy making
132+
# copy unique in case of shared session video
133+
resource_copy_filepath = resource_copy_task(
134+
uri=session.video_uri,
135+
dst=f"{str(uuid4())}_temp",
136+
copy_suffix=True,
137+
)
132138

133139
# Handle video conversion or non-secure resource
134140
# hosting
@@ -229,7 +235,7 @@ def create_event_gather_flow(
229235

230236

231237
@task(max_retries=3, retry_delay=timedelta(seconds=120))
232-
def resource_copy_task(uri: str) -> str:
238+
def resource_copy_task(uri: str, dst: str = None, copy_suffix: bool = None) -> str:
233239
"""
234240
Copy a file to a temporary location for processing.
235241
@@ -250,6 +256,8 @@ def resource_copy_task(uri: str) -> str:
250256
"""
251257
return file_utils.resource_copy(
252258
uri=uri,
259+
dst=dst,
260+
copy_suffix=copy_suffix,
253261
overwrite=True,
254262
)
255263

cdp_backend/tests/pipeline/test_event_gather_pipeline.py

+22-16
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pathlib import Path
88
from typing import List, Optional
99
from unittest import mock
10-
from unittest.mock import MagicMock
10+
from unittest.mock import MagicMock, patch
1111

1212
import pytest
1313
from prefect import Flow
@@ -577,6 +577,10 @@ def test_store_event_processing_results(
577577
EXISTING_REMOTE_M3U8_MINIMAL_EVENT.sessions[0].video_uri = EXAMPLE_M3U8_PLAYLIST_URI
578578

579579

580+
def path_rename(self: Path, newPath: Path) -> Path:
581+
return newPath
582+
583+
580584
@mock.patch(f"{PIPELINE_PATH}.fs_functions.upload_file")
581585
@mock.patch(f"{PIPELINE_PATH}.fs_functions.get_open_url_for_gcs_file")
582586
@mock.patch(f"{PIPELINE_PATH}.fs_functions.remove_local_file")
@@ -633,20 +637,22 @@ def test_convert_video_and_handle_host(
633637
mock_convert_video_to_mp4.return_value = expected_filepath
634638
mock_hash_file_contents.return_value = "abc123"
635639

636-
(
637-
mp4_filepath,
638-
session_video_hosted_url,
639-
session_content_hash,
640-
) = pipeline.convert_video_and_handle_host.run(
641-
video_filepath=video_filepath,
642-
session=session,
643-
credentials_file="fake/credentials.json",
644-
bucket="doesnt://matter",
645-
)
640+
with patch.object(Path, "rename", path_rename):
641+
642+
(
643+
mp4_filepath,
644+
session_video_hosted_url,
645+
session_content_hash,
646+
) = pipeline.convert_video_and_handle_host.run(
647+
video_filepath=video_filepath,
648+
session=session,
649+
credentials_file="fake/credentials.json",
650+
bucket="doesnt://matter",
651+
)
646652

647-
# Make sure mp4 files don't go through conversion
648-
if Path(video_filepath).suffix == ".mp4":
649-
assert not mock_convert_video_to_mp4.called
653+
# Make sure mp4 files don't go through conversion
654+
if Path(video_filepath).suffix == ".mp4":
655+
assert not mock_convert_video_to_mp4.called
650656

651-
assert mp4_filepath == expected_filepath
652-
assert session_video_hosted_url == expected_hosted_video_url
657+
assert mp4_filepath == str(Path(video_filepath).with_suffix(".mp4"))
658+
assert session_video_hosted_url == expected_hosted_video_url

cdp_backend/tests/utils/test_file_utils.py

+52
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,58 @@
3535
#############################################################################
3636

3737

38+
@pytest.mark.parametrize(
39+
"path, stem, expected_result",
40+
[
41+
(Path("file.ext"), "new", "new.ext"),
42+
],
43+
)
44+
def test_with_stem(path: Path, stem: str, expected_result: str) -> None:
45+
new_path = file_utils.with_stem(path, stem)
46+
assert str(new_path) == expected_result
47+
48+
49+
@pytest.mark.parametrize(
50+
"path, addition, expected_result",
51+
[
52+
(Path("file.ext"), "-new", "file-new.ext"),
53+
],
54+
)
55+
def test_append_to_stem(path: Path, addition: str, expected_result: str) -> None:
56+
new_path = file_utils.append_to_stem(path, addition)
57+
assert str(new_path) == expected_result
58+
59+
60+
@pytest.mark.parametrize(
61+
"path, stem, expected_result",
62+
[
63+
(Path("file.ext"), "new", "new.ext"),
64+
],
65+
)
66+
def test_rename_with_stem(path: Path, stem: str, expected_result: str) -> None:
67+
file = open(path, "w")
68+
file.close()
69+
new_path = file_utils.rename_with_stem(path, stem)
70+
assert str(new_path) == expected_result
71+
assert new_path.exists()
72+
os.remove(new_path)
73+
74+
75+
@pytest.mark.parametrize(
76+
"path, addition, expected_result",
77+
[
78+
(Path("file.ext"), "-new", "file-new.ext"),
79+
],
80+
)
81+
def test_rename_append_to_stem(path: Path, addition: str, expected_result: str) -> None:
82+
file = open(path, "w")
83+
file.close()
84+
new_path = file_utils.rename_append_to_stem(path, addition)
85+
assert str(new_path) == expected_result
86+
assert new_path.exists()
87+
os.remove(new_path)
88+
89+
3890
@pytest.mark.parametrize(
3991
"uri, expected_result",
4092
[

cdp_backend/utils/file_utils.py

+86-5
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,82 @@
2828
MAX_THUMBNAIL_WIDTH = 960
2929

3030

31+
def with_stem(path: Path, stem: str) -> Path:
32+
"""
33+
Create a path with a new stem
34+
35+
Parameters
36+
----------
37+
path: Path
38+
The path to alter
39+
stem: str
40+
The string to be the new stem of the path
41+
42+
Returns
43+
-------
44+
path: Path
45+
The new path with the replaced stem
46+
"""
47+
return path.with_name(f"{stem}{path.suffix}")
48+
49+
50+
def append_to_stem(path: Path, addition: str) -> Path:
51+
"""
52+
Rename a file with a string appended to the path stem
53+
54+
Parameters
55+
----------
56+
path: Path
57+
The path to alter
58+
addition: str
59+
The string to be appended to the path stem
60+
61+
Returns
62+
-------
63+
path: Path
64+
The new path with the stem addition
65+
"""
66+
return with_stem(path, f"{path.stem}{addition}")
67+
68+
69+
def rename_with_stem(path: Path, stem: str) -> Path:
70+
"""
71+
Rename a file with a string appended to the path stem
72+
73+
Parameters
74+
----------
75+
path: Path
76+
The path to be renamed
77+
stem: str
78+
The string to become the new stem
79+
80+
Returns
81+
-------
82+
path: Path
83+
The new path of the renamed file
84+
"""
85+
return path.rename(with_stem(path, stem))
86+
87+
88+
def rename_append_to_stem(path: Path, addition: str) -> Path:
89+
"""
90+
Rename a file with a string appended to the path stem
91+
92+
Parameters
93+
----------
94+
path: Path
95+
The path to be renamed
96+
addition: str
97+
The string to be appended to the path stem
98+
99+
Returns
100+
-------
101+
path: Path
102+
The new path of the renamed file
103+
"""
104+
return path.rename(append_to_stem(path, addition))
105+
106+
31107
def get_media_type(uri: str) -> Optional[str]:
32108
"""
33109
Get the IANA media type for the provided URI.
@@ -69,6 +145,7 @@ def get_media_type(uri: str) -> Optional[str]:
69145
def resource_copy(
70146
uri: str,
71147
dst: Optional[Union[str, Path]] = None,
148+
copy_suffix: Optional[bool] = False,
72149
overwrite: bool = False,
73150
) -> str:
74151
"""
@@ -90,6 +167,7 @@ def resource_copy(
90167
saved_path: str
91168
The path of where the resource ended up getting copied to.
92169
"""
170+
uri_suffix = Path(uri.split("/")[-1].split("?")[0].split("#")[0]).suffix
93171
if dst is None:
94172
dst = uri.split("/")[-1]
95173

@@ -103,10 +181,13 @@ def resource_copy(
103181
# Split by the last "/"
104182
dst = dst / uri.split("/")[-1]
105183

184+
if copy_suffix:
185+
dst = dst.with_suffix(uri_suffix)
186+
106187
# Ensure filename is less than 255 chars
107188
# Otherwise this can raise an OSError for too long of a filename
108189
if len(dst.name) > 255:
109-
dst = Path(str(dst)[:255])
190+
dst = with_stem(dst, dst.stem[: (255 - len(dst.suffix))])
110191

111192
# Ensure dest isn't a file
112193
if dst.is_file() and not overwrite:
@@ -148,6 +229,7 @@ def resource_copy(
148229
# It was added because it's very common for SSL certs to be bad
149230
# See: https://github.com/CouncilDataProject/cdp-scrapers/pull/85
150231
# And: https://github.com/CouncilDataProject/seattle/runs/5957646032
232+
151233
with open(dst, "wb") as open_dst:
152234
open_dst.write(
153235
requests.get(
@@ -520,7 +602,6 @@ def convert_video_to_mp4(
520602
The end time to trim the video in HH:MM:SS.
521603
output_path: Path
522604
The output path to place the clip at.
523-
Must include a suffix to use for the reformatting.
524605
525606
Returns
526607
-------
@@ -605,7 +686,7 @@ def clip_and_reformat_video(
605686
video_filepath: Path,
606687
start_time: Optional[str],
607688
end_time: Optional[str],
608-
output_path: Path = Path("clipped.mp4"),
689+
output_path: Path = None,
609690
output_format: str = "mp4",
610691
) -> Path:
611692
"""
@@ -621,8 +702,6 @@ def clip_and_reformat_video(
621702
The end time of the clip in HH:MM:SS.
622703
output_path: Path
623704
The output path to place the clip at.
624-
Must include a suffix to use for the reformatting.
625-
Default: "clipped.mp4"
626705
output_format: str
627706
The output format.
628707
Default: "mp4"
@@ -634,6 +713,8 @@ def clip_and_reformat_video(
634713
"""
635714
import ffmpeg
636715

716+
output_path = output_path or append_to_stem(video_filepath, "_clipped")
717+
637718
try:
638719
ffmpeg_stdout, ffmpeg_stderr = (
639720
ffmpeg.input(

0 commit comments

Comments
 (0)