
Commit 703d7f1
Use requests stream and shutil.copyfileobj to constrain memory usage during resource copy (#236)
* Use requests stream and shutil.copyfileobj to constrain memory usage
  - When copying resources from a remote origin over HTTP(S), prefer to stream the response body and copy chunks into the destination file instead of loading the entire file into memory before writing.
* Fix format/linting
* Fix ruff import re-ordering
* Remove with; use finally to close response
* Add unit test for resource_copy from https origin
  - Add requests_mock dependency so that we can mock an https origin with an mp4 file.
1 parent 0982830

File tree

3 files changed (+31 -9 lines)

cdp_backend/tests/utils/test_file_utils.py (+14)
```diff
@@ -11,6 +11,7 @@
 
 import imageio
 import pytest
+import requests_mock
 
 from cdp_backend.utils import file_utils
 from cdp_backend.utils.file_utils import (
@@ -114,6 +115,19 @@ def test_resource_copy(tmpdir, example_video: Path) -> None:  # type: ignore
     resource_copy(str(example_video), save_path)
 
 
+def test_resource_copy_https(tmpdir, example_video: Path) -> None:  # type: ignore
+    with requests_mock.Mocker() as mock:
+        example_video_file = imageio.read(example_video)
+        mock.get("https://example.com/example_video.mp4", body=example_video_file)
+        dest = tmpdir / "example_video.mp4"
+        saved_path = resource_copy(
+            "https://example.com/example_video.mp4",
+            dest,
+        )
+        assert saved_path == dest
+        example_video_file.close()
+
+
 # Type ignore because changing tmpdir typing
 def test_hash_file_contents(tmpdir) -> None:  # type: ignore
     test_file = Path(tmpdir) / "a.txt"
```
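For context, a minimal sketch of the requests-mock pattern the new test relies on: registering a URL on a `Mocker` makes any `requests.get` against that URL return the canned response, even with `stream=True`. The URL and payload below are illustrative stand-ins, not taken from the commit.

```python
# Hypothetical sketch of the requests-mock pattern; the URL and payload are
# stand-ins. The real test passes body= with a file-like object so the
# mocked response is backed by a file, mirroring a streamed download.
import requests
import requests_mock

with requests_mock.Mocker() as mock:
    mock.get("https://example.com/file.bin", content=b"fake mp4 bytes")

    response = requests.get("https://example.com/file.bin", stream=True)
    assert response.content == b"fake mp4 bytes"
```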

cdp_backend/utils/file_utils.py (+16 -9)
```diff
@@ -6,6 +6,7 @@
 import math
 import random
 import re
+import shutil
 from hashlib import sha256
 from pathlib import Path
 from uuid import uuid4
@@ -225,21 +226,27 @@ def resource_copy(  # noqa: C901
         )
         return str(dst)
 
-    # Set custom timeout for http resources
+    # Common case: http(s) URI
     if uri.startswith("http"):
         # The verify=False is passed to any http URIs
         # It was added because it's very common for SSL certs to be bad
         # See: https://github.com/CouncilDataProject/cdp-scrapers/pull/85
         # And: https://github.com/CouncilDataProject/seattle/runs/5957646032
 
-        with open(dst, "wb") as open_dst:
-            open_dst.write(
-                requests.get(
-                    uri,
-                    verify=False,
-                    timeout=1800,
-                ).content
-            )
+        # Use stream=True to avoid downloading the entire file into memory
+        # See: https://github.com/CouncilDataProject/cdp-backend/issues/235
+        try:
+            # This response must be closed after the copy is done. But using
+            # `with requests.get() as response` fails mypy type checking.
+            # See: https://requests.readthedocs.io/en/latest/user/advanced/#body-content-workflow
+            response = requests.get(uri, stream=True, verify=False, timeout=1800)
+            response.raise_for_status()
+            with open(dst, "wb") as open_dst:
+                shutil.copyfileobj(
+                    response.raw, open_dst, length=64 * 1024 * 1024  # 64MB chunks
+                )
+        finally:
+            response.close()
 
     else:
         # TODO: Add explicit use of GCS credentials until public read is fixed
```
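One caveat with the new code path, worth hedging: `response.raw` is urllib3's undecoded wire stream, so if an origin served the file with `Content-Encoding: gzip`, `shutil.copyfileobj(response.raw, ...)` would write compressed bytes to disk. Video origins rarely compress mp4 responses, so this is unlikely to matter here, but for completeness here is a sketch of an `iter_content`-based alternative that decodes content encoding while still streaming. The helper name, signature, and chunk size are hypothetical, and the try/finally mirrors the commit's shape (it notes the context-manager form fails mypy).

```python
# Hypothetical alternative, not the commit's implementation: iter_content()
# decodes any Content-Encoding (e.g. gzip) while still streaming in chunks,
# whereas response.raw yields the undecoded bytes straight off the wire.
import requests


def stream_download(uri: str, dst: str, chunk_size: int = 64 * 1024 * 1024) -> str:
    response = requests.get(uri, stream=True, timeout=1800)
    try:
        response.raise_for_status()
        with open(dst, "wb") as open_dst:
            for chunk in response.iter_content(chunk_size=chunk_size):
                open_dst.write(chunk)
    finally:
        # Close explicitly rather than via a with-block, as in the commit.
        response.close()
    return dst
```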

pyproject.toml (+1)
```diff
@@ -103,6 +103,7 @@ test = [
     # Extras
     "networkx>=2.5",
     "pydot>=1.4",
+    "requests-mock>=1.10.0"
 ]
 docs = [
     # Sphinx + Doc Gen + Styling
```
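Worth noting: requests-mock also ships a pytest plugin exposing a `requests_mock` fixture, so the new test could have taken the mocker as a fixture argument instead of constructing `requests_mock.Mocker()` explicitly. Declaring the dependency under the `test` extra means it is pulled in when the extras are installed, e.g. `pip install -e ".[test]"`, assuming this project's usual dev workflow.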
