Skip to content

Commit 6a8c62f

Browse files
author
Barbara Miller
committed
recent_video_capture_exists
1 parent 6baaf4a commit 6a8c62f

File tree

2 files changed

+42
-36
lines changed

2 files changed

+42
-36
lines changed

brozzler/video_data.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@
1616
limitations under the License.
1717
"""
1818

19+
import datetime
1920
import os
2021
from dataclasses import dataclass
21-
from typing import Any, List, Optional
22+
from typing import Any, Bool, List, Optional
2223

2324
import structlog
2425
import urlcanon
@@ -83,7 +84,24 @@ def _execute_pg_query(self, query_tuple, fetchall=False) -> Optional[Any]:
8384
logger.warn("postgres query failed: %s", e)
8485
return None
8586

86-
def get_recent_video_capture(self, site=None, containing_page_url=None) -> List:
87+
@staticmethod
def _timestamp4datetime(timestamp):
    """Split `timestamp` into a tuple of 6 integers.

    The tuple is ordered so it can be unpacked straight into
    ``datetime.datetime(*fields)``.

    Declared as a static method because the function takes no ``self``
    but is invoked through an instance (``self._timestamp4datetime(ts)``).
    Without the decorator that call would bind the instance to
    ``timestamp`` and fail with ``TypeError``.

    :param timestamp: full-length timestamp string of digits laid out as
        ``YYYYMMDDhhmmss``; anything after the first 14 characters is
        ignored
    :return: ``(year, month, day, hour, minute, second)`` as integers
    :raises ValueError: if a field is empty or is not numeric (for
        example when the timestamp is shorter than expected)
    """
    # Keep only the first 14 characters so the negative offsets below
    # always count back from the end of the seconds field.
    timestamp = timestamp[:14]
    return (
        int(timestamp[:-10]),  # year: everything before the last 10 chars
        int(timestamp[-10:-8]),  # month
        int(timestamp[-8:-6]),  # day
        int(timestamp[-6:-4]),  # hour
        int(timestamp[-4:-2]),  # minute
        int(timestamp[-2:]),  # second
    )
101+
102+
def recent_video_capture_exists(
103+
self, site=None, containing_page_url=None, recent=30
104+
) -> Bool:
87105
# using ait_account_id as postgres partition id
88106
partition_id = (
89107
site["metadata"]["ait_account_id"]
@@ -93,7 +111,7 @@ def get_recent_video_capture(self, site=None, containing_page_url=None) -> List:
93111
seed_id = (
94112
site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
95113
)
96-
result = None
114+
result = False
97115

98116
if partition_id and seed_id and containing_page_url:
99117
# check for postgres query for most recent record
@@ -105,7 +123,20 @@ def get_recent_video_capture(self, site=None, containing_page_url=None) -> List:
105123
result_tuple = self._execute_pg_query(pg_query)
106124
if result_tuple:
107125
result = result_tuple[0]
108-
logger.info("found most recent video capture record: %s", result)
126+
logger.info("found most recent capture timestamp: %s", result)
127+
capture_timestamp = datetime.datetime(
128+
*self._timestamp4datetime(result)
129+
)
130+
time_diff = (
131+
datetime.datetime.now(datetime.timezone.utc)()
132+
- capture_timestamp
133+
)
134+
if time_diff < datetime.timedelta(recent):
135+
logger.info(
136+
"recent video capture from %s exists",
137+
containing_page_url,
138+
)
139+
result = True
109140

110141
except Exception as e:
111142
logger.warn("postgres query failed: %s", e)

brozzler/worker.py

Lines changed: 7 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -279,21 +279,6 @@ def thumb_jpeg(self, full_jpeg):
279279
img.save(out, "jpeg", quality=95)
280280
return out.getbuffer()
281281

282-
def _timestamp4datetime(timestamp):
283-
"""split `timestamp` into a tuple of 6 integers.
284-
285-
:param timestamp: full-length timestamp
286-
"""
287-
timestamp = timestamp[:14]
288-
return (
289-
int(timestamp[:-10]),
290-
int(timestamp[-10:-8]),
291-
int(timestamp[-8:-6]),
292-
int(timestamp[-6:-4]),
293-
int(timestamp[-4:-2]),
294-
int(timestamp[-2:]),
295-
)
296-
297282
def should_ytdlp(self, logger, site, page, page_status):
298283
# called only after we've passed needs_browsing() check
299284

@@ -316,25 +301,15 @@ def should_ytdlp(self, logger, site, page, page_status):
316301
logger.info("checking for recent previous captures of %s", ytdlp_url)
317302
if "youtube.com/watch" in ytdlp_url:
318303
try:
319-
previous_capture = self._video_data.get_recent_video_capture(
320-
site, ytdlp_url
304+
recent_capture_exists = self._video_data.recent_video_capture_exists(
305+
site, ytdlp_url, recent=90
321306
)
322-
if previous_capture:
323-
capture_timestamp = datetime.datetime(
324-
*self._timestamp4datetime(previous_capture)
325-
)
326-
logger.info("capture_timestamp: %s", capture_timestamp)
327-
time_diff = (
328-
datetime.datetime.now(datetime.timezone.utc)()
329-
- capture_timestamp
307+
if recent_capture_exists:
308+
logger.info(
309+
"recent previous capture of %s found, skipping ytdlp",
310+
ytdlp_url,
330311
)
331-
# TODO: make variable for timedelta
332-
if time_diff < datetime.timedelta(days=90):
333-
logger.info(
334-
"skipping ytdlp for %s since there's a recent capture",
335-
ytdlp_url,
336-
)
337-
return False
312+
return False
338313
except Exception as e:
339314
logger.warning(
340315
"exception querying for previous capture for %s: %s",

0 commit comments

Comments
 (0)