Skip to content

Commit 7f48a1f

Browse files
committed
feat(freshness): add cache-item based freshness helpers
- add _freshness module with etag/last-modified/hash/size checks
- add metadata_from_item() to extract freshness fields from CacheItem attrs
- add tests for metadata extraction and size-based freshness checks
1 parent 7d2b5a0 commit 7f48a1f

2 files changed

Lines changed: 230 additions & 0 deletions

File tree

cache_manager/_freshness.py

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
"""
2+
Freshness checking utilities for cache-backed downloads.
3+
"""
4+
5+
from __future__ import annotations
6+
7+
import os
8+
import hashlib
9+
from email.utils import parsedate_to_datetime
10+
from typing import TYPE_CHECKING
11+
12+
if TYPE_CHECKING:
13+
from pathlib import Path
14+
15+
from cache_manager._session import _log
16+
17+
18+
def metadata_from_item(item) -> dict:
    """
    Extracts freshness-relevant metadata from a CacheItem.

    Values stored directly in ``item.attrs`` take precedence. The etag and
    last-modified values fall back to the response headers saved under
    ``attrs['resp_headers']``. Header names are matched case-insensitively,
    since HTTP header names carry no case significance and the stored
    mapping may use any capitalisation (e.g. ``Etag``).

    Args:
        item: Object exposing an ``attrs`` mapping. A missing or empty
            ``attrs`` is treated as an empty mapping.

    Returns:
        A dict with the keys ``etag``, ``last_modified``, ``sha256`` and
        ``size``. Values that are not available are ``None``.
    """

    attrs = getattr(item, 'attrs', {}) or {}
    headers = attrs.get('resp_headers', {}) or {}

    # Index the non-empty headers by lower-cased name so that any
    # capitalisation is found; the first occurrence of a name wins.
    lowered = {}
    for name, value in headers.items():
        if value:
            lowered.setdefault(str(name).lower(), value)

    return {
        'etag': attrs.get('etag') or lowered.get('etag'),
        'last_modified': (
            attrs.get('last_modified') or
            lowered.get('last-modified')
        ),
        'sha256': attrs.get('sha256'),
        'size': attrs.get('size'),
    }
36+
37+
38+
def check_freshness(
    local_path: str | Path,
    remote_headers: dict,
    local_metadata: dict | None = None,
    method: str = 'auto',
) -> tuple[bool, str]:
    """
    Decides whether a local file is still current with respect to the
    remote resource described by ``remote_headers``.

    Args:
        local_path: Path of the local copy.
        remote_headers: Headers describing the remote resource.
        local_metadata: Metadata stored for the local copy; ``None`` or an
            empty mapping is treated as no metadata.
        method: ``'auto'`` tries the ``etag``, ``modified`` and ``size``
            checks in that order and uses the first one that does not
            report ``'method_unavailable'``. Any other value is passed
            to ``_check_by_method`` unchanged.

    Returns:
        ``(is_current, reason)``. In ``'auto'`` mode the reason is prefixed
        with the name of the check that produced it.
    """

    if not os.path.exists(local_path):
        return False, 'local file does not exist'

    stored = local_metadata if local_metadata else {}

    # An explicitly requested method is delegated as-is.
    if method != 'auto':
        return _check_by_method(local_path, remote_headers, stored, method)

    for candidate in ('etag', 'modified', 'size'):
        fresh, why = _check_by_method(
            local_path,
            remote_headers,
            stored,
            candidate,
        )
        if why == 'method_unavailable':
            # This check cannot be applied; try the next one.
            continue
        return fresh, f'{candidate}: {why}'

    return False, 'no check method available'
63+
64+
65+
def _check_by_method(
66+
local_path: str | Path,
67+
remote_headers: dict,
68+
local_metadata: dict,
69+
method: str,
70+
) -> tuple[bool, str]:
71+
72+
if method == 'etag':
73+
return _check_etag(remote_headers, local_metadata)
74+
if method == 'modified':
75+
return _check_last_modified(remote_headers, local_metadata)
76+
if method == 'hash':
77+
return _check_hash(local_path, remote_headers, local_metadata)
78+
if method == 'size':
79+
return _check_size(local_path, remote_headers)
80+
81+
return False, f'unknown method: {method}'
82+
83+
84+
def _check_etag(remote_headers: dict, local_metadata: dict) -> tuple[bool, str]:
85+
86+
remote_etag = remote_headers.get('ETag') or remote_headers.get('etag')
87+
local_etag = local_metadata.get('etag')
88+
89+
if not remote_etag:
90+
return False, 'method_unavailable'
91+
if not local_etag:
92+
return False, 'no local etag stored'
93+
94+
is_current = remote_etag == local_etag
95+
_log(f'ETag check: remote={remote_etag}, local={local_etag}, current={is_current}')
96+
return is_current, 'etag match' if is_current else 'etag mismatch'
97+
98+
99+
def _check_last_modified(
100+
remote_headers: dict,
101+
local_metadata: dict,
102+
) -> tuple[bool, str]:
103+
104+
remote_modified = (
105+
remote_headers.get('Last-Modified') or
106+
remote_headers.get('last-modified')
107+
)
108+
local_modified = local_metadata.get('last_modified')
109+
110+
if not remote_modified:
111+
return False, 'method_unavailable'
112+
if not local_modified:
113+
return False, 'no local last-modified stored'
114+
115+
try:
116+
remote_dt = parsedate_to_datetime(remote_modified)
117+
local_dt = parsedate_to_datetime(local_modified)
118+
is_current = local_dt >= remote_dt
119+
_log(f'Last-Modified check: remote={remote_dt}, local={local_dt}, current={is_current}')
120+
return is_current, 'not modified' if is_current else 'modified'
121+
except (ValueError, TypeError) as e:
122+
_log(f'Error parsing dates: {e}')
123+
return False, f'date parse error: {e}'
124+
125+
126+
def _check_hash(
127+
local_path: str | Path,
128+
remote_headers: dict,
129+
local_metadata: dict,
130+
) -> tuple[bool, str]:
131+
132+
remote_md5 = remote_headers.get('Content-MD5') or remote_headers.get('content-md5')
133+
134+
if remote_md5:
135+
local_md5 = _compute_hash(local_path, 'md5')
136+
is_current = remote_md5 == local_md5
137+
_log(f'MD5 check: remote={remote_md5}, local={local_md5}, current={is_current}')
138+
return is_current, 'md5 match' if is_current else 'md5 mismatch'
139+
140+
local_sha256 = local_metadata.get('sha256')
141+
if local_sha256:
142+
return False, 'hash check requires download'
143+
144+
return False, 'method_unavailable'
145+
146+
147+
def _check_size(local_path: str | Path, remote_headers: dict) -> tuple[bool, str]:
148+
149+
remote_size = remote_headers.get('Content-Length') or remote_headers.get('content-length')
150+
151+
if not remote_size:
152+
return False, 'method_unavailable'
153+
154+
try:
155+
remote_size = int(remote_size)
156+
local_size = os.path.getsize(local_path)
157+
is_current = local_size == remote_size
158+
_log(f'Size check: remote={remote_size}, local={local_size}, current={is_current}')
159+
return is_current, 'size match' if is_current else 'size mismatch'
160+
except (ValueError, OSError) as e:
161+
_log(f'Error checking size: {e}')
162+
return False, f'size check error: {e}'
163+
164+
165+
def _compute_hash(file_path: str | Path, algorithm: str = 'sha256') -> str:
166+
167+
h = hashlib.new(algorithm)
168+
with open(file_path, 'rb') as f:
169+
while chunk := f.read(8192):
170+
h.update(chunk)
171+
return h.hexdigest()
172+
173+
174+
def get_remote_headers(url: str, **kwargs) -> dict:
    """
    Fetches the response headers of ``url`` with an HTTP HEAD request.

    This is best-effort: any error is logged and an empty dict is
    returned instead of raising.

    Args:
        url: Address to query.
        **kwargs: Extra keyword arguments passed to ``requests.head``.
            ``timeout`` defaults to 30 seconds and ``allow_redirects`` to
            ``True`` when the caller does not supply them.

    Returns:
        The response headers as a plain ``dict``, or ``{}`` on failure.
        The response status code is logged but not checked.
    """

    import requests

    # requests waits indefinitely unless a timeout is given, so set a
    # default to keep an unresponsive server from blocking the caller.
    kwargs.setdefault('timeout', 30)
    # Set as a default rather than a fixed keyword so that a
    # caller-supplied value does not collide with it.
    kwargs.setdefault('allow_redirects', True)

    try:
        response = requests.head(url, **kwargs)
        _log(f'HEAD request to {url}: status={response.status_code}')
        return dict(response.headers)
    except Exception as e:
        # Deliberately broad: header retrieval must never raise.
        _log(f'Error getting remote headers: {e}')
        return {}

tests/test_freshness.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from pathlib import Path
2+
3+
from cache_manager import _freshness
4+
5+
6+
class _DummyItem:
7+
8+
def __init__(self, attrs):
9+
self.attrs = attrs
10+
11+
12+
def test_metadata_from_item_extracts_headers():
    """
    The etag and last-modified values are taken from ``resp_headers``;
    sha256 and size are taken from the attrs themselves.
    """

    stamp = 'Wed, 21 Oct 2015 07:28:00 GMT'
    response_headers = {'ETag': 'abc', 'Last-Modified': stamp}
    item = _DummyItem(
        attrs={
            'resp_headers': response_headers,
            'sha256': 'x',
            'size': 10,
        }
    )

    meta = _freshness.metadata_from_item(item)

    expected = {
        'etag': 'abc',
        'last_modified': stamp,
        'sha256': 'x',
        'size': 10,
    }
    for key, value in expected.items():
        assert meta[key] == value
31+
32+
33+
def test_check_freshness_size(tmp_path: Path):
    """
    A Content-Length equal to the file's size is reported as current
    when the ``size`` method is requested explicitly.
    """

    target = tmp_path / 'a.txt'
    target.write_text('hello')
    headers = {'Content-Length': str(target.stat().st_size)}

    outcome = _freshness.check_freshness(
        local_path=target,
        remote_headers=headers,
        local_metadata={},
        method='size',
    )
    is_current, reason = outcome

    assert is_current is True
    assert reason == 'size match'

0 commit comments

Comments
 (0)