|
| 1 | +""" |
| 2 | +Freshness checking utilities for cache-backed downloads. |
| 3 | +""" |
| 4 | + |
| 5 | +from __future__ import annotations |
| 6 | + |
| 7 | +import os |
| 8 | +import hashlib |
| 9 | +from email.utils import parsedate_to_datetime |
| 10 | +from typing import TYPE_CHECKING |
| 11 | + |
| 12 | +if TYPE_CHECKING: |
| 13 | + from pathlib import Path |
| 14 | + |
| 15 | +from cache_manager._session import _log |
| 16 | + |
| 17 | + |
def metadata_from_item(item) -> dict:
    """
    Extracts freshness-relevant metadata from a CacheItem.

    Values stored directly in ``item.attrs`` win. The ETag and Last-Modified
    values fall back to the response headers saved under
    ``attrs['resp_headers']``; that fallback matches header names
    case-insensitively, because HTTP field names carry no case significance
    (a server may send ``Etag`` or ``LAST-MODIFIED``).

    Args:
        item: Object exposing an optional ``attrs`` mapping. A missing or
            empty ``attrs`` is treated as an empty mapping.

    Returns:
        Dict with the keys ``etag``, ``last_modified``, ``sha256`` and
        ``size``. A key is ``None`` when nothing is stored for it.
    """

    attrs = getattr(item, 'attrs', {}) or {}
    headers = attrs.get('resp_headers', {}) or {}

    def header(canonical: str):
        # The two exact spellings are tried first so that lookups which
        # already worked keep resolving to the same value.
        value = headers.get(canonical) or headers.get(canonical.lower())
        if value:
            return value
        # Fallback: accept any other capitalisation of the header name.
        wanted = canonical.lower()
        for name, candidate in headers.items():
            if candidate and str(name).lower() == wanted:
                return candidate
        return value

    return {
        'etag': attrs.get('etag') or header('ETag'),
        'last_modified': attrs.get('last_modified') or header('Last-Modified'),
        'sha256': attrs.get('sha256'),
        'size': attrs.get('size'),
    }
| 36 | + |
| 37 | + |
def check_freshness(
    local_path: str | Path,
    remote_headers: dict,
    local_metadata: dict | None = None,
    method: str = 'auto',
) -> tuple[bool, str]:
    """
    Decides whether the file at ``local_path`` is still current.

    Args:
        local_path: Location of the cached file.
        remote_headers: Headers describing the remote resource.
        local_metadata: Metadata stored alongside the cached file; ``None``
            is treated as an empty dict.
        method: ``'auto'`` tries ``'etag'``, ``'modified'`` and ``'size'``
            in that order and stops at the first one that does not report
            ``'method_unavailable'``. Any other value is handed to
            ``_check_by_method`` unchanged.

    Returns:
        ``(is_current, reason)``. In auto mode the reason is prefixed with
        the name of the check that produced it.
    """

    if not os.path.exists(local_path):
        return False, 'local file does not exist'

    metadata = local_metadata if local_metadata else {}

    # An explicitly requested method is delegated without any fallback.
    if method != 'auto':
        return _check_by_method(local_path, remote_headers, metadata, method)

    for candidate in ('etag', 'modified', 'size'):
        current, why = _check_by_method(
            local_path,
            remote_headers,
            metadata,
            candidate,
        )
        if why == 'method_unavailable':
            continue
        return current, f'{candidate}: {why}'

    return False, 'no check method available'
| 63 | + |
| 64 | + |
| 65 | +def _check_by_method( |
| 66 | + local_path: str | Path, |
| 67 | + remote_headers: dict, |
| 68 | + local_metadata: dict, |
| 69 | + method: str, |
| 70 | +) -> tuple[bool, str]: |
| 71 | + |
| 72 | + if method == 'etag': |
| 73 | + return _check_etag(remote_headers, local_metadata) |
| 74 | + if method == 'modified': |
| 75 | + return _check_last_modified(remote_headers, local_metadata) |
| 76 | + if method == 'hash': |
| 77 | + return _check_hash(local_path, remote_headers, local_metadata) |
| 78 | + if method == 'size': |
| 79 | + return _check_size(local_path, remote_headers) |
| 80 | + |
| 81 | + return False, f'unknown method: {method}' |
| 82 | + |
| 83 | + |
| 84 | +def _check_etag(remote_headers: dict, local_metadata: dict) -> tuple[bool, str]: |
| 85 | + |
| 86 | + remote_etag = remote_headers.get('ETag') or remote_headers.get('etag') |
| 87 | + local_etag = local_metadata.get('etag') |
| 88 | + |
| 89 | + if not remote_etag: |
| 90 | + return False, 'method_unavailable' |
| 91 | + if not local_etag: |
| 92 | + return False, 'no local etag stored' |
| 93 | + |
| 94 | + is_current = remote_etag == local_etag |
| 95 | + _log(f'ETag check: remote={remote_etag}, local={local_etag}, current={is_current}') |
| 96 | + return is_current, 'etag match' if is_current else 'etag mismatch' |
| 97 | + |
| 98 | + |
| 99 | +def _check_last_modified( |
| 100 | + remote_headers: dict, |
| 101 | + local_metadata: dict, |
| 102 | +) -> tuple[bool, str]: |
| 103 | + |
| 104 | + remote_modified = ( |
| 105 | + remote_headers.get('Last-Modified') or |
| 106 | + remote_headers.get('last-modified') |
| 107 | + ) |
| 108 | + local_modified = local_metadata.get('last_modified') |
| 109 | + |
| 110 | + if not remote_modified: |
| 111 | + return False, 'method_unavailable' |
| 112 | + if not local_modified: |
| 113 | + return False, 'no local last-modified stored' |
| 114 | + |
| 115 | + try: |
| 116 | + remote_dt = parsedate_to_datetime(remote_modified) |
| 117 | + local_dt = parsedate_to_datetime(local_modified) |
| 118 | + is_current = local_dt >= remote_dt |
| 119 | + _log(f'Last-Modified check: remote={remote_dt}, local={local_dt}, current={is_current}') |
| 120 | + return is_current, 'not modified' if is_current else 'modified' |
| 121 | + except (ValueError, TypeError) as e: |
| 122 | + _log(f'Error parsing dates: {e}') |
| 123 | + return False, f'date parse error: {e}' |
| 124 | + |
| 125 | + |
| 126 | +def _check_hash( |
| 127 | + local_path: str | Path, |
| 128 | + remote_headers: dict, |
| 129 | + local_metadata: dict, |
| 130 | +) -> tuple[bool, str]: |
| 131 | + |
| 132 | + remote_md5 = remote_headers.get('Content-MD5') or remote_headers.get('content-md5') |
| 133 | + |
| 134 | + if remote_md5: |
| 135 | + local_md5 = _compute_hash(local_path, 'md5') |
| 136 | + is_current = remote_md5 == local_md5 |
| 137 | + _log(f'MD5 check: remote={remote_md5}, local={local_md5}, current={is_current}') |
| 138 | + return is_current, 'md5 match' if is_current else 'md5 mismatch' |
| 139 | + |
| 140 | + local_sha256 = local_metadata.get('sha256') |
| 141 | + if local_sha256: |
| 142 | + return False, 'hash check requires download' |
| 143 | + |
| 144 | + return False, 'method_unavailable' |
| 145 | + |
| 146 | + |
| 147 | +def _check_size(local_path: str | Path, remote_headers: dict) -> tuple[bool, str]: |
| 148 | + |
| 149 | + remote_size = remote_headers.get('Content-Length') or remote_headers.get('content-length') |
| 150 | + |
| 151 | + if not remote_size: |
| 152 | + return False, 'method_unavailable' |
| 153 | + |
| 154 | + try: |
| 155 | + remote_size = int(remote_size) |
| 156 | + local_size = os.path.getsize(local_path) |
| 157 | + is_current = local_size == remote_size |
| 158 | + _log(f'Size check: remote={remote_size}, local={local_size}, current={is_current}') |
| 159 | + return is_current, 'size match' if is_current else 'size mismatch' |
| 160 | + except (ValueError, OSError) as e: |
| 161 | + _log(f'Error checking size: {e}') |
| 162 | + return False, f'size check error: {e}' |
| 163 | + |
| 164 | + |
| 165 | +def _compute_hash(file_path: str | Path, algorithm: str = 'sha256') -> str: |
| 166 | + |
| 167 | + h = hashlib.new(algorithm) |
| 168 | + with open(file_path, 'rb') as f: |
| 169 | + while chunk := f.read(8192): |
| 170 | + h.update(chunk) |
| 171 | + return h.hexdigest() |
| 172 | + |
| 173 | + |
def get_remote_headers(url: str, **kwargs) -> dict:
    """
    Fetches the response headers for ``url`` with a HEAD request.

    Redirects are followed. This is a best-effort helper: any failure is
    logged and reported as an empty dict instead of being raised.

    Args:
        url: Address to query.
        **kwargs: Passed through to ``requests.head``. When no ``timeout``
            is given, 30 seconds is used so that an unresponsive server
            cannot block the caller indefinitely; pass ``timeout=None``
            explicitly to wait without limit.

    Returns:
        A plain dict copy of the response headers, or ``{}`` on error.
    """

    import requests

    # requests applies no timeout unless one is passed in.
    kwargs.setdefault('timeout', 30)

    try:
        response = requests.head(url, allow_redirects=True, **kwargs)
        _log(f'HEAD request to {url}: status={response.status_code}')
        return dict(response.headers)
    except Exception as e:
        # Deliberately broad: header retrieval must never abort the caller.
        _log(f'Error getting remote headers: {e}')
        return {}
0 commit comments