Skip to content

Commit 29377ab

Browse files
authored
Merge pull request #220 from hc-sc-ocdo-bdpd/hashing
Hashing and copy method on file class
2 parents 7ce13ff + 1baf8ca commit 29377ab

33 files changed

Lines changed: 1070 additions & 128 deletions

file_processing/file.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
NotTranscriptionApplicableError,
99
OptionalDependencyNotInstalledError
1010
)
11+
import shutil
1112

1213

1314
class File:
@@ -151,10 +152,30 @@ def save(self, output_path: str = None) -> None:
151152
Saves the processed file to the specified output path.
152153
153154
Args:
154-
output_path (str): The destination path for saving the processed file.
155+
output_path (str): Destination path (defaults to original file path).
155156
"""
156157
self.processor.save(output_path)
157158

159+
def copy(self, output_path: str, verify_integrity: bool = False) -> None:
    """
    Copies the original file bit-for-bit.

    The copy is made with ``shutil.copy2`` from the processor's
    ``file_path`` to ``output_path``.

    Args:
        output_path (str): Destination path for the copy (required).
        verify_integrity (bool): If True, re-compute and compare hash after copy.

    Raises:
        FileProcessingFailedError: If ``verify_integrity`` is True and the
            hash of the copy differs from the hash of the original.
    """
    # The source hash is read before the copy is made.
    original_hash = self.hash if verify_integrity else None
    src = self.processor.file_path
    shutil.copy2(str(src), str(output_path))

    if not verify_integrity:
        return

    # The copy is wrapped with open_file=False; only its hash is read here.
    copied_hash = File(str(output_path), open_file=False).hash
    if original_hash != copied_hash:
        raise FileProcessingFailedError(
            f"Integrity check failed on copy: {output_path} hash changed "
            f"({original_hash} != {copied_hash})"
        )
178+
158179
def process(self) -> None:
159180
"""
160181
Executes the processing operation on the file.
@@ -183,23 +204,23 @@ def owner(self) -> str:
183204
return self.processor.owner
184205

185206
@property
186-
def size(self) -> str:
187-
"""str: Returns the size of the file in bytes."""
207+
def size(self) -> int:
208+
"""int: Returns the size of the file in bytes."""
188209
return self.processor.size
189210

190211
@property
191-
def modification_time(self) -> str:
192-
"""str: Returns the last modification time of the file."""
212+
def modification_time(self) -> float:
213+
"""float: Returns the last modification time of the file."""
193214
return self.processor.modification_time
194215

195216
@property
196-
def access_time(self) -> str:
197-
"""str: Returns the last access time of the file."""
217+
def access_time(self) -> float:
218+
"""float: Returns the last access time of the file."""
198219
return self.processor.access_time
199220

200221
@property
201-
def creation_time(self) -> str:
202-
"""str: Returns the creation time of the file."""
222+
def creation_time(self) -> float:
223+
"""float: Returns the creation time of the file."""
203224
return self.processor.creation_time
204225

205226
@property
@@ -226,6 +247,11 @@ def is_symlink(self) -> bool:
226247
def absolute_path(self) -> str:
227248
"""str: Returns the absolute path of the file."""
228249
return self.processor.absolute_path
250+
251+
@property
def hash(self) -> str:
    """str: Returns the hex digest of the file's content (default SHA256)."""
    processor = self.processor
    return processor.hash
229255

230256
@property
231257
def metadata(self) -> dict:

file_processing/file_processor_strategy.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from pathlib import Path
33
import sys
44
import importlib.util
5+
from hashlib import md5, sha256
56
from file_processing.errors import FileProcessingFailedError
67

78
class FileProcessorStrategy(ABC):
@@ -77,6 +78,47 @@ def _find_owner(self, file_path: str) -> str:
7778
return f'{domain}/{name}'
7879
return ''
7980

81+
@property
def hash(self) -> str:
    """
    Returns the hash of the file content, computing it on first access.

    The digest comes from ``compute_hash()`` called with its default
    algorithm and is stored on the instance as ``_hash``; later accesses
    return the stored value without hashing again.

    Returns:
        str: Hexadecimal hash of the file.
    """
    try:
        return self._hash
    except AttributeError:
        # First access: nothing cached yet, so compute and remember it.
        digest = self.compute_hash()
        self._hash = digest
        return digest
92+
93+
def compute_hash(self, algorithm: str = 'sha256') -> str:
    """
    Computes the hash of the file using the specified algorithm.

    The file at ``self.file_path`` is opened in binary mode and fed to the
    hasher in 8192-byte chunks.

    Args:
        algorithm (str, optional): The hashing algorithm to use ('sha256' or 'md5'). Defaults to 'sha256'.

    Returns:
        str: Hexadecimal hash of the file.

    Raises:
        ValueError: If an unsupported algorithm is specified.
        FileProcessingFailedError: If file hashing fails. The underlying
            exception is attached as ``__cause__``.
    """
    hash_func = {'md5': md5, 'sha256': sha256}.get(algorithm)
    if hash_func is None:
        raise ValueError(f"Unsupported hash algorithm: {algorithm}")

    hasher = hash_func()
    try:
        with open(self.file_path, 'rb') as f:
            # iter() with a b'' sentinel stops at end of file.
            for chunk in iter(lambda: f.read(8192), b''):
                hasher.update(chunk)
    except Exception as e:
        # Chain the original error so the root cause stays in the traceback.
        raise FileProcessingFailedError(
            f"Error computing hash for {self.file_path}: {e}"
        ) from e
    return hasher.hexdigest()
121+
80122
@abstractmethod
81123
def process(self) -> None:
82124
"""
@@ -95,4 +137,5 @@ def save(self) -> None:
95137
"""
96138
Saves the processed file after any metadata or content modifications.
97139
This method must be implemented by subclasses to define save behavior.
98-
"""
140+
"""
141+
pass

tests/unit/test_audio_processor.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from unittest.mock import patch
22
from pathlib import Path
3+
import shutil
34
import pytest
45
from mutagen import File as MutagenFile
56
from mutagen.easyid3 import EasyID3
@@ -68,7 +69,6 @@ def test_save_audio_metadata(copy_file, bitrate, length):
6869
def test_change_audio_artist_title_date(copy_file, bitrate, length):
6970
audio_file = MutagenFile(copy_file)
7071
if isinstance(audio_file, (MP3, FLAC, OggVorbis, MP4)):
71-
# Change metadata via Document object
7272
audio_file = MutagenFile(copy_file)
7373
if isinstance(audio_file, MP3):
7474
audio_file = EasyID3(copy_file)
@@ -86,7 +86,6 @@ def test_change_audio_artist_title_date(copy_file, bitrate, length):
8686
audio_file.tags['\xa9day'] = "2023-11-22"
8787
audio_file.tags['\xa9nam'] = "New Title"
8888
audio_file.tags['\xa9wrk'] = "Health Canada"
89-
# Save the file
9089
audio_file.save()
9190
test_audio_metadata(copy_file, bitrate, length, 'New Artist',
9291
'2023-11-22', 'New Title', 'Health Canada')
@@ -101,6 +100,7 @@ def test_not_opening_file(path, bitrate, length, artist, date, title, organizati
101100
File(path, open_file=False)
102101
mock_open.assert_not_called()
103102

103+
104104
invalid_save_locations = [
105105
(test_files_path / 'sample_speech.mp3', '/non_existent_folder/sample_speech.mp3')
106106
]
@@ -122,3 +122,31 @@ def test_audio_invalid_save_location(path, save_path):
122122
def test_audio_corrupted_file_processing(path):
123123
with pytest.raises(FileProcessingFailedError):
124124
File(path)
125+
126+
127+
@pytest.mark.parametrize("path", [entry[0] for entry in values])
@pytest.mark.parametrize("algorithm", ["md5", "sha256"])
def test_audio_copy_with_integrity(path, algorithm, tmp_path):
    """A verified copy must hash identically to its source file."""
    source = File(path, open_file=False)
    expected_digest = source.processor.compute_hash(algorithm)

    target = tmp_path / Path(path).name
    source.copy(str(target), verify_integrity=True)

    actual_digest = File(str(target)).processor.compute_hash(algorithm)
    assert actual_digest == expected_digest
138+
139+
140+
@pytest.mark.parametrize("path", [entry[0] for entry in values])
def test_audio_copy_integrity_failure(path, tmp_path, monkeypatch):
    """copy() must raise when the destination content differs from the source."""
    source = File(path, open_file=False)

    def write_garbage(src, dest, *, follow_symlinks=True):
        # Stand-in for shutil.copy2 that writes unrelated text to dest.
        with open(dest, 'w') as handle:
            handle.write("CORRUPTED!")

    monkeypatch.setattr(shutil, "copy2", write_garbage)

    target = tmp_path / Path(path).name
    with pytest.raises(FileProcessingFailedError) as excinfo:
        source.copy(str(target), verify_integrity=True)
    assert "Integrity check failed" in str(excinfo.value)

tests/unit/test_cpp_processor.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import os
2+
import shutil
23
import pytest
4+
from pathlib import Path
35
from unittest.mock import patch
46
from file_processing.file import File
57
from file_processing.errors import FileProcessingFailedError
@@ -44,4 +46,31 @@ def test_cpp_processor_open_file_false(file_name):
4446
cpp_file_path = test_files_path / file_name
4547
with patch("builtins.open") as mock_open:
4648
File(str(cpp_file_path), open_file=False)
47-
mock_open.assert_not_called()
49+
mock_open.assert_not_called()
50+
51+
@pytest.mark.parametrize("file_name", [entry[0] for entry in values])
@pytest.mark.parametrize("algorithm", ["md5", "sha256"])
def test_cpp_copy_with_integrity(file_name, algorithm, tmp_path):
    """A verified copy must hash identically to its source file."""
    source = File(str(test_files_path / file_name), open_file=False)
    expected_digest = source.processor.compute_hash(algorithm)

    target = tmp_path / file_name
    source.copy(str(target), verify_integrity=True)

    actual_digest = File(str(target)).processor.compute_hash(algorithm)
    assert actual_digest == expected_digest
63+
64+
@pytest.mark.parametrize("file_name", [entry[0] for entry in values])
def test_cpp_copy_integrity_failure(file_name, tmp_path, monkeypatch):
    """copy() must raise when the destination content differs from the source."""
    source = File(str(test_files_path / file_name), open_file=False)

    def write_garbage(src, dest, *, follow_symlinks=True):
        # Stand-in for shutil.copy2 that writes unrelated text to dest.
        with open(dest, 'w') as handle:
            handle.write("CORRUPTED!")

    monkeypatch.setattr(shutil, "copy2", write_garbage)

    target = tmp_path / file_name
    with pytest.raises(FileProcessingFailedError) as excinfo:
        source.copy(str(target), verify_integrity=True)
    assert "Integrity check failed" in str(excinfo.value)

tests/unit/test_csv_processor.py

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,21 @@
11
import os
2+
from pathlib import Path
23
from unittest.mock import patch
34
import pytest
45
from file_processing import File
56
from file_processing.errors import FileProcessingFailedError
67
from file_processing_test_data import get_test_files_path
78

9+
# Data fixture
810
test_files_path = get_test_files_path()
9-
10-
variable_names = "path, text_length, encoding, num_rows, num_cols, num_cells, empty_cells"
11+
variable_names = (
12+
"path, text_length, encoding, num_rows, num_cols, num_cells, empty_cells"
13+
)
1114
values = [
12-
(test_files_path / '2021_Census_English.csv', 6084302, 'ISO-8859-1', 36835, 23, 847205, 253932),
13-
(test_files_path / 'Approved_Schools_2023_10_01.csv', 1403268, 'UTF-8-SIG', 5385, 13, 70005, 73)
15+
(test_files_path / '2021_Census_English.csv', 6084302, 'ISO-8859-1', 36835, 23, 847205, 253932),
16+
(test_files_path / 'Approved_Schools_2023_10_01.csv', 1403268, 'UTF-8-SIG', 5385, 13, 70005, 73)
1417
]
1518

16-
1719
@pytest.mark.parametrize(variable_names, values)
1820
def test_csv_metadata(path, text_length, encoding, num_rows, num_cols, num_cells, empty_cells):
1921
file_obj = File(path)
@@ -24,22 +26,58 @@ def test_csv_metadata(path, text_length, encoding, num_rows, num_cols, num_cells
2426
assert file_obj.metadata['num_cells'] == num_cells
2527
assert file_obj.metadata['empty_cells'] == empty_cells
2628

27-
2829
@pytest.mark.parametrize(variable_names, values)
2930
def test_save_csv_metadata(copy_file, text_length, encoding, num_rows, num_cols, num_cells, empty_cells):
31+
# Reuse metadata assertions on the saved file
3032
test_csv_metadata(copy_file, text_length, encoding, num_rows, num_cols, num_cells, empty_cells)
3133

32-
33-
@pytest.mark.parametrize("valid_path", [path for path, *_ in values])
34+
@pytest.mark.parametrize("valid_path", [p for p, *_ in values])
3435
def test_csv_invalid_save_location(valid_path):
3536
csv_file = File(valid_path)
36-
invalid_save_path = '/non_existent_folder/' + os.path.basename(valid_path)
37+
invalid_path = '/non_existent_folder/' + os.path.basename(str(valid_path))
3738
with pytest.raises(FileProcessingFailedError):
38-
csv_file.processor.save(invalid_save_path)
39-
39+
csv_file.processor.save(invalid_path)
4040

4141
@pytest.mark.parametrize(variable_names, values)
4242
def test_not_opening_file(path, text_length, encoding, num_rows, num_cols, num_cells, empty_cells):
4343
with patch('builtins.open', autospec=True) as mock_open:
4444
File(path, open_file=False)
4545
mock_open.assert_not_called()
46+
47+
@pytest.mark.parametrize(variable_names, values)
@pytest.mark.parametrize("algorithm", ["md5", "sha256"])
def test_csv_copy_with_integrity(path, tmp_path, algorithm,
                                 text_length, encoding, num_rows, num_cols, num_cells, empty_cells):
    """A verified copy keeps both the CSV metadata and the content hash."""
    # Hash the raw source file first.
    source = File(path, open_file=False)
    expected_digest = source.processor.compute_hash(algorithm)

    # Copy it with integrity verification switched on.
    target = tmp_path / Path(path).name
    source.copy(str(target), verify_integrity=True)

    # Re-open the copy and check that its metadata matches the fixture values.
    saved = File(str(target))
    assert len(saved.metadata['text']) == text_length
    expected_metadata = {
        'encoding': encoding,
        'num_rows': num_rows,
        'num_cols': num_cols,
        'num_cells': num_cells,
        'empty_cells': empty_cells,
    }
    for key, expected_value in expected_metadata.items():
        assert saved.metadata[key] == expected_value

    # The copy must hash the same under the selected algorithm.
    assert saved.processor.compute_hash(algorithm) == expected_digest
70+
71+
@pytest.mark.parametrize("path", [entry for entry, *_ in values])
def test_csv_copy_integrity_failure(path, tmp_path, monkeypatch):
    """copy() must raise when the destination content differs from the source."""
    source = File(path, open_file=False)

    # Replace the raw copy with a stand-in that writes unrelated text.
    import shutil

    def write_garbage(src, dest, *, follow_symlinks=True):
        with open(dest, 'w') as handle:
            handle.write('CORRUPTED!')

    monkeypatch.setattr(shutil, 'copy2', write_garbage)

    target = tmp_path / Path(path).name
    with pytest.raises(FileProcessingFailedError) as excinfo:
        source.copy(str(target), verify_integrity=True)
    assert 'Integrity check failed' in str(excinfo.value)

0 commit comments

Comments
 (0)