Skip to content

Commit cde4644

Browse files
Merge pull request #162 from unicef/feature/261155-datachecker-import-file-sheet-structure
AB#261155: Datachecker import file sheet structure
2 parents 4d7d1c4 + 6b50db4 commit cde4644

File tree

5 files changed

+1170
-1171
lines changed

5 files changed

+1170
-1171
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ dependencies = [
4545
"flower>=2.0.1",
4646
"hope-flex-fields>=0.6.2",
4747
"hope-smart-export>=0.3",
48-
"hope-smart-import>=0.4",
48+
"hope-smart-import>=0.5",
4949
"notebook>=7.4.3",
5050
"numpy>=2.3",
5151
"openpyxl>=3.1.5",

src/country_workspace/datasources/rdi.py

Lines changed: 23 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
import io
22
from base64 import b64encode
3-
from enum import Enum
43
from collections import defaultdict
54
from collections.abc import Iterable, Generator
5+
from enum import StrEnum
66
from typing import Any, Mapping, cast, NotRequired
77

88
import openpyxl
@@ -12,13 +12,12 @@
1212
from openpyxl.drawing.image import Image as RDIImage
1313

1414
from country_workspace.contrib.kobo.api.data.helpers import VALUE_FORMAT
15+
from country_workspace.datasources.utils import datetime_to_date, date_to_iso_string
1516
from country_workspace.models import AsyncJob, Batch, Household, Individual
1617
from country_workspace.utils.config import BatchNameConfig, FailIfAlienConfig
1718
from country_workspace.utils.fields import Record, clean_field_names
1819
from country_workspace.utils.functional import compose
1920
from country_workspace.validators.beneficiaries import validate_beneficiaries
20-
from country_workspace.datasources.utils import datetime_to_date, date_to_iso_string
21-
2221

2322
RDI = str | io.BytesIO
2423
Sheet = Iterable[Record]
@@ -39,10 +38,10 @@ class Config(BatchNameConfig, FailIfAlienConfig):
3938
first_line: int
4039

4140

42-
class SheetName(Enum):
43-
HOUSEHOLDS: int = 0
44-
INDIVIDUALS: int = 1
45-
PEOPLE: int = 2
41+
class SheetName(StrEnum):
42+
HOUSEHOLDS = "Households"
43+
INDIVIDUALS = "Individuals"
44+
PEOPLE = "People"
4645

4746

4847
class ColumnConfigurationError(Exception):
@@ -65,16 +64,16 @@ def __str__(self) -> str:
6564

6665

6766
class SheetNotFoundError(Exception):
68-
def __init__(self, sheet_indices: int | tuple[int, ...]) -> None:
69-
if isinstance(sheet_indices, int):
70-
sheet_indices = (sheet_indices,)
71-
super().__init__(sheet_indices)
72-
self.sheet_indices = sheet_indices
67+
def __init__(self, sheet_names: str | tuple[str, ...]) -> None:
68+
if isinstance(sheet_names, str):
69+
sheet_names = (sheet_names,)
70+
super().__init__(sheet_names)
71+
self.sheet_names = sheet_names
7372

7473
def __str__(self) -> str:
75-
if len(self.sheet_indices) == 1:
76-
return f"Sheet with index {self.sheet_indices[0]} was not found in the provided file."
77-
indices_str = ", ".join(map(str, self.sheet_indices))
74+
if len(self.sheet_names) == 1:
75+
return f"Sheet with index {self.sheet_names[0]} was not found in the provided file."
76+
indices_str = ", ".join(map(str, self.sheet_names))
7877
return f"Sheets with indices {indices_str} were not found in the provided file."
7978

8079

@@ -175,10 +174,10 @@ def image_content(rdi_image: RDIImage) -> tuple[str | None, str]:
175174
return content_type, content
176175

177176

178-
def extract_images(filepath: str, *sheet_indices: int) -> Generator[Mapping[int, Mapping[int, str]], None, None]:
177+
def extract_images(filepath: str, *sheet_names: str) -> Generator[Mapping[int, Mapping[int, str]], None, None]:
179178
workbook = openpyxl.load_workbook(filepath)
180-
for i in sheet_indices:
181-
worksheet = workbook.worksheets[i]
179+
for n in sheet_names:
180+
worksheet = workbook[n]
182181
images: dict[int, dict[int, str]] = defaultdict(dict)
183182
for rdi_image in worksheet._images:
184183
row, column = image_location(rdi_image)
@@ -195,19 +194,19 @@ def merge_images(sheet: Sheet, sheet_images: Mapping[int, Mapping[int, str]]) ->
195194
yield row
196195

197196

198-
def read_sheets(config: Config, filepath: str, *sheet_indices: int) -> Generator[Sheet, None, None]:
197+
def read_sheets(config: Config, filepath: str, *sheet_names: str) -> Generator[Sheet, None, None]:
199198
cell_mapper = compose(datetime_to_date, date_to_iso_string)
200199
try:
201-
sheets = open_xls_multi(filepath, sheets=list(sheet_indices), value_mapper=cell_mapper)
202-
sheet_images = extract_images(filepath, *sheet_indices)
200+
sheets = open_xls_multi(filepath, indices_or_names=list(sheet_names), value_mapper=cell_mapper)
201+
sheet_images = extract_images(filepath, *sheet_names)
203202
for (_, sheet), images in zip(sheets, sheet_images, strict=False):
204203
sheet_with_images = merge_images(sheet, images)
205204
if config["master_detail"]:
206205
yield filter_rows_with_household_pk(config, sheet_with_images)
207206
else:
208207
yield sheet_with_images
209208
except IndexError as e:
210-
raise SheetNotFoundError(sheet_indices) from e
209+
raise SheetNotFoundError(sheet_names) from e
211210

212211

213212
def import_from_rdi(job: AsyncJob) -> dict[str, int]:
@@ -226,16 +225,14 @@ def import_from_rdi(job: AsyncJob) -> dict[str, int]:
226225

227226

228227
def _import_master_detail(job: AsyncJob, batch: Batch, config: dict) -> dict[str, int]:
229-
household_sheet, individual_sheet = read_sheets(
230-
config, job.file, SheetName.HOUSEHOLDS.value, SheetName.INDIVIDUALS.value
231-
)
228+
household_sheet, individual_sheet = read_sheets(config, job.file, SheetName.HOUSEHOLDS, SheetName.INDIVIDUALS)
232229
household_mapping = process_households(household_sheet, job, batch, config)
233230
individuals_mapping = process_beneficiaries(individual_sheet, job, batch, config, household_mapping)
234231
validate_beneficiaries(config, household_mapping)
235232
return {"household": len(household_mapping), "individual": len(individuals_mapping)}
236233

237234

238235
def _import_people_only(job: AsyncJob, batch: Batch, config: dict) -> dict[str, int]:
239-
(people_sheet,) = read_sheets(config, job.file, SheetName.PEOPLE.value)
236+
(people_sheet,) = read_sheets(config, job.file, SheetName.PEOPLE)
240237
validate_beneficiaries(config, people_mapping := process_beneficiaries(people_sheet, job, batch, config))
241238
return {"people": len(people_mapping)}

src/country_workspace/workspaces/admin/cleaners/bulk_update.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -198,7 +198,7 @@ def bulk_update_collection(job: AsyncJob, collection_getter: Callable[[int], Any
198198
errors = {}
199199

200200
file_data = job.file.read()
201-
rows = open_xls(io.BytesIO(file_data), start_at=0)
201+
rows = open_xls(io.BytesIO(file_data), start_at_row=0)
202202
for line_number, row in enumerate(rows, start=1):
203203
try:
204204
_id = int(row.pop("id"))

tests/datasources/test_rdi.py

Lines changed: 17 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -128,11 +128,11 @@ def test_household_validation_error_format() -> None:
128128

129129

130130
def test_sheet_not_found_error_format() -> None:
131-
error = SheetNotFoundError(sheet_idx := 99)
132-
assert str(sheet_idx) in str(error)
131+
error = SheetNotFoundError(sheet_name := "first")
132+
assert sheet_name in str(error)
133133

134-
error_multiple = SheetNotFoundError(sheet_indices := [0, 1, 99])
135-
for idx in sheet_indices:
134+
error_multiple = SheetNotFoundError(sheet_names := ("first", "second"))
135+
for idx in sheet_names:
136136
assert str(idx) in str(error_multiple)
137137

138138

@@ -388,13 +388,13 @@ def test_extract_images(mocker: MockerFixture) -> None:
388388
image_content_mock = mocker.patch("country_workspace.datasources.rdi.image_content")
389389
image_content_mock.return_value = (content_type := "content/type", content := "content")
390390
image = MagicMock()
391-
load_workbook_mock.return_value.worksheets.__getitem__.return_value._images = (image,)
391+
load_workbook_mock.return_value.__getitem__.return_value._images = (image,)
392392

393-
result = list(extract_images(filepath := "test", sheet_index := 0))
393+
result = list(extract_images(filepath := "test", sheet_name := "first"))
394394

395395
assert result == [{row - 1: {column: VALUE_FORMAT.format(mimetype=content_type, content=content)}}]
396396
load_workbook_mock.assert_called_once_with(filepath)
397-
load_workbook_mock.return_value.worksheets.__getitem__.assert_called_once_with(sheet_index)
397+
load_workbook_mock.return_value.__getitem__.assert_called_once_with(sheet_name)
398398
image_location_mock.assert_called_once_with(image)
399399
image_content_mock.assert_called_once_with(image)
400400

@@ -423,14 +423,14 @@ def test_read_sheets(mocker: MockerFixture, config: Config) -> None:
423423
merge_images_mock = mocker.patch("country_workspace.datasources.rdi.merge_images")
424424

425425
filepath = "test"
426-
sheet_index = 0
426+
sheet_name = "first"
427427

428428
if config["master_detail"]:
429429
filter_rows_with_household_pk_mock = mocker.patch(
430430
"country_workspace.datasources.rdi.filter_rows_with_household_pk"
431431
)
432432

433-
result = list(read_sheets(config, filepath, sheet_index))
433+
result = list(read_sheets(config, filepath, sheet_name))
434434

435435
if config["master_detail"]:
436436
assert result == [filter_rows_with_household_pk_mock.return_value]
@@ -439,8 +439,10 @@ def test_read_sheets(mocker: MockerFixture, config: Config) -> None:
439439
assert result == [merge_images_mock.return_value]
440440

441441
compose_mock.assert_called_once_with(datetime_to_date_mock, date_to_iso_string_mock)
442-
open_xls_multi_mock.assert_called_once_with(filepath, sheets=[sheet_index], value_mapper=compose_mock.return_value)
443-
extract_images_mock.assert_called_once_with(filepath, sheet_index)
442+
open_xls_multi_mock.assert_called_once_with(
443+
filepath, indices_or_names=[sheet_name], value_mapper=compose_mock.return_value
444+
)
445+
extract_images_mock.assert_called_once_with(filepath, sheet_name)
444446
merge_images_mock.assert_called_once_with(sheet, images)
445447

446448

@@ -453,13 +455,13 @@ def test_read_sheets_sheet_not_found_error(mocker: MockerFixture, config: Config
453455
mocker.patch("country_workspace.datasources.rdi.extract_images")
454456

455457
filepath = "test"
456-
sheet_index = 99
458+
sheet_name = "first"
457459

458460
with pytest.raises(SheetNotFoundError) as exc_info:
459-
list(read_sheets(config, filepath, sheet_index))
461+
list(read_sheets(config, filepath, sheet_name))
460462

461-
assert exc_info.value.sheet_indices == (sheet_index,)
462-
assert str(sheet_index) in str(exc_info.value)
463+
assert exc_info.value.sheet_names == (sheet_name,)
464+
assert str(sheet_name) in str(exc_info.value)
463465

464466

465467
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)