Merge pull request #162 from unicef/feature/261155-datachecker-import-file-sheet-structure

sergey-misuk-valor · web-flow · commit cde4644ef623 · 2025-07-09T16:34:36.000+03:00
AB#261155: Datachecker import file sheet structure
diff --git a/pyproject.toml b/pyproject.toml
@@ -45,7 +45,7 @@ dependencies = [
   "flower>=2.0.1",
   "hope-flex-fields>=0.6.2",
   "hope-smart-export>=0.3",
-  "hope-smart-import>=0.4",
+  "hope-smart-import>=0.5",
   "notebook>=7.4.3",
   "numpy>=2.3",
   "openpyxl>=3.1.5",
diff --git a/src/country_workspace/datasources/rdi.py b/src/country_workspace/datasources/rdi.py
@@ -1,8 +1,8 @@
 import io
 from base64 import b64encode
-from enum import Enum
 from collections import defaultdict
 from collections.abc import Iterable, Generator
+from enum import StrEnum
 from typing import Any, Mapping, cast, NotRequired
 
 import openpyxl
@@ -12,13 +12,12 @@
 from openpyxl.drawing.image import Image as RDIImage
 
 from country_workspace.contrib.kobo.api.data.helpers import VALUE_FORMAT
+from country_workspace.datasources.utils import datetime_to_date, date_to_iso_string
 from country_workspace.models import AsyncJob, Batch, Household, Individual
 from country_workspace.utils.config import BatchNameConfig, FailIfAlienConfig
 from country_workspace.utils.fields import Record, clean_field_names
 from country_workspace.utils.functional import compose
 from country_workspace.validators.beneficiaries import validate_beneficiaries
-from country_workspace.datasources.utils import datetime_to_date, date_to_iso_string
-
 
 RDI = str | io.BytesIO
 Sheet = Iterable[Record]
@@ -39,10 +38,10 @@ class Config(BatchNameConfig, FailIfAlienConfig):
     first_line: int
 
 
-class SheetName(Enum):
-    HOUSEHOLDS: int = 0
-    INDIVIDUALS: int = 1
-    PEOPLE: int = 2
+class SheetName(StrEnum):
+    HOUSEHOLDS = "Households"
+    INDIVIDUALS = "Individuals"
+    PEOPLE = "People"
 
 
 class ColumnConfigurationError(Exception):
@@ -65,16 +64,16 @@ def __str__(self) -> str:
 
 
 class SheetNotFoundError(Exception):
-    def __init__(self, sheet_indices: int | tuple[int, ...]) -> None:
-        if isinstance(sheet_indices, int):
-            sheet_indices = (sheet_indices,)
-        super().__init__(sheet_indices)
-        self.sheet_indices = sheet_indices
+    def __init__(self, sheet_names: str | tuple[str, ...]) -> None:
+        if isinstance(sheet_names, str):
+            sheet_names = (sheet_names,)
+        super().__init__(sheet_names)
+        self.sheet_names = sheet_names
 
     def __str__(self) -> str:
-        if len(self.sheet_indices) == 1:
-            return f"Sheet with index {self.sheet_indices[0]} was not found in the provided file."
-        indices_str = ", ".join(map(str, self.sheet_indices))
+        if len(self.sheet_names) == 1:
+            return f"Sheet with index {self.sheet_names[0]} was not found in the provided file."
+        indices_str = ", ".join(map(str, self.sheet_names))
         return f"Sheets with indices {indices_str} were not found in the provided file."
 
 
@@ -175,10 +174,10 @@ def image_content(rdi_image: RDIImage) -> tuple[str | None, str]:
     return content_type, content
 
 
-def extract_images(filepath: str, *sheet_indices: int) -> Generator[Mapping[int, Mapping[int, str]], None, None]:
+def extract_images(filepath: str, *sheet_names: str) -> Generator[Mapping[int, Mapping[int, str]], None, None]:
     workbook = openpyxl.load_workbook(filepath)
-    for i in sheet_indices:
-        worksheet = workbook.worksheets[i]
+    for n in sheet_names:
+        worksheet = workbook[n]
         images: dict[int, dict[int, str]] = defaultdict(dict)
         for rdi_image in worksheet._images:
             row, column = image_location(rdi_image)
@@ -195,19 +194,19 @@ def merge_images(sheet: Sheet, sheet_images: Mapping[int, Mapping[int, str]]) ->
             yield row
 
 
-def read_sheets(config: Config, filepath: str, *sheet_indices: int) -> Generator[Sheet, None, None]:
+def read_sheets(config: Config, filepath: str, *sheet_names: str) -> Generator[Sheet, None, None]:
     cell_mapper = compose(datetime_to_date, date_to_iso_string)
     try:
-        sheets = open_xls_multi(filepath, sheets=list(sheet_indices), value_mapper=cell_mapper)
-        sheet_images = extract_images(filepath, *sheet_indices)
+        sheets = open_xls_multi(filepath, indices_or_names=list(sheet_names), value_mapper=cell_mapper)
+        sheet_images = extract_images(filepath, *sheet_names)
         for (_, sheet), images in zip(sheets, sheet_images, strict=False):
             sheet_with_images = merge_images(sheet, images)
             if config["master_detail"]:
                 yield filter_rows_with_household_pk(config, sheet_with_images)
             else:
                 yield sheet_with_images
     except IndexError as e:
-        raise SheetNotFoundError(sheet_indices) from e
+        raise SheetNotFoundError(sheet_names) from e
 
 
 def import_from_rdi(job: AsyncJob) -> dict[str, int]:
@@ -226,16 +225,14 @@ def import_from_rdi(job: AsyncJob) -> dict[str, int]:
 
 
 def _import_master_detail(job: AsyncJob, batch: Batch, config: dict) -> dict[str, int]:
-    household_sheet, individual_sheet = read_sheets(
-        config, job.file, SheetName.HOUSEHOLDS.value, SheetName.INDIVIDUALS.value
-    )
+    household_sheet, individual_sheet = read_sheets(config, job.file, SheetName.HOUSEHOLDS, SheetName.INDIVIDUALS)
     household_mapping = process_households(household_sheet, job, batch, config)
     individuals_mapping = process_beneficiaries(individual_sheet, job, batch, config, household_mapping)
     validate_beneficiaries(config, household_mapping)
     return {"household": len(household_mapping), "individual": len(individuals_mapping)}
 
 
 def _import_people_only(job: AsyncJob, batch: Batch, config: dict) -> dict[str, int]:
-    (people_sheet,) = read_sheets(config, job.file, SheetName.PEOPLE.value)
+    (people_sheet,) = read_sheets(config, job.file, SheetName.PEOPLE)
     validate_beneficiaries(config, people_mapping := process_beneficiaries(people_sheet, job, batch, config))
     return {"people": len(people_mapping)}
diff --git a/src/country_workspace/workspaces/admin/cleaners/bulk_update.py b/src/country_workspace/workspaces/admin/cleaners/bulk_update.py
@@ -198,7 +198,7 @@ def bulk_update_collection(job: AsyncJob, collection_getter: Callable[[int], Any
     errors = {}
 
     file_data = job.file.read()
-    rows = open_xls(io.BytesIO(file_data), start_at=0)
+    rows = open_xls(io.BytesIO(file_data), start_at_row=0)
     for line_number, row in enumerate(rows, start=1):
         try:
             _id = int(row.pop("id"))
diff --git a/tests/datasources/test_rdi.py b/tests/datasources/test_rdi.py
@@ -128,11 +128,11 @@ def test_household_validation_error_format() -> None:
 
 
 def test_sheet_not_found_error_format() -> None:
-    error = SheetNotFoundError(sheet_idx := 99)
-    assert str(sheet_idx) in str(error)
+    error = SheetNotFoundError(sheet_name := "first")
+    assert sheet_name in str(error)
 
-    error_multiple = SheetNotFoundError(sheet_indices := [0, 1, 99])
-    for idx in sheet_indices:
+    error_multiple = SheetNotFoundError(sheet_names := ("first", "second"))
+    for idx in sheet_names:
         assert str(idx) in str(error_multiple)
 
 
@@ -388,13 +388,13 @@ def test_extract_images(mocker: MockerFixture) -> None:
     image_content_mock = mocker.patch("country_workspace.datasources.rdi.image_content")
     image_content_mock.return_value = (content_type := "content/type", content := "content")
     image = MagicMock()
-    load_workbook_mock.return_value.worksheets.__getitem__.return_value._images = (image,)
+    load_workbook_mock.return_value.__getitem__.return_value._images = (image,)
 
-    result = list(extract_images(filepath := "test", sheet_index := 0))
+    result = list(extract_images(filepath := "test", sheet_name := "first"))
 
     assert result == [{row - 1: {column: VALUE_FORMAT.format(mimetype=content_type, content=content)}}]
     load_workbook_mock.assert_called_once_with(filepath)
-    load_workbook_mock.return_value.worksheets.__getitem__.assert_called_once_with(sheet_index)
+    load_workbook_mock.return_value.__getitem__.assert_called_once_with(sheet_name)
     image_location_mock.assert_called_once_with(image)
     image_content_mock.assert_called_once_with(image)
 
@@ -423,14 +423,14 @@ def test_read_sheets(mocker: MockerFixture, config: Config) -> None:
     merge_images_mock = mocker.patch("country_workspace.datasources.rdi.merge_images")
 
     filepath = "test"
-    sheet_index = 0
+    sheet_name = "first"
 
     if config["master_detail"]:
         filter_rows_with_household_pk_mock = mocker.patch(
             "country_workspace.datasources.rdi.filter_rows_with_household_pk"
         )
 
-    result = list(read_sheets(config, filepath, sheet_index))
+    result = list(read_sheets(config, filepath, sheet_name))
 
     if config["master_detail"]:
         assert result == [filter_rows_with_household_pk_mock.return_value]
@@ -439,8 +439,10 @@ def test_read_sheets(mocker: MockerFixture, config: Config) -> None:
         assert result == [merge_images_mock.return_value]
 
     compose_mock.assert_called_once_with(datetime_to_date_mock, date_to_iso_string_mock)
-    open_xls_multi_mock.assert_called_once_with(filepath, sheets=[sheet_index], value_mapper=compose_mock.return_value)
-    extract_images_mock.assert_called_once_with(filepath, sheet_index)
+    open_xls_multi_mock.assert_called_once_with(
+        filepath, indices_or_names=[sheet_name], value_mapper=compose_mock.return_value
+    )
+    extract_images_mock.assert_called_once_with(filepath, sheet_name)
     merge_images_mock.assert_called_once_with(sheet, images)
 
 
@@ -453,13 +455,13 @@ def test_read_sheets_sheet_not_found_error(mocker: MockerFixture, config: Config
     mocker.patch("country_workspace.datasources.rdi.extract_images")
 
     filepath = "test"
-    sheet_index = 99
+    sheet_name = "first"
 
     with pytest.raises(SheetNotFoundError) as exc_info:
-        list(read_sheets(config, filepath, sheet_index))
+        list(read_sheets(config, filepath, sheet_name))
 
-    assert exc_info.value.sheet_indices == (sheet_index,)
-    assert str(sheet_index) in str(exc_info.value)
+    assert exc_info.value.sheet_names == (sheet_name,)
+    assert str(sheet_name) in str(exc_info.value)
 
 
 @pytest.mark.parametrize(
diff --git a/uv.lock b/uv.lock