Skip to content

Commit 2f047ea

Browse files
committed
segmenter: fix EcoTaxa import errors (contour length + lowercase prefix)
1 parent be4bf6a commit 2f047ea

2 files changed

Lines changed: 27 additions & 5 deletions

File tree

segmenter/planktoscope/segmenter/__init__.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -866,11 +866,19 @@ def segment_path(self, path, ecotaxa_export):
866866
"parameters": {"kernel_size": 8, "kernel_shape": "ellipse"},
867867
}
868868

869-
# Define the name of the .zip file that will contain the images and the .tsv table for EcoTaxa
869+
# Define the name of the .zip file that will contain the images and the .tsv table for EcoTaxa.
870+
# acq_id is built downstream of sample_id (the imager directory layout
871+
# produces acq_id = "<sample_id>_<suffix>"), so naively joining them
872+
# duplicates the sample_id; strip the redundant prefix when present.
873+
# Lowercase `ecotaxa_` prefix — EcoTaxa rejects archives starting with capital E.
874+
if acquisition.startswith(sample + "_"):
875+
acq_suffix = acquisition[len(sample) + 1 :]
876+
else:
877+
acq_suffix = acquisition
870878
self.__archive_fn = os.path.join(
871879
self.__ecotaxa_path,
872880
# TODO #102 sanitize the filename to remove potential problems with spaces and special characters
873-
f"Ecotaxa_{sample}_{acquisition}.zip",
881+
f"ecotaxa_{sample}_{acq_suffix}.zip",
874882
)
875883

876884
self.__working_path = path

segmenter/planktoscope/segmenter/ecotaxa.py

Lines changed: 17 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -238,9 +238,16 @@ def ecotaxa_export(archive_filepath, metadata, image_base_path, keep_files=False
238238
return 0
239239

240240
# Concatenated sample+acq id for both TSV columns and the filename.
241+
# acq_id is built downstream of sample_id (the imager directory layout
242+
# produces acq_id = "<sample_id>_<suffix>"), so naively joining them
243+
# duplicates the sample_id; strip the redundant prefix when present.
241244
sample_id = metadata.get("sample_id", "unknown_sample").replace(" ", "_")
242245
acquisition_id = metadata.get("acq_id", "unknown_acq").replace(" ", "_")
243-
combined_id = f"{sample_id}_{acquisition_id}"
246+
if acquisition_id.startswith(sample_id + "_"):
247+
acq_suffix = acquisition_id[len(sample_id) + 1 :]
248+
else:
249+
acq_suffix = acquisition_id
250+
combined_id = f"{sample_id}_{acq_suffix}"
244251
metadata["sample_id"] = combined_id
245252
metadata["acq_id"] = combined_id
246253

@@ -254,7 +261,12 @@ def ecotaxa_export(archive_filepath, metadata, image_base_path, keep_files=False
254261
for rank, roi in enumerate(object_list, start=1):
255262
tsv_line = {}
256263
tsv_line.update(metadata)
257-
tsv_line.update(("object_" + k, v) for k, v in roi["metadata"].items())
264+
# Exclude `object_contour` from the TSV — EcoTaxa rejects fields > 250
265+
# chars and the polygon JSON exceeds that for moderately complex shapes.
266+
# The contour stays in the per-object metadata.json for the audit visualizer.
267+
tsv_line.update(
268+
("object_" + k, v) for k, v in roi["metadata"].items() if k != "contour"
269+
)
258270
tsv_line["object_id"] = roi["name"]
259271

260272
filename = roi["name"] + ".jpg"
@@ -276,7 +288,9 @@ def ecotaxa_export(archive_filepath, metadata, image_base_path, keep_files=False
276288
list(zip(tsv_content.columns, tsv_type_header))
277289
)
278290

279-
tsv_filename = f"Ecotaxa_{sample_id}_{acquisition_id}.tsv"
291+
# Lowercase `ecotaxa_` prefix — EcoTaxa rejects archives whose TSV starts
292+
# with a capital E.
293+
tsv_filename = f"ecotaxa_{combined_id}.tsv"
280294

281295
# add the tsv to the archive
282296
archive.writestr(

0 commit comments

Comments
 (0)