Skip to content

Commit 5fb5743

Browse files
Merge pull request #1265 from Sage-Bionetworks/develop-fix-gs-col-mismatch-FDS-675
Fix Google Sheet column info mismatch FDS-675
2 parents 9959ea1 + 1e6edfe commit 5fb5743

File tree

1 file changed

+38
-8
lines changed

1 file changed

+38
-8
lines changed

schematic/manifest/generator.py

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1299,6 +1299,8 @@ def set_dataframe_by_url(
12991299
start_col = self._column_to_letter(len(manifest_df.columns) - num_out_of_schema_columns) # find start of out of schema columns
13001300
end_col = self._column_to_letter(len(manifest_df.columns) + 1) # find end of out of schema columns
13011301
wb.set_data_validation(start = start_col, end = end_col, condition_type = None)
1302+
1303+
13021304
# set permissions so that anyone with the link can edit
13031305
sh.share("", role="writer", type="anyone")
13041306

@@ -1463,7 +1465,7 @@ def _handle_output_format_logic(self, output_format: str = None, output_path: st
14631465
return output_file_path
14641466

14651467
# Return google sheet if sheet_url flag is raised.
1466-
elif sheet_url:
1468+
elif sheet_url:
14671469
manifest_sh = self.set_dataframe_by_url(manifest_url=empty_manifest_url, manifest_df=dataframe, out_of_schema_columns=out_of_schema_columns)
14681470
return manifest_sh.url
14691471

@@ -1521,7 +1523,6 @@ def get_manifest(
15211523
if manifest_record:
15221524
# TODO: Update or remove the warning in self.__init__() if
15231525
# you change the behavior here based on self.use_annotations
1524-
15251526
# Update df with existing manifest. Agnostic to output format
15261527
updated_df, out_of_schema_columns = self._update_dataframe_with_existing_df(empty_manifest_url=empty_manifest_url, existing_df=manifest_record[1])
15271528

@@ -1555,7 +1556,6 @@ def get_manifest(
15551556

15561557
# Update `additional_metadata` and generate manifest
15571558
manifest_url, manifest_df = self.get_manifest_with_annotations(annotations)
1558-
15591559
# Update df with existing manifest. Agnostic to output format
15601560
updated_df, out_of_schema_columns = self._update_dataframe_with_existing_df(empty_manifest_url=empty_manifest_url, existing_df=manifest_df)
15611561

@@ -1564,10 +1564,35 @@ def get_manifest(
15641564
output_path = output_path,
15651565
sheet_url = sheet_url,
15661566
empty_manifest_url=empty_manifest_url,
1567-
dataframe = manifest_df,
1567+
dataframe = updated_df,
1568+
out_of_schema_columns = out_of_schema_columns,
15681569
)
15691570
return result
15701571

1572+
def _get_end_columns(self, current_schema_headers, existing_manifest_headers, out_of_schema_columns):
1573+
"""
1574+
Gather columns to be added to the end of the manifest, and ensure entityId is at the end.
1575+
Args:
1576+
current_schema_headers: list, columns in the current manifest schema
1577+
existing_manifest_headers: list, columns in the existing manifest
1578+
out_of_schema_columns: set, columns that are in the existing manifest, but not the current schema
1579+
Returns:
1580+
end_columns: list of columns to be added to the end of the manifest.
1581+
"""
1582+
# Identify columns to add to the end of the manifest
1583+
end_columns = list(out_of_schema_columns)
1584+
1585+
# Make sure want Ids are placed at end of manifest, in given order.
1586+
for id_name in ['Uuid', 'Id', 'entityId']:
1587+
if id_name in end_columns:
1588+
end_columns.remove(id_name)
1589+
end_columns.append(id_name)
1590+
1591+
# Add entity_id to the end columns if it should be there but isn't
1592+
if 'entityId' in (current_schema_headers or existing_manfiest_headers) and 'entityId' not in end_columns:
1593+
end_columns.append('entityId')
1594+
return end_columns
1595+
15711596
def _update_dataframe_with_existing_df(self, empty_manifest_url: str, existing_df: pd.DataFrame) -> pd.DataFrame:
15721597
"""
15731598
Handle scenario when existing manifest does not match new manifest template due to changes in the data model:
@@ -1585,13 +1610,13 @@ def _update_dataframe_with_existing_df(self, empty_manifest_url: str, existing_d
15851610

15861611
# Get headers for the current schema and existing manifest df.
15871612
current_schema_headers = list(self.get_dataframe_by_url(empty_manifest_url).columns)
1588-
existing_manfiest_headers = list(existing_df.columns)
1613+
existing_manifest_headers = list(existing_df.columns)
15891614

15901615
# Find columns that exist in the current schema, but are not in the manifest being downloaded.
1591-
new_columns = self._get_missing_columns(current_schema_headers, existing_manfiest_headers)
1616+
new_columns = self._get_missing_columns(current_schema_headers, existing_manifest_headers)
15921617

15931618
# Find columns that exist in the manifest being downloaded, but not in the current schema.
1594-
out_of_schema_columns = self._get_missing_columns(existing_manfiest_headers, current_schema_headers)
1619+
out_of_schema_columns = self._get_missing_columns(existing_manifest_headers, current_schema_headers)
15951620

15961621
# clean empty columns if any are present (there should be none)
15971622
# TODO: Remove this line once we start preventing empty column names
@@ -1607,12 +1632,17 @@ def _update_dataframe_with_existing_df(self, empty_manifest_url: str, existing_d
16071632
**dict(zip(new_columns, len(new_columns) * [""]))
16081633
)
16091634

1635+
end_columns = self._get_end_columns(current_schema_headers=current_schema_headers,
1636+
existing_manifest_headers=existing_manifest_headers,
1637+
out_of_schema_columns=out_of_schema_columns)
1638+
16101639
# sort columns in the updated manifest:
16111640
# match latest schema order
16121641
# move obsolete columns to the end
16131642
updated_df = updated_df[self.sort_manifest_fields(updated_df.columns)]
1614-
updated_df = updated_df[[c for c in updated_df if c not in out_of_schema_columns] + list(out_of_schema_columns)]
16151643

1644+
# move obsolete columns to the end, with entityId at the very end
1645+
updated_df = updated_df[[c for c in updated_df if c not in end_columns] + list(end_columns)]
16161646
return updated_df, out_of_schema_columns
16171647

16181648
def _format_new_excel_column(self, worksheet, new_column_index: int, col: str):

0 commit comments

Comments
 (0)