Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 29 additions & 6 deletions genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import os
import re
import subprocess
from typing import List
from typing import List, Tuple

import pandas as pd
import pyranges
Expand Down Expand Up @@ -104,7 +104,7 @@


# TODO: Add to transform.py
def _to_redact_interval(df_col):
def _to_redact_interval(df_col: pd.Series) -> Tuple[pd.Series, pd.Series]:
"""
Determines year values that are "<18" and interval values >89 that need to be redacted
Returns bool because BIRTH_YEAR needs to be redacted as well based
Expand All @@ -131,7 +131,7 @@ def _to_redact_interval(df_col):


# TODO: Add to transform.py
def _redact_year(df_col):
def _redact_year(df_col: pd.Series) -> pd.Series:
"""Redacts year values that have < or >

Args:
Expand All @@ -149,8 +149,26 @@ def _redact_year(df_col):
return df_col


def _redact_ped_year(df_col: pd.Series) -> pd.Series:
    """Redact masked pediatric year values, i.e. those containing "<".

    Each value is compared by its string form, so a non-string entry is
    only redacted when its text contains "<". Missing values never match
    and are left as they are. The input series is not modified; the
    redacted values are returned in a new series.

    Args:
        df_col: Dataframe column/pandas.Series of a year column

    Returns:
        pandas.Series: Copy of ``df_col`` in which every value containing
            "<" is replaced with "withheld"

    """
    # Match on the string form so mixed int/str year columns are handled.
    contains_lessthan = df_col.astype(str).str.contains(
        "<", regex=False, na=False
    )
    # Series.mask returns a new object, leaving the caller's column untouched.
    return df_col.mask(contains_lessthan, "withheld")


# TODO: Add to transform.py
def _to_redact_difference(df_col_year1, df_col_year2):
def _to_redact_difference(
df_col_year1: pd.Series, df_col_year2: pd.Series
) -> pd.Series:
"""Determine if difference between year2 and year1 is > 89

Args:
Expand All @@ -170,8 +188,9 @@ def _to_redact_difference(df_col_year1, df_col_year2):

# TODO: Add to transform.py
def redact_phi(
clinicaldf, interval_cols_to_redact=["AGE_AT_SEQ_REPORT", "INT_CONTACT", "INT_DOD"]
):
clinicaldf: pd.DataFrame,
interval_cols_to_redact: list = ["AGE_AT_SEQ_REPORT", "INT_CONTACT", "INT_DOD"],
) -> pd.DataFrame:
"""Redacts the PHI by re-annotating the clinical file

Args:
Expand Down Expand Up @@ -205,6 +224,10 @@ def redact_phi(
)
clinicaldf.loc[to_redact, "BIRTH_YEAR"] = "cannotReleaseHIPAA"

# redact range year for pediatric data
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might have missed this in the discussion, but to confirm: YEAR_CONTACT and YEAR_DEATH are also PHI, and that's why they are being redacted as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think Xindi referred to the AC of the ticket: 'Keep standardizing the center uploaded masked data as "withheld"' and confirmed that we need to redact these two columns. I believe these are PHI, but the reason for redacting them is more about standardization.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I see. Sounds good as long as that's clearly documented, see JIRA comment

clinicaldf["YEAR_CONTACT"] = _redact_ped_year(clinicaldf["YEAR_CONTACT"])
clinicaldf["YEAR_DEATH"] = _redact_ped_year(clinicaldf["YEAR_DEATH"])

return clinicaldf


Expand Down
Loading
Loading