diff --git a/dataownertools/clean.py b/dataownertools/clean.py new file mode 100644 index 0000000..1f90cb4 --- /dev/null +++ b/dataownertools/clean.py @@ -0,0 +1,29 @@ +import unicodedata + +def name(name): + if name is None: + return None + ascii_name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore') + return ascii_name.strip().upper().decode('ascii') + +def phone(phone): + if phone is None: + return None + return ''.join(filter(lambda x: x.isdigit(), phone.strip())) + +def address(address): + if address is None: + return None + ascii_address = unicodedata.normalize('NFKD', address).encode('ascii', 'ignore') + return ascii_address.strip().upper().decode('ascii') + +def zip(zip): + if zip is None: + return None + return zip.strip() + +def email(email): + if email is None: + return None + ascii_email = unicodedata.normalize('NFKD', email).encode('ascii', 'ignore') + return ascii_email.strip().upper().decode('ascii') \ No newline at end of file diff --git a/dataownertools/report.py b/dataownertools/report.py new file mode 100644 index 0000000..ac2cda9 --- /dev/null +++ b/dataownertools/report.py @@ -0,0 +1,26 @@ +from collections import Counter + +class Report: + def __init__(self, fields): + self.field_counters = {} + for f in fields: + self.field_counters[f] = Counter() + + def validate(self, field_name, value): + if value is None: + self.field_counters[field_name]['NULL Value'] += 1 + return + if not value.isascii(): + self.field_counters[field_name]['Contains Non-ASCII Characters'] += 1 + if not value.isprintable(): + self.field_counters[field_name]['Contains Non-printable Characters'] += 1 + if not value or value.isspace(): + self.field_counters[field_name]['Empty String'] += 1 + + def print(self): + for field, counter in self.field_counters.items(): + print(field) + print('--------------------') + for issue, count in counter.items(): + print("{}: {}".format(issue, count)) + print('') \ No newline at end of file diff --git a/extract.py b/extract.py index
d072d46..6c149c9 100644 --- a/extract.py +++ b/extract.py @@ -1,50 +1,12 @@ import csv import argparse import unicodedata +from dataownertools import clean, report from collections import Counter from random import shuffle from sqlalchemy import create_engine, MetaData, Table from sqlalchemy.sql import select -def validate(report, field, value): - if value is None: - report[field]['NULL Value'] += 1 - return - if not value.isascii(): - report[field]['Contains Non-ASCII Characters'] += 1 - if not value.isprintable(): - report[field]['Contains Non-printable Characters'] += 1 - if value.isspace(): - report[field]['Empty String'] += 1 - -def clean_name(name): - if name is None: - return None - ascii_name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore') - return ascii_name.strip().upper().decode('ascii') - -def clean_phone(phone): - if phone is None: - return None - return ''.join(filter(lambda x: x.isdigit(), phone.strip())) - -def clean_address(address): - if address is None: - return None - ascii_address = unicodedata.normalize('NFKD', address).encode('ascii', 'ignore') - return ascii_address.strip().upper().decode('ascii') - -def clean_zip(zip): - if zip is None: - return None - return zip.strip() - -def clean_email(email): - if email is None: - return None - ascii_email = unicodedata.normalize('NFKD', email).encode('ascii', 'ignore') - return ascii_email.strip().upper().decode('ascii') - def case_insensitive_lookup(row, desired_key): if row.has_key(desired_key): return row[desired_key] @@ -57,9 +19,7 @@ def case_insensitive_lookup(row, desired_key): 'household_street_address', 'household_zip', 'parent_given_name' , 'parent_family_name', 'parent_email'] -report = {} -for h in header: - report[h] = Counter() +report = report.Report(header) export_count = 0 @@ -81,34 +41,34 @@ def case_insensitive_lookup(row, desired_key): for row in results: output_row = [case_insensitive_lookup(row, 'patid')] given_name = case_insensitive_lookup(row, 'given_name') - 
validate(report, 'given_name', given_name) - output_row.append(clean_name(given_name)) + report.validate('given_name', given_name) + output_row.append(clean.name(given_name)) family_name = case_insensitive_lookup(row, 'family_name') - validate(report, 'family_name', family_name) - output_row.append(clean_name(family_name)) + report.validate('family_name', family_name) + output_row.append(clean.name(family_name)) birth_date = case_insensitive_lookup(row, 'birth_date') output_row.append(birth_date.isoformat()) sex = case_insensitive_lookup(row, 'sex') - validate(report, 'sex', sex) + report.validate('sex', sex) output_row.append(sex.strip()) phone_number = case_insensitive_lookup(row, 'household_phone') - validate(report, 'phone_number', phone_number) - output_row.append(clean_phone(phone_number)) + report.validate('phone_number', phone_number) + output_row.append(clean.phone(phone_number)) household_street_address = case_insensitive_lookup(row, 'household_street_address') - validate(report, 'household_street_address', household_street_address) - output_row.append(clean_address(household_street_address)) + report.validate('household_street_address', household_street_address) + output_row.append(clean.address(household_street_address)) household_zip = case_insensitive_lookup(row, 'household_zip') - validate(report, 'household_zip', household_zip) - output_row.append(clean_zip(household_zip)) + report.validate('household_zip', household_zip) + output_row.append(clean.zip(household_zip)) parent_given_name = case_insensitive_lookup(row, 'parent_given_name') - validate(report, 'parent_given_name', parent_given_name) - output_row.append(clean_name(parent_given_name)) + report.validate('parent_given_name', parent_given_name) + output_row.append(clean.name(parent_given_name)) parent_family_name = case_insensitive_lookup(row, 'parent_family_name') - validate(report, 'parent_family_name', parent_family_name) - output_row.append(clean_name(parent_family_name)) + 
report.validate('parent_family_name', parent_family_name) + output_row.append(clean.name(parent_family_name)) parent_email = case_insensitive_lookup(row, 'household_email') - validate(report, 'parent_email', parent_email) - output_row.append(clean_email(parent_email)) + report.validate('parent_email', parent_email) + output_row.append(clean.email(parent_email)) output_rows.append(output_row) export_count += 1 @@ -123,10 +83,4 @@ def case_insensitive_lookup(row, desired_key): print('Total records exported: {}'.format(export_count)) print('') -for field, counter in report.items(): - print(field) - print('--------------------') - for issue, count in counter.items(): - print("{}: {}".format(issue, count)) - print('') - +report.print() diff --git a/fhirextract.py b/fhirextract.py new file mode 100644 index 0000000..e29d05c --- /dev/null +++ b/fhirextract.py @@ -0,0 +1,67 @@ +import csv +import argparse +import unicodedata +import ndjson +from dataownertools import clean, report +from collections import Counter +from random import shuffle + +header = ['record_id', 'given_name', 'family_name', 'DOB', 'sex', 'phone_number', + 'household_street_address', 'household_zip', 'parent_given_name' , 'parent_family_name', + 'parent_email'] + +report = report.Report(header) + +export_count = 0 + +parser = argparse.ArgumentParser(description='Tool for extracting, validating and cleaning data for CODI PPRL') +parser.add_argument('--bulkfile', nargs=1, required=True, help='Path to bulk FHIR patient resources') +args = parser.parse_args() + +bulkfile_path = args.bulkfile[0] + +output_rows = [] + +with open(bulkfile_path) as f: + reader = ndjson.reader(f) + + # TODO: null safe search when digging through the Patient resource + for patient in reader: + patient_row = [] + record_id = patient['id'] + patient_row.append(record_id) + given_name = patient['name'][0]['given'][0] + report.validate('given_name', given_name) + patient_row.append(clean.name(given_name)) + family_name = 
patient['name'][0]['family'] + report.validate('family_name', family_name) + patient_row.append(clean.name(family_name)) + patient_row.append(patient['birthDate']) + sex = patient['gender'] + report.validate('sex', sex) + patient_row.append(sex[0].upper()) + phone_number = patient['telecom'][0]['value'] + report.validate('phone_number', phone_number) + patient_row.append(clean.phone(phone_number)) + household_street_address = patient['address'][0]['line'][0] + report.validate('household_street_address', household_street_address) + patient_row.append(clean.address(household_street_address)) + household_zip = patient['address'][0].get('postalCode') + report.validate('household_zip', household_zip) + patient_row.append(clean.zip(household_zip)) + patient_row.append("") + patient_row.append("") + patient_row.append("") + output_rows.append(patient_row) + export_count += 1 + +shuffle(output_rows) + +with open('pii.csv', 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + writer.writerow(header) + for output_row in output_rows: + writer.writerow(output_row) + +print('Total records exported: {}'.format(export_count)) +print('') diff --git a/requirements.txt b/requirements.txt index 6c39729..6c461a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ clkhash>=0.16.0 psycopg2>=2.8.3 anonlink-client>=0.1.4 ijson>=3.1.2 +ndjson>=0.3.1 \ No newline at end of file