hospital-chargemaster/data/advent-health/parse.py at master · vsoch/hospital-chargemaster · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python

import os
import xmltodict
from glob import glob
import json
import datetime
import pandas

here = os.path.dirname(os.path.abspath(__file__))
folder = os.path.basename(here)

year = datetime.datetime.today().year
output_data = os.path.join(here, 'data-latest.tsv')
output_year = os.path.join(here, 'data-%s.tsv' % year)

latest = '%s/latest' % here

# Don't continue if we don't have latest folder
if not os.path.exists(latest):
    print('%s does not have parsed data.' % folder)
    sys.exit(0)

# Don't continue if we don't have results.json
results_json = os.path.join(latest, 'records.json')
if not os.path.exists(results_json):
    print('%s does not have results.json' % folder)
    sys.exit(1)

with open(results_json, 'r') as filey:
    results = json.loads(filey.read())

columns = ['charge_code',
           'price',
           'description',
           'hospital_id',
           'filename',
           'charge_type']

df = pandas.DataFrame(columns=columns)

# Helper Functions - different formats of XML
def process_dataroot(content, df, filename):

    # Hospital name is the key that doesn't start with @
    for hospital_id in content['dataroot'].keys():
        if not hospital_id.startswith('@'):
            break

    for entry in content['dataroot'][hospital_id]:
        # ed means entry dict
        idx = df.shape[0] + 1
        ed = dict()
        for item, value in entry.items():
            if "code" in item.lower():
                ed['charge_code'] = value
            elif "description" in item.lower():
                ed['description'] = value
            elif "price" in item.lower():
                ed['price'] = value
        row = [ed['charge_code'], ed['price'], ed['description'], hospital_id, filename, "standard"]
        df.loc[idx, :] = row

    return df


def process_workbook(content, df, hospital_id, filename):
    # First row is header
    for r in range(1, len(content['Workbook']['Worksheet']['Table']['Row'])):
        idx = df.shape[0] + 1
        row = content['Workbook']['Worksheet']['Table']['Row'][r]
        description = row['Cell'][0]['Data']['#text']
        price = row['Cell'][1]['Data']['#text']
        items = [None, price, description, hospital_id, filename, "standard"]
        df.loc[idx, :] = items
    return df


seen = []
for result in results:
    filename = os.path.join(latest, result['filename'])
    if not os.path.exists(filename):
        print('%s is not found in latest folder.' % filename)
        continue

    if os.stat(filename).st_size == 0:
        print('%s is empty, skipping.' % filename)
        continue

    if result['filename'] in seen:
        continue

    seen.append(result['filename'])
    print('Parsing %s' % filename)

    # Option 1: parse an XML file
    if filename.endswith('xml'):
        with open(filename, 'r') as filey:
            content = xmltodict.parse(filey.read())

        if "dataroot" in content:
            df = process_dataroot(content, df, filename)
        elif "Workbook" in content:
            df = process_workbook(content, df, result['uri'], result['filename'])

    # Save data as we go
    print(df.shape)
    df.to_csv(output_data, sep='\t', index=False)
    df.to_csv(output_year, sep='\t', index=False)