-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathglobal_code.py
More file actions
1873 lines (1621 loc) · 78.7 KB
/
Copy pathglobal_code.py
File metadata and controls
1873 lines (1621 loc) · 78.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import re
import numpy as np
import pandas as pd
from datetime import datetime
from functools import reduce
from io import StringIO
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import avg, col, concat, count, countDistinct, datediff, expr, first, format_string, lag, last, least, lit, log2, mean, months_between, regexp_extract, row_number, stddev, to_date, variance, when
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold, StratifiedKFold
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import shap
import matplotlib as mpl
from matplotlib import pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
def move_cols_to_front(df, cols_to_front):
    """Return ``df`` with the requested columns placed first.

    Names in ``cols_to_front`` that are not columns of ``df`` are ignored.
    The remaining columns follow in their existing order. The reordering is
    done with ``df.select``.
    """
    existing = df.columns
    leading = [name for name in cols_to_front if name in existing]
    trailing = [name for name in existing if name not in leading]
    return df.select(*leading, *trailing)
def pandas_move_cols_to_front(df, cols_to_front):
    """Return ``df`` (indexable by a list of column names) with columns reordered.

    Names in ``cols_to_front`` that exist in ``df.columns`` come first, in the
    order given; every other column follows in its existing order.
    """
    existing = df.columns
    leading = [name for name in cols_to_front if name in existing]
    trailing = [name for name in existing if name not in leading]
    return df[leading + trailing]
def rename_cols_with_postfix(df, cols, postfix):
    """Append ``'_' + postfix`` to the name of every column listed in ``cols``.

    Columns not in ``cols`` are selected unchanged; column order is preserved.
    """
    projected = []
    for name in df.columns:
        if name in cols:
            projected.append(col(name).alias(name+'_'+postfix))
        else:
            projected.append(name)
    return df.select(*projected)
def keep_one_occurrence(df, feature_columns, date_column, first=False):
    """Keep a single row per combination of ``feature_columns``.

    Rows are ranked within each partition by ``date_column`` (ascending when
    ``first`` is true, descending otherwise) and only the top-ranked row is
    kept. ``feature_columns`` may be a single column name or a list of names.
    A temporary ``row_number`` column is added and dropped again.
    """
    if isinstance(feature_columns, list):
        keys = feature_columns
    else:
        keys = [feature_columns]
    if first:
        ordering = col(date_column).asc()
    else:
        ordering = col(date_column).desc()
    ranking = Window.partitionBy(*keys).orderBy(ordering)
    ranked = df.withColumn('row_number', row_number().over(ranking))
    return ranked.where(col('row_number') == 1).drop('row_number')
def keep_last_occurrence(df, feature_columns, date_column):
    """Keep, per ``feature_columns`` group, the row with the latest ``date_column``."""
    latest = keep_one_occurrence(df, feature_columns, date_column, first=False)
    return latest
def keep_first_occurrence(df, feature_columns, date_column):
    """Keep, per ``feature_columns`` group, the row with the earliest ``date_column``."""
    earliest = keep_one_occurrence(df, feature_columns, date_column, first=True)
    return earliest
def compute_mutual_information(df, cid='concept_id', outcome='time_to_pasc', keeps=['concept_name', 'domain_id'], exclude_null_rows=True):
    """Score every value of ``cid`` by its mutual information with a binary outcome.

    A person counts as outcome-positive when ``outcome`` is non-null and > 0.
    Probabilities are estimated from distinct ``person_id`` counts: X is
    "the person has at least one row with this ``cid`` value", Y is "the
    person has at least one outcome-positive row".

    Args:
        df: DataFrame with ``person_id``, ``cid``, ``outcome`` and the
            ``keeps`` columns.
        cid: column whose distinct values are scored.
        outcome: outcome column; values > 0 are positives.
        keeps: extra descriptive columns carried through to the result
            (not mutated here).
        exclude_null_rows: drop rows whose ``cid`` is null before counting.

    Returns:
        DataFrame with ``cid``, the ``keeps`` columns, ``mutual_information``
        (rounded to 7 decimals) and the four count columns, sorted by
        ``mutual_information`` descending.
    """
    if exclude_null_rows:
        df = df.where(col(cid).isNotNull())
    is_positive = col(outcome).isNotNull() & (col(outcome) > 0)

    df_all = df.groupBy(cid) \
        .agg(count('person_id').alias('count_all'),
             countDistinct('person_id').alias('count_all_distinct_person'))
    df_pos = df.where(is_positive).groupBy(cid) \
        .agg(count('person_id').alias('count_pos'),
             countDistinct('person_id').alias('count_pos_distinct_person'))
    count_cols = ['count_all', 'count_pos', 'count_all_distinct_person', 'count_pos_distinct_person']
    stats = df_all.join(df_pos, on=cid, how='left') \
        .fillna(0, subset=count_cols) \
        .join(df.select(cid, *keeps).distinct(), on=cid, how='left')

    n = df.select('person_id').distinct().count()
    ny1 = df.where(is_positive).select('person_id').distinct().count()
    if n == 0:
        n = 1  # avoid a Python ZeroDivisionError on empty input
    py1 = ny1 / n
    py0 = 1 - py1

    def _mi_term(joint, marginal_x, marginal_y):
        # A zero joint probability contributes nothing (0 * log 0 := 0).
        # Guarding on joint > 0 also covers every zero-denominator case,
        # because the joint can never exceed either marginal; without the
        # guard a 0/0 division yields NULL and nulls the whole sum.
        ratio = col(joint) / (col(marginal_x) * marginal_y)
        return when(col(joint) > 0, col(joint) * log2(ratio + 1e-9)).otherwise(lit(0.0))

    stats = stats.withColumn('px1', col('count_all_distinct_person')/n) \
        .withColumn('px0', 1 - col('px1')) \
        .withColumn('px1y1', col('count_pos_distinct_person')/n) \
        .withColumn('px1y0', (col('count_all_distinct_person') - col('count_pos_distinct_person'))/n) \
        .withColumn('px0y1', (ny1 - col('count_pos_distinct_person'))/n) \
        .withColumn('px0y0', (n - col('count_all_distinct_person') + col('count_pos_distinct_person') - ny1)/n) \
        .withColumn('mutual_information',
                    _mi_term('px1y1', 'px1', py1) +
                    _mi_term('px1y0', 'px1', py0) +
                    _mi_term('px0y1', 'px0', py1) +
                    _mi_term('px0y0', 'px0', py0))

    ret = stats.select(cid, *keeps,
                       F.round('mutual_information', 7).alias('mutual_information'),
                       *count_cols) \
        .orderBy(col('mutual_information').desc())
    ret = ret.limit(ret.count())  # force sort
    return ret
def compute_conditional_mutual_information(df, condition_id_list, cid='concept_id', outcome='time_to_pasc', keeps=['concept_name', 'domain_id'], exclude_null_rows=True):
    """Mutual information restricted to persons who have all conditioning values.

    The input is first narrowed to persons with at least one row for every
    value in ``condition_id_list`` (matched against the ``cid`` column), and
    ``compute_mutual_information`` is then run on that sub-cohort.

    Args:
        df: DataFrame with ``person_id``, ``cid``, ``outcome`` and ``keeps``.
        condition_id_list: ``cid`` values a person must all have. An empty
            list applies no restriction.
        cid, outcome, keeps, exclude_null_rows: forwarded to
            ``compute_mutual_information``.

    Returns:
        DataFrame with ``cid``, the ``keeps`` columns and ``cmi``.
    """
    if len(condition_id_list) > 0:
        # Pivot on the configured id column (not a hard-coded 'concept_id')
        # so the conditioning agrees with the column being scored.
        per_person = df.groupBy('person_id').pivot(cid, condition_id_list).count()
        # A missing value pivots to NULL, and NULL > 0 is not true, so only
        # persons having every listed value survive the filter.
        filter_expr = ' AND '.join([f'(`{x}` > 0)' for x in condition_id_list])
        cohort = per_person.where(expr(filter_expr)).select('person_id')
        df = df.join(cohort, on='person_id')
    stats = compute_mutual_information(df, cid, outcome, keeps, exclude_null_rows)
    ret = stats.select(cid, *keeps,
                       F.round('mutual_information', 7).alias('cmi'))
    return ret
def log_feature_list(df, list_name, cid='feature_id', cname='feature_name', score='group_score'):
    """Print the features in ``df`` formatted as a Python list literal.

    The ``cid``, ``cname`` and ``score`` columns are collected with
    ``toPandas()``. Each row is printed as a quoted id followed by a comment
    holding the name and score; ids are padded towards a width of 10
    characters, so the id values must be strings.
    """
    table = df.select(cid, cname, score).toPandas()
    print('#', list_name+':', len(table))
    print(list_name+'_ids = [')
    template = " '{:s}', {} # {} ({})"
    for _, record in table.iterrows():
        feature_id = record[cid]
        padding = ' '*(10-len(feature_id))
        print(template.format(feature_id, padding, record[cname], record[score]))
    print(']\n')
# curation of concept sets and transformations for predefined tables
def get_broad_covid_diagnosis_set():
    """Return the concept ids used as the broad COVID-19 diagnosis set."""
    return [
        37311061,  # COVID-19
        704996,  # Patient meets COVID-19 laboratory diagnostic criteria
        3661408,  # Pneumonia caused by SARS-CoV-2
        45756093,  # Emergency use of U07.1 | COVID-19, virus identified
        4100065,  # Disease due to Coronaviridae
        40479642,  # Pneumonia due to Severe acute respiratory syndrome coronavirus
        439676,  # Coronavirus infection
        3655976,  # Acute hypoxemic respiratory failure due to disease caused by Severe acute respiratory syndrome coronavirus 2
        37310286,  # Infection of upper respiratory tract caused by Severe acute respiratory syndrome coronavirus 2
        3655975,  # Sepsis due to disease caused by Severe acute respiratory syndrome coronavirus 2
        3661632,  # Thrombocytopenia due to Severe acute respiratory syndrome coronavirus 2
        3661406,  # Acute respiratory distress syndrome due to disease caused by Severe acute respiratory syndrome coronavirus 2
    ]
def covid_index_from_measurement(concept, measurement, person_table, concept_set_members, use_latest=False, mark_all=False, join_person_table=False):
    """Derive COVID-positive test dates from the measurement table.

    Rows are kept when ``measurement_concept_id`` belongs to the COVID test
    codeset and ``value_as_concept_id`` belongs to the positive-result
    codeset, both looked up in ``concept_set_members``.

    Args:
        concept: concept table, used to attach ``domain_id`` and
            ``concept_name`` when ``mark_all`` is true.
        measurement: measurement table.
        person_table: table keyed by ``person_id``; only used when
            ``join_person_table`` is true.
        concept_set_members: table with ``codeset_id`` and ``concept_id``.
        use_latest: pick the newer pair of codeset ids instead of the pair
            that replicates the silver-standard covid_index.
        mark_all: if false, return one row per person with the earliest
            positive date as ``covid_index``; if true, return every positive
            row with ``date``, ``concept_id``, ``covid_test_positive`` = 1 and
            the concept's ``domain_id`` and ``concept_name``.
        join_person_table: left-join the result onto ``person_table``.
    """
    if use_latest:
        test_codeset_id = 386776576
        pos_codeset_id = 23400628  # most recent, includes 'Abnormal'
    else:  # replicates covid_index in silver standards
        test_codeset_id = 651620200  # not the latest, does not include 36032419
        pos_codeset_id = 400691529  # not the latest, does not include 'Abnormal'

    def _codeset_concept_ids(codeset_id):
        # Collect one codeset's concept ids to the driver as a Python list.
        return concept_set_members \
            .where(col("codeset_id") == codeset_id) \
            .select('concept_id').toPandas()['concept_id'].tolist()

    covid_concept_id_list = _codeset_concept_ids(test_codeset_id)
    positive_concept_id_list = _codeset_concept_ids(pos_codeset_id)

    df = measurement \
        .select('person_id', 'measurement_concept_id',
                'measurement_date', 'value_as_concept_id') \
        .where((col('measurement_concept_id').isin(covid_concept_id_list))
               & (col('value_as_concept_id').isin(positive_concept_id_list)))

    if not mark_all:
        # orderBy followed by dropDuplicates does not guarantee which row
        # survives the shuffle; an explicit min() is deterministic and also
        # ignores null dates.
        df = df.groupBy('person_id') \
            .agg(F.min('measurement_date').alias('covid_index'))
    else:
        df = df.select('person_id', col('measurement_date').alias('date'),
                       col('measurement_concept_id').alias('concept_id')) \
            .withColumn('covid_test_positive', lit(1))

    if join_person_table:
        df = person_table.join(df, on='person_id', how='left')
    if mark_all:
        # NOTE(review): the concept lookup needs 'concept_id', which only
        # exists on the mark_all path, so it is applied there. Confirm this
        # matches the intended nesting of the original statement.
        df = df.join(concept.select('concept_id', 'domain_id', 'concept_name'), on='concept_id', how='left')
    return df
def covid_index_from_concepts(merged_concepts, person_table, use_custom_covid_set=False, mark_all=False, join_person_table=False):
    """Derive COVID diagnosis dates from a merged concept table.

    Args:
        merged_concepts: table with ``person_id``, ``concept_id``,
            ``concept_name`` and ``date``.
        person_table: table keyed by ``person_id``; only used when
            ``join_person_table`` is true.
        use_custom_covid_set: match the ids from
            ``get_broad_covid_diagnosis_set()`` instead of only 37311061.
        mark_all: if false, return one row per person with the earliest
            matching date as ``concept_covid_index``; if true, return every
            matching row with ``covid_concept_positive`` = 1.
        join_person_table: left-join the result onto ``person_table``.
    """
    if use_custom_covid_set:
        covid_set = get_broad_covid_diagnosis_set()
    else:
        covid_set = [37311061]

    df = merged_concepts \
        .select('person_id', 'concept_id', 'concept_name', 'date') \
        .where(col('concept_id').isin(covid_set))

    if not mark_all:
        # orderBy followed by dropDuplicates does not guarantee that the
        # earliest row is the one kept; min() is deterministic.
        df = df.groupBy('person_id') \
            .agg(F.min('date').alias('concept_covid_index'))
    else:
        df = df.select('person_id', 'date', 'concept_id', 'concept_name') \
            .withColumn('covid_concept_positive', lit(1))

    if join_person_table:
        df = person_table.join(df, on='person_id', how='left')
    return df
def covid_index_from_condition(condition_occurrence, person_table, use_custom_covid_set=False, mark_all=False, join_person_table=False):
    """Derive COVID diagnosis dates from the condition_occurrence table.

    Args:
        condition_occurrence: table with ``person_id``,
            ``condition_concept_id`` and ``condition_start_date``.
        person_table: table keyed by ``person_id``; only used when
            ``join_person_table`` is true.
        use_custom_covid_set: match the ids from
            ``get_broad_covid_diagnosis_set()`` instead of only 37311061.
        mark_all: if false, return one row per person with the earliest
            matching start date as ``condition_covid_index``; if true, return
            all matching rows unchanged.
        join_person_table: left-join the result onto ``person_table``.
    """
    if use_custom_covid_set:
        covid_set = get_broad_covid_diagnosis_set()
    else:
        covid_set = [37311061]

    df = condition_occurrence \
        .select('person_id', 'condition_concept_id', 'condition_start_date') \
        .where(col('condition_concept_id').isin(covid_set))

    if not mark_all:
        # orderBy followed by dropDuplicates does not guarantee that the
        # earliest row is the one kept; min() is deterministic.
        df = df.groupBy('person_id') \
            .agg(F.min('condition_start_date').alias('condition_covid_index'))

    if join_person_table:
        df = person_table.join(df, on='person_id', how='left')
    return df
def create_or_add_to_custom_set(df, custom_set=None):
    """Return ``df`` on its own, or ``custom_set.union(df)`` when a set is given."""
    return df if custom_set is None else custom_set.union(df)
def custom_concept_set_from_existing_set(concept_set_members, custom_set_name, custom_set_id, existing_set_name, custom_set=None):
    """Copy the members of an existing concept set into a custom set.

    Rows of ``concept_set_members`` whose ``concept_set_name`` equals
    ``existing_set_name`` are relabelled with ``custom_set_name`` and
    ``custom_set_id``, then appended to ``custom_set`` when one is given.
    """
    members = concept_set_members.select('concept_id', 'concept_name', 'concept_set_name')
    members = members.where(col('concept_set_name') == existing_set_name)
    labelled = members.drop('concept_set_name')
    labelled = labelled.withColumn('custom_set_name', lit(custom_set_name))
    labelled = labelled.withColumn('custom_set_id', lit(custom_set_id))
    return create_or_add_to_custom_set(labelled, custom_set)
def custom_concept_set_with_ids(concept_table, set_name, set_id, concept_id_list, custom_set=None):
    """Build (or extend) a custom set from concepts whose id is in ``concept_id_list``."""
    candidates = concept_table.select('concept_id', 'concept_name')
    matched = candidates.where(col('concept_id').isin(concept_id_list))
    labelled = matched.withColumn('custom_set_name', lit(set_name))
    labelled = labelled.withColumn('custom_set_id', lit(set_id))
    return create_or_add_to_custom_set(labelled, custom_set)
def custom_concept_set_with_names(concept_table, set_name, set_id, concept_name_list, custom_set=None):
    """Build (or extend) a custom set from concepts whose name is in ``concept_name_list``."""
    candidates = concept_table.select('concept_id', 'concept_name')
    matched = candidates.where(col('concept_name').isin(concept_name_list))
    labelled = matched.withColumn('custom_set_name', lit(set_name))
    labelled = labelled.withColumn('custom_set_id', lit(set_id))
    return create_or_add_to_custom_set(labelled, custom_set)
def custom_concept_set_with_names_containing(concept_table, set_name, set_id, pattern, custom_set=None):
    """Build (or extend) a custom set from concepts whose name contains ``pattern``."""
    candidates = concept_table.select('concept_id', 'concept_name')
    matched = candidates.where(col('concept_name').contains(pattern))
    labelled = matched.withColumn('custom_set_name', lit(set_name))
    labelled = labelled.withColumn('custom_set_id', lit(set_id))
    return create_or_add_to_custom_set(labelled, custom_set)
def custom_concept_set_with_names_starting_with(concept_table, set_name, set_id, pattern, custom_set=None):
    """Build (or extend) a custom set from concepts whose name starts with ``pattern``."""
    candidates = concept_table.select('concept_id', 'concept_name')
    matched = candidates.where(col('concept_name').startswith(pattern))
    labelled = matched.withColumn('custom_set_name', lit(set_name))
    labelled = labelled.withColumn('custom_set_id', lit(set_id))
    return create_or_add_to_custom_set(labelled, custom_set)
def custom_concept_set_with_names_rlike(concept_table, set_name, set_id, pattern, custom_set=None, case_sensitive=False):
    """Build (or extend) a custom set from concepts whose name matches a regex.

    Unless ``case_sensitive`` is true, the pattern is prefixed with ``(?i)``
    before being passed to ``rlike``.
    """
    if case_sensitive:
        regex = pattern
    else:
        regex = '(?i)' + pattern
    candidates = concept_table.select('concept_id', 'concept_name')
    matched = candidates.where(col('concept_name').rlike(regex))
    labelled = matched.withColumn('custom_set_name', lit(set_name))
    labelled = labelled.withColumn('custom_set_id', lit(set_id))
    return create_or_add_to_custom_set(labelled, custom_set)
def filter_concept_set_on_domain(df, concept_table, excludes=None, includes=None):
    """Filter a concept set by ``domain_id``.

    If ``df`` has no ``domain_id`` column, it is looked up from
    ``concept_table`` by ``concept_id`` for the duration of the filter and
    dropped again afterwards. ``excludes`` removes the listed domains and
    ``includes`` keeps only the listed domains; either may be ``None``.
    """
    needs_lookup = 'domain_id' not in df.columns
    if needs_lookup:
        domains = concept_table.select('concept_id', 'domain_id')
        df = df.join(domains, on='concept_id', how='left')
    if excludes is not None:
        df = df.where(~col('domain_id').isin(excludes))
    if includes is not None:
        df = df.where(col('domain_id').isin(includes))
    if needs_lookup:
        return df.drop('domain_id')
    return df
# curation of concept sets
def get_selected_concept_set_names():
    """Return the names of the concept sets selected for use."""
    return [
        'covid19diagnosis',
        'SS_EKG_1870024',
        'Computed Tomography (CT) Scan',
        'Critical Care [oneils]',
        '[COVID19 Dx] Weak positive dx',  # 'Weak Positive (SNOMED codes)',
        'Long COVID Clinic Visit',
        'UVA Hyperlipidemia',
        'Metabolic Disorders',
        'LWW_remdesivir',
        '[DM] Aspirin',
        'Anesthesia Medications (SIANES)',
        'Immunosuppression L04',
        'Systemic Antibiotics',
        '[DATOS] Diuretics',
        '[PASC] Vasopressors',
        '[PASC] Antidepressant',
        '[RHDT] Vitamin D',
        '[RP-6B45AE]Body/muscle pain and aches',
        'pain-72',
        'gestation',
        'O2 device',
        'RHDT - Supplemental Oxygen',
        '[Cardioonc] Liquid Malignancies',
        'Lung Cancer',
        'Breast Cancer',
        'Colorectal Cancer',
        'UC - prostate cancer',
        '[DM]Type1 Diabetes Mellitus',
        '[DM]Type2 Diabetes Mellitus',
        'LongCOVIDFatigue',
        'Long Hauler symptoms from LANCET paper',
        'pasc_577',
    ]
def get_selected_bundle_dict():
    """Return the mapping from bundle id ('b001'..'b032') to bundle name."""
    bundle_names = [
        'ACE inhibitors',
        'ARBs - angiotensin II receptor blockers',
        'K-sparing diuretics',
        'Platelet inhibitors',
        'Antiarrhythmics',
        'PCSK9 inhibitors',
        'Long-acting nitrates',
        'ARNi - Angiotensin Receptor-Neprilysin Inhibitors',
        'Loop diuretics',
        'Alpha antagonists (BPH agents excluded)',
        'Thiazide/Thiazide-like diuretics',
        'Calcium channel blockers',
        'Beta blockers',
        'Minoxidil',
        'Central alpha 2 agonists',
        'Alpha antagonists',
        'Anticoagulants',
        'Hemodynamic/vasoactive agents',
        'COVID treatment - repurposed meds',
        'Chronic kidney diseases',
        'Monoclonal antibodies',
        'Cancer treatments',
        'Diabetes treatments',
        'Diabetes labs',
        'Diabetes kidney disease',
        'COVID tests',
        'COVID qualitative results',
        'COVID vaccines',
        'Blood gases',
        'Ventilation invasive treatments',
        'Glascow Coma Scale score',
        'Critical visits',
    ]
    # Ids are the 1-based position in the list, zero-padded to three digits.
    return {'b{:03d}'.format(position): name
            for position, name in enumerate(bundle_names, start=1)}
def get_selected_bundle_names():
    """Return the bundle names from ``get_selected_bundle_dict()`` as a list."""
    return list(get_selected_bundle_dict().values())
def get_additional_pasc_broad_conditions():
    """Return the extra condition concept ids added to the 'PASC broad' set."""
    additional_condition_ids = [
        77670,  # Chest pain
        444070,  # Tachycardia
        436096,  # Chronic pain
        31967,  # Nausea
        196523,  # Diarrhea
        433316,  # Dizziness and giddiness
        442752,  # Muscle pain
        442588,  # Obstructive sleep apnea syndrome
        437663,  # Fever
        436962,  # Insomnia
        75860,  # Constipation
        27674,  # Nausea and vomiting
        4169095,  # Bradycardia
        318736,  # Migraine
        314754,  # Wheezing
        4236484,  # Paresthesia
        4305080,  # Abnormal breathing
        317376,  # Tachypnea
        442165,  # Loss of appetite
        40405599,  # Fibromyalgia
        318800,  # Gastroesophageal reflux disease
        313459,  # Sleep apnea
        4195085,  # Nasal congestion
        4034235,  # Tight chest
        4182187,  # Foot swelling
        436222,  # Altered mental status
        440704,  # Chronic pain syndrome
        434173,  # Fever symptoms
        315361,  # Orthopnea
        435524,  # Sleep disorder
        135618,  # Pruritic rash
        4275423,  # Supraventricular tachycardia
        136184,  # Pruritus of skin
        377575,  # Tinnitus
        73754,  # Restless legs
        139900,  # Urticaria
        4057995,  # Excessively deep breathing
        4021339,  # Feeling suicidal
        439147,  # Amnesia
        381549,  # Migraine with aura
        4087166,  # Labored breathing
        4273391,  # Suicidal thoughts
        438134,  # Hypersomnia
        134159,  # Precordial pain
        4143064,  # Suicidal
        4266361,  # Aggressive behavior
        45763549,  # Bilateral tinnitus
        436676,  # Posttraumatic stress disorder
        4103295,  # Ventricular tachycardia
        438867,  # Generalized aches and pains
        197675,  # Incontinence of feces
        45772876,  # Suffering
        4196636,  # Dysarthria
        4027314,  # Mental health impairment
        197607,  # Excessive and frequent menstruation
        375527,  # Headache disorder
        373786,  # Abnormal vision
        4133044,  # Chest discomfort
        43531003,  # Essential tremor
        4164633,  # Clouded consciousness
        435657,  # Dyssomnia
        378165,  # Nystagmus
        4218878,  # Subjective vertigo
        443432,  # Impaired cognition
        439383,  # Vertigo
        42872394,  # Daytime somnolence
        381273,  # Confusional state
        4094008,  # Bloating symptom
        4302555,  # Menorrhagia
        433031,  # Hallucinations
        4184149,  # Feeling irritable
        43022069,  # Primary central sleep apnea
        4023572,  # Abdominal bloating
        4304008,  # Memory impairment
        4150129,  # Musculoskeletal pain
        37311082,  # Erythematous rash
        4239155,  # Diaphragmatic breathing
        372886,  # Refractory migraine with aura
        4129155,  # Vaginal bleeding
        4059915,  # Fluttering heart
        436681,  # Insomnia disorder related to known organic factor
        256717,  # Bronchospasm
        4021498,  # Panic attack
        439708,  # Disorders of initiating and maintaining sleep
        4012381,  # Restlessness
        4029498,  # Seizure disorder
        436235,  # Taste sense altered
        4187507,  # Psychomotor agitation
        194696,  # Dysmenorrhea
        4319324,  # Polymyalgia
        255348,  # Polymyalgia rheumatica
        435786,  # Disorder of sleep-wake cycle
        4011938,  # Intermittent vertigo
        140803,  # Idiopathic urticaria
        81893,  # Ulcerative colitis
        313236,  # Cough variant asthma
        4012870,  # Positional vertigo
        4086811,  # Rapid shallow breathing
        4159659,  # Postural orthostatic tachycardia syndrome
        4078201,  # Mood swings
        4150759,  # Myofascial pain
        80141,  # Functional diarrhea
        313792,  # Paroxysmal tachycardia
        4036946,  # Floaters in visual field
        439013,  # Insomnia disorder related to another mental disorder
        381278,  # Cluster headache
        45768908,  # Exercise induced bronchospasm
        4214898,  # Decreased respiratory function
        4245464,  # Flat affect
        4223938,  # Dizziness
        4096245,  # Resting tremor
        439794,  # Central sleep apnea syndrome
        381035,  # Vertigo of central origin
        75580,  # Chronic ulcerative proctitis
        4084730,  # Fidgeting
        440087,  # Parasomnia
        4302535,  # Slow shallow breathing
        437579,  # Paroxysmal ventricular tachycardia
        373463,  # Cough headache syndrome
        4327815,  # Feeling angry
        42538688,  # Chronic musculoskeletal pain
        133834,  # Atopic dermatitis
        45757810,  # Abnormal uterine bleeding
        4137754,  # Secondary dysmenorrhea
        4242106,  # Occult blood in stools
        201627,  # Abnormal vaginal bleeding
        37110753,  # Generalized rash
        4012382,  # Weakness present
        442187,  # Chronic paroxysmal hemicrania
        381864,  # Subjective tinnitus
        4012875,  # Constant vertigo
        44783805,  # Hypogeusia
        4046994,  # Transient paresthesia
        373215,  # Latent nystagmus
        3661632,  # Thrombocytopenia due to Severe acute respiratory syndrome coronavirus 2
        433225,  # Ventricular flutter
        37110488,  # Chronic insomnia
        3661406,  # Acute respiratory distress syndrome due to disease caused by Severe acute respiratory syndrome coronavirus 2
        4155092,  # C/O nasal congestion
        45765899,  # Moderate cognitive impairment
        4187714,  # Excessive somnolence
        436522,  # Irregular sleep-wake pattern
        4085732,  # Intermittent confusion
        4316217,  # Primary fibromyalgia syndrome
        4012422,  # Severe vertigo
        376698,  # Internuclear ophthalmoplegia
        317893,  # Paroxysmal supraventricular tachycardia
        439150,  # Hypersomnia with sleep apnea
        40479837,  # Chronic ulcerative colitis
        440082,  # Persistent insomnia
        4120400,  # Moody
        4168860,  # Outbursts of anger
        45773430,  # Complete fecal incontinence
        43530629,  # Chronic vertigo
        4096147,  # Poor concentration
        4198855,  # Chronic urticaria
        40480274,  # Nonsustained ventricular tachycardia
        4012874,  # Paroxysmal vertigo
        4105160,  # End-position nystagmus
        132702,  # Erythema multiforme
    ]
    return additional_condition_ids
def create_custom_concept_sets(concept_set_members, concept):
    """Assemble the custom concept sets 'x001'..'x016' into one DataFrame.

    Each set is built from existing concept sets in ``concept_set_members``
    and/or from name or id matches against the ``concept`` table. The sets are
    unioned and de-duplicated with ``distinct()``.
    """
    # x001: maintenance fluids
    fluids = custom_concept_set_from_existing_set(
        concept_set_members, 'maintenance fluids', 'x001', 'mf-fluids')
    fluids = custom_concept_set_with_names_rlike(
        concept, 'maintenance fluids', 'x001',
        'sodium chloride.*(Injection|Cartridge|Syringe)', fluids)

    # x002: sodium chloride nasal
    nasal = custom_concept_set_with_names_rlike(
        concept, 'sodium chloride nasal', 'x002',
        'sodium chloride.*(Nasal|Inhalation)')

    # x003: chest x-ray
    chest_xray = custom_concept_set_with_names_starting_with(
        concept, 'chest x-ray', 'x003', 'Radiologic examination, chest')
    chest_xray = custom_concept_set_with_names(
        concept, 'chest x-ray', 'x003',
        'Radiographic procedure of chest', chest_xray)

    # x004: x-ray (built on top of the chest x-ray rows)
    xray = custom_concept_set_with_names_starting_with(
        concept, 'x-ray', 'x004', 'Radiologic examination', chest_xray)

    # x005: outpatient visit
    outpatient = custom_concept_set_with_names_starting_with(
        concept, 'outpatient visit', 'x005',
        'Office or other outpatient visit')

    # x006: polyethylene glycol
    peg = custom_concept_set_with_names_containing(
        concept, 'polyethylene glycol', 'x006', 'polyethylene glycol')

    # x007: CT scan
    ct_scan = custom_concept_set_from_existing_set(
        concept_set_members, 'CT scan', 'x007',
        'Computed Tomography (CT) Scan')
    ct_scan = custom_concept_set_with_names_rlike(
        concept, 'CT scan', 'x007',
        '^(Computed tomograph|CT of)', ct_scan)

    # x008: CT chest
    ct_chest = custom_concept_set_with_names_rlike(
        concept, 'CT chest', 'x008',
        '^(CT|Computed tomograph).*(chest|lung)')

    # x009: depression
    depression = custom_concept_set_from_existing_set(
        concept_set_members, 'depression', 'x009',
        'ARIScience - Depression - JA')
    depression_names = [
        'Depressed mood', 'Depressive episode',
        'Adjustment disorder with depressed mood',
        'Adjustment disorder with mixed anxiety and depressed mood',
        'Feeling down, depressed or hopeless in last 2 weeks.frequency [Reported PHQ-9 CMS]',
        'Symptoms of depression']
    depression = custom_concept_set_with_names(
        concept, 'depression', 'x009', depression_names, depression)

    # x010: NSAID
    nsaid = custom_concept_set_from_existing_set(
        concept_set_members, 'NSAID', 'x010', 'NSAIDs')
    nsaid = custom_concept_set_from_existing_set(
        concept_set_members, 'NSAID', 'x010', '[PASC] NSAID', nsaid)

    # x011: COVID broad dx
    covid_broad = custom_concept_set_with_ids(
        concept, 'COVID broad dx', 'x011',
        get_broad_covid_diagnosis_set())

    # x012: PASC broad
    pasc_broad = custom_concept_set_from_existing_set(
        concept_set_members, 'PASC broad', 'x012', 'LongCOVIDFatigue')
    pasc_broad = custom_concept_set_from_existing_set(
        concept_set_members, 'PASC broad', 'x012',
        'Long Hauler symptoms from LANCET paper', pasc_broad)
    pasc_broad = custom_concept_set_with_ids(
        concept, 'PASC broad', 'x012',
        get_additional_pasc_broad_conditions(), pasc_broad)

    # x013: covid weak dx (non-drug members only)
    weak_dx = custom_concept_set_from_existing_set(
        concept_set_members, 'covid weak dx', 'x013',
        '[COVID19 Dx] Weak positive dx')
    weak_dx = filter_concept_set_on_domain(weak_dx, concept, excludes='Drug')

    # x014: drugs for covid weak dx (drug members only, plus fluticasone)
    weak_dx_drugs = custom_concept_set_from_existing_set(
        concept_set_members, 'drugs for covid weak dx', 'x014',
        '[COVID19 Dx] Weak positive dx')
    weak_dx_drugs = filter_concept_set_on_domain(weak_dx_drugs, concept, includes='Drug')
    weak_dx_drugs = custom_concept_set_with_names(
        concept, 'drugs for covid weak dx', 'x014',
        ['fluticasone'], weak_dx_drugs)

    # x015: all pain
    pain = custom_concept_set_from_existing_set(
        concept_set_members, 'all pain', 'x015', 'pain-72')
    pain_names = [
        'Characteristic of pain',
        'Pain of right knee joint',
        'Pain of left knee region',
        'Pain of knee region',
        'Pain management (specialty)',
        'Temporomandibular joint-pain-dysfunction syndrome',
        '[X]Other chest pain',
        'Joint pain in ankle and foot',
    ]
    pain = custom_concept_set_with_names(
        concept, 'all pain', 'x015', pain_names, pain)

    # x016: non-hospital visit
    non_hospital = custom_concept_set_with_names(
        concept, 'non-hospital visit', 'x016', [
            'Telehealth',
            'Non-hospital institution Visit',
        ])

    pieces = [fluids, nasal, chest_xray, xray, outpatient, peg, ct_scan,
              ct_chest, depression, nsaid, covid_broad, pasc_broad,
              weak_dx, weak_dx_drugs, pain, non_hospital]
    combined = pieces[0]
    for piece in pieces[1:]:
        combined = combined.union(piece)
    return combined.distinct()
def get_custom_concept_set_ids():
    """Return the custom concept set ids 'x001' through 'x016'."""
    return [f'x{number:03d}' for number in range(1, 17)]
def get_selected_concept_ids():
    """Return the individually selected concept ids."""
    return [
        705076,  # Post-acute COVID-19
        37311061,  # COVID-19
        37310285,  # Pneumonia caused by SARS-CoV-2 ...
        3661408,  # Pneumonia caused by SARS-CoV-2
        312437,  # Dyspnea
        4223659,  # Fatigue
        9201,  # Inpatient Visit
        9202,  # Outpatient Visit
        9203,  # Emergency Room Visit
        3005879,  # First Respiration rate Set
        3019237,  # Chief complaint - Reported
        4203942,  # Admitting diagnosis
        40217302,  # Clinical decision support mechanism ...
        3010247,  # Hospital discharge Dx
        4203722,  # Patient encounter procedure
        2313990,  # Duplex scan of extremity veins ...
        45765920,  # Never used tobacco
        45765917,  # Ex-tobacco user
        4138763,  # Acceptable pain level status
        4192791,  # Pain management (specialty)
        37016200,  # Exposure to viral disease
        36714927,  # Sequelae of infectious disease
        2786229,  # Introduction of Anti-inflammatory ...
        2787749,  # Introduction of Anti-inflammatory ...
        4307376,  # Final inpatient visit with instructions ...
        4093836,  # Glasgow coma score
        759536,  # Infectious disease (viral respiratory ...
        2313869,  # Echocardiography, transthoracic, ...
        439224,  # Allergy to drug
        1149380,  # fluticasone
        40483286,  # Critical illness myopathy
        4152283,  # Main spoken language
        5083,  # Telehealth
        42898160,  # Non-hospital institution Visit
        254761,  # Cough
        4034235,  # Tight chest
        4182187,  # Foot swelling
        434173,  # Fever symptoms
        35605482,  # 2 ML ondansetron 2 MG/ML Injection
        38004250,  # Ambulatory Radiology Clinic / Center
        725069,  # Radiologic examination, chest; 2 views
        19020053,  # acetaminophen 500 MG Oral Tablet
        4193704,  # Type 2 diabetes mellitus without complication
        37311059,  # Exposure to SARS-CoV-2
        257011,  # Acute upper respiratory infection
        4195694,  # Acute respiratory distress syndrome
        2615309,  # Non-covered item or service
        36308879,  # Never used
    ]
def get_feature_to_category_dict():
    """Return a dict mapping each known feature name to its display category."""
    features_by_category = [
        ('Demographics', [
            'year_of_birth',
            'gender_concept_id',
            'race_concept_id',
            'ethnicity_concept_id',
        ]),
        ('Data source', [
            'data_partner_id',
            'd124',
            'd399',
            'd134',
            'd888',
            'd75',
            'd526',
            'd294',
            'd798',
            'd569',
            'd828',
            'zip_id',
        ]),
        ('Visits', [
            'months_from_observation_end',
            'covid_index_to_observation_end',
            'covid_index_to_last_visit_end',
            'months_from_last_visit',
            'months_from_first_visit',
            'num_visits',
            'max_visit_length',
            'total_visit_length',
            'x005',  # X: outpatient visit
            'x016',  # X: non-hospital visit
            's809092939',  # A: Outpatient
            's231912125',  # A: Emergency Room Visits
            's122805878',  # S: Long COVID Clinic Visit
            's972465851',  # A: Hospitalization
            'b032',  # B: Critical visits
            'c9202',  # C: Outpatient Visit
            'c4203722',  # C: Patient encounter procedure
            'c38004250',  # C: Ambulatory Radiology Clinic / Center
            'c5083',  # C: Telehealth
        ]),
        ('Vaccine', [
            'vaccine_txn',
            'vax_time_to_covid',
        ]),
        ('COVID diagnosis', [
            'num_covid_episodes',
            'total_episode_length',
            'max_episode_length',
            'months_from_covid_index',
            'months_from_first_covid',
            'x013',  # X: covid weak dx
            'b026',  # B: COVID tests
        ]),
        ('Conditions', [
            'x012',  # X: PASC broad
            'x003',  # X: chest x-ray
            'x015',  # X: all pain
            'x014',  # X: drugs for covid weak dx
            'x010',  # X: NSAID
            'x001',  # X: maintenance fluids
            'x007',  # X: CT scan
            's400852949',  # A: Cough
            's348930395',  # S: LongCOVIDFatigue
            's947341641',  # A: Albuterol
            's402442153',  # A: Fatigue
            's571514014',  # A: Drug Corticosteroids Systemic
            's799270877',  # A: Hypertension
            's961458756',  # A: Respiratory Disorder
            's875380843',  # S: Metabolic Disorders
            's546116974',  # A: Inflammation Resp
            's562288165',  # A: Acute Disease
            's689261095',  # A: Dyspnea
            's313998782',  # S: gestation
            's295952371',  # A: Drug Fentanyl
            's959049707',  # S: Long Hauler symptoms from LANCET paper
            's995642183',  # S: Immunosuppression L04
            's812896616',  # A: Abdominal Pain
            's541737679',  # A: ARDS
            's672055106',  # A: Cancer
            's777909573',  # A: Electrolyte IV
            's160955543',  # A: Alcohol
            's305217555',  # A: Vaccines
            's533705532',  # A: Obesity
            's908267911',  # A: Chest Pain
            's540936013',  # A: Cerebral
            's478273027',  # A: Renal Limited
            's777788102',  # S: [PASC] Antidepressant
            's892966630',  # A: Prednisone
            's216227734',  # S: Anesthesia Medications (SIANES)
            's481594340',  # A: Vertigo Dizziness
            's967701631',  # A: Mets CCI
            's275632871',  # A: Non Smoker
            's414583656',  # A: Chemotherapy Endocrine
            's849907658',  # A: Insomnia
            's379995033',  # A: Elevated Cholesterol
            's603185723',  # A: Drug Clonazepam
            's456422283',  # A: Drug Pseudoephedrine
            's949429075',  # A: Weight Loss
            'b017',  # B: Anticoagulants
            'c36714927',  # C: Sequelae of infectious disease
            'c37016200',  # C: Exposure to viral disease
            'c3019237',  # C: Chief complaint - Reported
            'c4223659',  # C: Fatigue
            'c725069',  # C: Radiologic examination, chest; 2 views
            'c35605482',  # C: 2 ML ondansetron 2 MG/ML Injection
            'c3661408',  # C: Pneumonia caused by SARS-CoV-2
            'c254761',  # C: Cough
            'c19020053',  # C: acetaminophen 500 MG Oral Tablet
            'c257011',  # C: Acute upper respiratory infection
            'c4193704',  # C: Type 2 diabetes mellitus without complication
            'c1149380',  # C: fluticasone
        ]),
        ('Measurements', [
            '3012888',  # Diastolic blood pressure
            '3004501',  # Glucose [Mass/volume] in Serum or Plasma
            '706163',  # SARS-CoV-2 (COVID-19) RNA [Presence] in Respiratory specimen by NAA with probe detection
            '3016723',  # Creatinine [Mass/volume] in Serum or Plasma
            '3024128',  # Bilirubin.total [Mass/volume] in Serum or Plasma
            '3020891',  # Body temperature
            '3013682',  # Urea nitrogen [Mass/volume] in Serum or Plasma
            '3000963',  # Hemoglobin [Mass/volume] in Blood
            '3013762',  # Body weight Measured
            '3013650',  # Neutrophils [#/volume] in Blood by Automated count
            '3014576',  # Chloride [Moles/volume] in Serum or Plasma
            '3004249',  # Systolic blood pressure
            '40762499',  # Oxygen saturation in Arterial blood by Pulse oximetry
            '3019550',  # Sodium [Moles/volume] in Serum or Plasma
            '4099154',  # Body weight
            '4301868',  # Pulse rate
            '3024171',  # Respiratory rate
            '3004327',  # Lymphocytes [#/volume] in Blood by Automated count
            '3027018',  # Heart rate
            '3009744',  # MCHC [Mass/volume] by Automated count
            '3023103',  # Potassium [Moles/volume] in Serum or Plasma
            '3012030',  # MCH [Entitic mass] by Automated count
            '3019800',  # Troponin T.cardiac [Mass/volume] in Serum or Plasma
            '3028615',  # Eosinophils [#/volume] in Blood by Automated count
            '3023314',  # Hematocrit [Volume Fraction] of Blood by Automated count
            '3024929',  # Platelets [#/volume] in Blood by Automated count
            '3003841',  # Heart rate Peripheral artery by palpation
            '4154790',  # Diastolic blood pressure
            '3006906',  # Calcium [Mass/volume] in Serum or Plasma
            '4313591',  # Respiratory rate
        ]),
    ]
    # Categories are applied in the listed order, so a feature that appeared
    # in two groups would end up with the later category.
    feature_to_category = {}
    for category, features in features_by_category:
        for feature in features:
            feature_to_category[feature] = category
    return feature_to_category
def compute_covid_diagnostic_windows(covid_dates, silver,
                                     reinfection_threshold=92, min_covid_episode_length=7,
                                     reference_date='2023-03-31'):
    """Group each person's positive COVID dates into episodes and summarise them.

    Args:
        covid_dates: Spark DataFrame. The columns read here are ``person_id``,
            ``date``, ``covid_test_positive`` and ``covid_concept_positive``;
            any other columns are carried through unchanged.
        silver: Spark DataFrame. Only ``person_id`` and ``covid_index`` are used.
        reinfection_threshold: A gap of more than this many days between two
            consecutive positive dates of a person starts a new episode.
        min_covid_episode_length: Number of days credited to the first
            positive date of every episode.
        reference_date: Date (``'yyyy-MM-dd'``) from which
            ``months_from_first_covid`` and ``months_from_covid_index`` are
            measured. Defaults to the previously hard-coded ``'2023-03-31'``.

    Returns:
        A DataFrame ordered by ``person_id`` and ``date`` with one row per
        positive date, plus one row with a null ``date`` for every ``silver``
        person without a positive row (left join). Added columns:
        ``covid_diagnosis``, ``covid_days``, ``covid_episode``,
        ``episode_days``, ``episode_length``, ``total_episode_length``,
        ``max_episode_length``, ``num_covid_episodes``,
        ``covid_<k>_first`` / ``covid_<k>_last`` for every episode number k,
        ``months_from_first_covid`` and ``months_from_covid_index``.
    """
    # Keep only rows flagged positive by either indicator.
    positives = covid_dates \
        .where(expr('covid_test_positive > 0') | expr('covid_concept_positive > 0')) \
        .withColumn('covid_diagnosis', lit(1))

    # NOTE(review): because of the left join, a person with no positive row
    # gets a single row with a null date; its null lag counts as a "jump", so
    # it still ends up with covid_episode = 1 and
    # episode_days = min_covid_episode_length -- confirm this is intended.
    df = silver.select('person_id', 'covid_index') \
        .join(positives, on='person_id', how='left') \
        .withColumn('covid_days', datediff(col('date'), col('covid_index')))

    # Episode numbering: running count of "jumps", where a jump is the first
    # date of a person or a gap larger than reinfection_threshold days.
    person_window = Window.partitionBy('person_id').orderBy('date')
    df = df.withColumn('lag', datediff(col('date'), lag('date', 1).over(person_window))) \
        .withColumn('jump', (col('lag').isNull() | (col('lag') > reinfection_threshold)).cast('int')) \
        .withColumn('covid_episode', F.sum(col('jump')).over(person_window)) \
        .drop('jump', 'lag')

    # Running episode duration: the first date contributes
    # min_covid_episode_length, every later date adds its gap in days.
    episode_window = Window.partitionBy('person_id', 'covid_episode').orderBy('date')
    df = df.withColumn('lag', datediff(col('date'), lag('date', 1).over(episode_window))) \
        .withColumn('increment', when(col('lag').isNull(), min_covid_episode_length).otherwise(col('lag'))) \
        .withColumn('episode_days', F.sum(col('increment')).over(episode_window)) \
        .drop('increment', 'lag')

    # The running total on the latest row of an episode is the episode length.
    episode_desc_window = Window.partitionBy('person_id', 'covid_episode').orderBy(col('date').desc())
    episode_lengths = df.withColumn('row', row_number().over(episode_desc_window)) \
        .where(col('row') == 1).drop('row') \
        .select('person_id', 'covid_episode', col('episode_days').alias('episode_length'))

    person_summary = episode_lengths.groupBy('person_id').agg(
        F.sum('episode_length').alias('total_episode_length'),
        F.max('episode_length').alias('max_episode_length'),
        F.max('covid_episode').alias('num_covid_episodes'))

    df = df.join(episode_lengths, on=['person_id', 'covid_episode'], how='left') \
        .join(person_summary, on='person_id', how='left')

    # One pair of columns per episode: covid_<k>_first / covid_<k>_last.
    # min/max are used instead of first/last because row order inside a
    # grouped aggregation is not guaranteed, which made the previous
    # first('date') / last('date') results non-deterministic.
    df_episode = df.withColumn('episode_name', concat(lit('covid_'), col('covid_episode'))) \
        .groupBy('person_id').pivot('episode_name').agg(
            F.min('date').alias('first'),
            F.max('date').alias('last'))
    df = df.join(df_episode, on='person_id', how='left')

    reference = to_date(lit(reference_date))
    df = df \
        .withColumn('months_from_first_covid',
                    F.round(months_between(reference, col('covid_1_first')), 1)) \
        .withColumn('months_from_covid_index',
                    F.round(months_between(reference, col('covid_index')), 1))

    # move_cols_to_front is defined elsewhere in this file; presumably it
    # tolerates names that are absent from df (e.g. 'time_to_pasc' is not
    # created here) -- TODO confirm.
    df = move_cols_to_front(df, ['person_id', 'time_to_pasc', 'covid_days', 'date', 'covid_episode',
                                 'episode_length', 'num_covid_episodes', 'total_episode_length',
                                 'max_episode_length', 'episode_days'])
    return df.orderBy('person_id', 'date')
def map_feature_to_name(df, col='feature', df_feature=None, df_concept=None):
    """Attach feature names and/or concept names to ``df`` via left joins.

    Args:
        df: DataFrame to annotate.
        col: Name of the column of ``df`` that holds the feature string. It is
            only used as a column-name string; note that inside this function
            it shadows any module-level ``col`` helper.
        df_feature: Optional lookup table with columns ``feature_id`` and
            ``feature_name``.
        df_concept: Optional lookup table with columns ``concept_id`` and
            ``concept_name`` (and optionally ``domain_id``).

    Returns:
        ``df`` with the following added when missing and when the matching
        lookup table is given: ``feature_id`` / ``feature_name`` and
        ``concept_id`` / ``concept_name``. ``domain_id`` is added only when
        ``df`` lacks it and ``df_concept`` provides it.
    """
    if df_feature is not None:
        if 'feature_id' not in df.columns:
            # Feature ids look like one of the letters a/b/c/s/x followed by
            # digits, optionally followed by an "_suffix".
            df = df.withColumn('feature_id', regexp_extract(col, r'^([abcsx]\d+?)(_.*|)$', 1))
        if 'feature_name' not in df.columns:
            df = df.join(df_feature.select('feature_id', 'feature_name').distinct(),
                         on='feature_id', how='left')

    if df_concept is not None:
        if 'concept_id' not in df.columns:
            # Concept ids are at least four digits, optionally followed by an
            # "_suffix".
            df = df.withColumn('concept_id', regexp_extract(col, r'^(\d{4,})(_.*|)$', 1))
        if 'concept_name' not in df.columns:
            cols = ['concept_id', 'concept_name']
            if 'domain_id' not in df.columns and 'domain_id' in df_concept.columns:
                cols.append('domain_id')
            # Select exactly the columns decided above: pulling 'domain_id'
            # unconditionally fails when df_concept has no such column and
            # duplicates it when df already carries one.
            df = df.join(df_concept.select(*cols), on='concept_id', how='left')
    return df
def data_partner_id_to_onehot(person_table, subset='dp10'):
    """Pivot ``data_partner_id`` into one indicator-style column per partner.

    Args:
        person_table: DataFrame with ``person_id`` and ``data_partner_id``.
        subset: Which partner columns to produce. ``'dp10'`` (the default)
            expands to a fixed built-in list of ten partner ids; a list may
            mix ints (turned into ``'d<id>'``) and ready-made labels; ``None``
            passes no explicit value list to ``pivot``.

    Returns:
        One row per ``person_id`` with a ``d<id>`` column per pivot value
        holding the number of matching rows, with nulls replaced by 0.
        The values are 0/1 presumably because ``person_table`` has one row
        per person -- TODO confirm.
    """
    pivot_values = subset
    if pivot_values == 'dp10':
        pivot_values = [124, 399, 134, 888, 75, 526, 294, 798, 569, 828]
    if pivot_values is not None:
        # Integer ids become the same 'd<id>' labels used in the 'dp' column.
        pivot_values = [value if not isinstance(value, int) else f'd{value}'
                        for value in pivot_values]

    labelled = person_table.select('person_id', 'data_partner_id')
    labelled = labelled.withColumn('dp', concat(lit('d'), col('data_partner_id')))

    counts = labelled.groupBy('person_id').pivot('dp', pivot_values).agg(count('person_id'))
    return counts.fillna(0)
# temporal feature engineering
def temporal_engineered_concept_features(person_concept_features, pivot_cols, agg_concepts=True):
df_features = person_concept_features
distinct_cols = [] if agg_concepts else ['concept_id']
df_all = df_features \
.groupBy('person_id') \
.pivot('feature_id', pivot_cols) \
.agg(countDistinct('date', *distinct_cols))
df_all.count()
df_before = df_features.where(expr('covid_days < -7')) \
.groupBy('person_id') \
.pivot('feature_id', pivot_cols) \
.agg(countDistinct('date', *distinct_cols))
df_before = rename_cols_with_postfix(df_before, pivot_cols, 'before')
df_before.count()
df_during = df_features.where(expr('covid_days >= -7') & expr('covid_days <= 7')) \
.groupBy('person_id') \
.pivot('feature_id', pivot_cols) \
.agg(countDistinct('date', *distinct_cols))
df_during = rename_cols_with_postfix(df_during, pivot_cols, 'during')
df_during.count()
df_after = df_features.where(expr('covid_days > 7') & expr('covid_days <= 28')) \
.groupBy('person_id') \
.pivot('feature_id', pivot_cols) \
.agg(countDistinct('date', *distinct_cols))
df_after = rename_cols_with_postfix(df_after, pivot_cols, 'after')
df_after.count()
df_b1w = df_features.where(expr('covid_days >= -7') & expr('covid_days < 0')) \
.groupBy('person_id') \
.pivot('feature_id', pivot_cols) \
.agg(countDistinct('date', *distinct_cols))
df_b1w = rename_cols_with_postfix(df_b1w, pivot_cols, 'b1w')
df_b1w.count()
df_a1w = df_features.where(expr('covid_days >= 0') & expr('covid_days <= 7')) \
.groupBy('person_id') \
.pivot('feature_id', pivot_cols) \
.agg(countDistinct('date', *distinct_cols))
df_a1w = rename_cols_with_postfix(df_a1w, pivot_cols, 'a1w')
df_a1w.count()
coeff = np.log(10) / 28
df_weight = df_features.select('person_id', 'covid_days', 'feature_id').distinct() \
.withColumn('time_weight',
when(expr('covid_days > 28'), lit(10)) \
.when(expr('covid_days >= -7'), F.exp(col('covid_days') * coeff)) \
.otherwise(lit(0.5)))
df_weight.count()
df_weight_sum = df_weight \
.groupBy('person_id') \
.pivot('feature_id', pivot_cols) \
.agg(F.sum('time_weight'))
df_weight_sum = rename_cols_with_postfix(df_weight_sum, pivot_cols, 'weight_sum')
df_weight_sum.count()