-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLipidomicTranscriptomic.Rmd
3585 lines (2937 loc) · 164 KB
/
LipidomicTranscriptomic.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "Lipidomic and Transcriptomic Analysis Project"
author: "Anni Liu"
date: "October 9, 2022"
output:
word_document:
fig_height: 4.5
fig_width: 4.5
html_document:
df_print: paged
pdf_document: default
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(
options(scipen = 999, digits = 4),
cache = TRUE,
error = FALSE,
message = FALSE,
warning = FALSE,
tidy.opts = list(width.cutoff = 60),
tidy = TRUE,
fig.width = 12,
fig.height = 8
)
```
# Load saved images and libraries
```{r start, eval=FALSE}
# Load image
load(file = "2022Oct9.RData")
# Save image
save.image(file = "2022Oct9.RData")
# Load libraries
easypackages::libraries("tidyverse", "ggpubr", "gridExtra", "broom", "circlize", "ComplexHeatmap", "Hmisc") # !
```
```{r}
lipid_structure <- readRDS("lipid_structure.RDS")
```
# Data preprocess for lipidomics
## Import data
```{r eval=FALSE}
# Load and wrangle lipidomics data. These lipid data include the individual lipid species and the macro lipid groups containing individual lipid species.
## PosConcSamples Data
lipid_data_1 <- readxl::read_xlsx("MSK-WCMC Data w CVs_18FEB21.xlsx", sheet = "PosConcSamples Data", range = c("A3:CX267"), col_names = TRUE) %>% rename("Sample ID" = "Match", "Replicate" = "Sample_Replicate", "Sample Name PosConc" = "Row Labels")
## NegConcSamples Data
lipid_data_2 <- readxl::read_xlsx("MSK-WCMC Data w CVs_18FEB21.xlsx", sheet = "NegConcSamples Data", range = "A3:EC267", col_names = TRUE) %>% rename("Sample ID" = "...1", "Replicate" = "...2", "Sample Name NegConc" = "Row Labels")
## PosSphingosineConc Data
lipid_data_3 <- readxl::read_xlsx("MSK-WCMC Data w CVs_18FEB21.xlsx", sheet = "PosSphingosineConc Data", range = "A2:G266", col_names = TRUE) %>% rename("Sample Name PosSphingosineConc" = "Sample")
## Merge all lipid datasets
lipid_data_full <- full_join(lipid_data_1, lipid_data_2, by = c("Sample ID" = "Sample ID", "Replicate" = "Replicate")) %>% full_join(lipid_data_3, by = c("Sample ID" = "Sample ID", "Replicate" = "Replicate"))
## Remove unnecessary columns
lipid_data_full <- lipid_data_full %>% select(-c("Replicate", "Sample Name PosConc", "Sample Name NegConc", "Sample Name PosSphingosineConc", "SampleList.File Text.x", "SampleList.File Text.y", "SampleList.File Text"))
# Number of patients
length(unique(lipid_data_full$`Sample ID`)) # There are 88 patients
# Number of lipid species
ncol(lipid_data_full) - 1 # There are 230 types of lipid species, including 13 macro lipid groups
a <- colnames(lipid_data_full)[-1]
# Print out lipid names
data.frame(Macro = a[grep("macro", a)])
data.frame(Individual = a[grep("macro", a, invert = TRUE)])
# Save cleaned data to disk
saveRDS(object = lipid_data_full, file = "lipid_data_full.RDS")
# Load cleaned lipid data
lipid_data_full <- readRDS(file = "lipid_data_full.RDS")
```
## Filter lipids - 50% rule
We use the following criteria 1 and 2 to select eligible lipids having reliable measurements.
* Criterion 1: Identify a specific type of lipid which has less than or equal to 50% (44 in count) patients who miss all 3 replicates of the lipid; we will keep this lipid in the dataset.
* Criterion 2: Identify a specific type of lipid whose inter-assay coefficient of variation (cv) are less than 0.2; we will keep this lipid in the dataset.
```{r eval=FALSE}
# Create a user-written function to select eligible lipids, using the frequency of patients who miss all 3 replicates of a lipid
select_lipid_OOR <- function(threshold_total_OOR) {
step = number_OOR = total_OOR = 0; type_OOR <- vector(length = 0)
for (type in c(2:length(lipid_data_full))) { # Loop over each lipid type
total_OOR = 0 # Clear total_OOR for next total_OOR
for (i in seq(from = 1, to = dim(lipid_data_full)[1], by = 3)) { # Loop over each group of triple replicates for a patient
while (step <= 2) {
if (lipid_data_full[i + step, type] == "OOR") {number_OOR = number_OOR + 1}
step = step + 1}
if (number_OOR == 3) {total_OOR <- total_OOR + 1}
step = number_OOR = 0 } # Clear step and number_OOR for next patient, under the same lipid column
if (total_OOR <= threshold_total_OOR) {type_OOR <- append(type_OOR, type)}}; return(type_OOR)} # Store eligible lipids in type_OOR
# Test the accuracy of the user-written function select_lipid()
# step = number_OOR = total_OOR = 0; type_OOR <- vector(length = 0)
# for (type in c(2:2)) { # Loop over the lipid 15:0:18:1 (d7) PC macro
# total_OOR = 0 # Clear total_OOR for next loop
# for (i in seq(from = 1, to = dim(lipid_data_full)[1], by = 3)) { # Loop over each group of triple replicates for a patient
# while (step <= 2) {
# if (lipid_data_full[i + step, type] == "OOR") {number_OOR = number_OOR + 1}
# step = step + 1}
# print(number_OOR)
# if (number_OOR == 3) {total_OOR <- total_OOR + 1}
# step = number_OOR = 0 } # Clear step and number_OOR for next patient, under the same lipid column
# print(total_OOR)
# if (total_OOR <= 80) {type_OOR <- append(type_OOR, type)}}
# Store the eligible lipid indices using the particular criterion
## Criterion 1: Identify a specific type of lipid which has less than or equal to 50% (44 in count) patients who miss all 3 replicates of the lipid; we will keep this lipid in the dataset.
lipid_50 <- select_lipid_OOR(threshold_total_OOR = 44)
## Create a lipid dataset containing sample id and eligible lipids
lipid_eligible <- lipid_data_full[, c(1, lipid_50)]
# Save eligible lipid data to disk
saveRDS(object = lipid_eligible, file = "lipid_eligible_50rule.RDS")
```
**Interpretation**:
* After filtering lipids using the criteria 1 and 2, 143 eligible lipids remain in our dataset.
* We exclude 87 metabolites that show missing values for 50% of participants [Cite PNAS].
```{r eval=FALSE}
# Transform OOR into NA
# Transform all character columns to numerical columns, except the column of sample id
library("naniar")
oor_fun <- function(dataset) {
newdata <- get(dataset) %>%
replace_with_na_all(condition = ~.x == "OOR") %>%
mutate(across(-c("Sample ID"), function(x) as.numeric(x))) %>%
mutate(across(-c("Sample ID"), function(x) x + 0.5))}
lipid_eligible <- oor_fun(dataset = "lipid_eligible")
# Save cleaned eligible lipid data to disk
saveRDS(object = lipid_eligible, file = "lipid_eligible.RDS")
```
## Imputation
```{r}
# Impute the missing portion of each lipid using the minimum value
lipid_eligible_MM <- lipid_eligible %>%
mutate(across(-c("Sample ID"), function(x) ifelse(is.na(x), min(x, na.rm = T), x)))
# Log transform for normalization
lipid_eligible_MM[, -1] <- log(lipid_eligible_MM[, -1])
```
## Relabel lipid species names
```{r}
label(lipid_eligible_MM$`Sample ID`) <- "Sample ID"
# SIL standards
label(lipid_eligible_MM$`15:0:18:1 (d7) PC macro`) <- "15:0/18:1(d7) PC (pos mode)"
label(lipid_eligible_MM$`17:0:14:1 PC macro`) <- "17:0/14:1 PC (pos mode)"
label(lipid_eligible_MM$`18:1 (d7) LPC macro.x`) <- "18:1(d7) LPC (pos mode)"
label(lipid_eligible_MM$`18:1 (d7) LPE macro.x`) <- "18:1(d7) LPE (pos mode)"
label(lipid_eligible_MM$`18:1 (d9) SM macro`) <- "18:1(d9) SM (pos mode)"
label(lipid_eligible_MM$`15:0-18:1_(d7)_PA macro`) <- "15:0/18:1(d7) PA (neg mode)"
label(lipid_eligible_MM$`15:0-18:1_(d7)_PE macro`) <- "15:0/18:1(d7) PE (neg mode)"
label(lipid_eligible_MM$`15:0-18:1_(d7)_PG macro`) <- "15:0/18:1(d7) PG (neg mode)"
label(lipid_eligible_MM$`15:0-18:1_(d7)_PI macro`) <- "15:0/18:1(d7) PI (neg mode)"
label(lipid_eligible_MM$`15:0-18:1_(d7)_PS macro`) <- "15:0/18:1(d7) PS (neg mode)"
label(lipid_eligible_MM$`18:1 (d7) LPC macro.y`) <- "18:1(d7) LPC (neg mode)"
label(lipid_eligible_MM$`18:1 (d7) LPE macro.y`) <- "18:1(d7) LPE (neg mode)"
# LPA
label(lipid_eligible_MM$`LPA(18:2)`) <- "LPA 18:2 (neg mode)"
# LPC
label(lipid_eligible_MM$`LPC(16:1)`) <- "LPC 16:1 (pos mode)"
label(lipid_eligible_MM$`LPC(18:0)`) <- "LPC 18:0 (pos mode)"
label(lipid_eligible_MM$`LPC(18:1)`) <- "LPC 18:1 (pos mode)"
label(lipid_eligible_MM$`LPC(18:2)`) <- "LPC 18:2 (pos mode)"
label(lipid_eligible_MM$`LPC(20:3)`) <- "LPC 20:3 (pos mode)"
label(lipid_eligible_MM$`LPC(20:4)`) <- "LPC 20:4 (pos mode)"
label(lipid_eligible_MM$`LPC(22:6)`) <- "LPC 22:6 (pos mode)"
label(lipid_eligible_MM$`LPC(O-16:0)`) <- "LPC O-16:0 (pos mode)"
label(lipid_eligible_MM$`LPC(O-18:0)`) <- "LPC O-18:0 (pos mode)"
label(lipid_eligible_MM$`LPC (16:0) ES-`) <- "LPC 16:0 (neg mode)"
label(lipid_eligible_MM$`LPC(18:0) ES-`) <- "LPC 18:0 (neg mode)"
label(lipid_eligible_MM$`LPC(18:2) ES-`) <- "LPC 18:2 (neg mode)"
label(lipid_eligible_MM$`LPC(20:4) ES-`) <- "LPC 20:4 (neg mode)"
# LPE
label(lipid_eligible_MM$`LPE(16:0)`) <- "LPE 16:0 (pos mode)"
label(lipid_eligible_MM$`LPE(16:1)`) <- "LPE 16:1 (pos mode)"
label(lipid_eligible_MM$`LPE(18:3)`) <- "LPE 18:3 (pos mode)"
label(lipid_eligible_MM$`LPE(20:1)`) <- "LPE 20:1 (pos mode)"
label(lipid_eligible_MM$`LPE(20:3)`) <- "LPE 20:3 (pos mode)"
label(lipid_eligible_MM$`LPE(20:4)`) <- "LPE 20:4 (pos mode)"
label(lipid_eligible_MM$`LPE(20:5)`) <- "LPE 20:5 (pos mode)"
label(lipid_eligible_MM$`LPE(22:4)`) <- "LPE 22:4 (pos mode)"
label(lipid_eligible_MM$`LPE(22:5)`) <- "LPE 22:5 (pos mode)"
label(lipid_eligible_MM$`LPE(22:6)`) <- "LPE 22:6 (pos mode)"
label(lipid_eligible_MM$`LPE(16:0) ES-`) <- "LPE 16:0 (neg mode)"
label(lipid_eligible_MM$`LPE(16:1) ES-`) <- "LPE 16:1 (neg mode)"
label(lipid_eligible_MM$`LPE(17:1) ES-`) <- "LPE 17:1 (neg mode)"
label(lipid_eligible_MM$`LPE(20:3) ES-`) <- "LPE 20:3 (neg mode)"
label(lipid_eligible_MM$`LPE(20:4) ES-`) <- "LPE 20:4 (neg mode)"
label(lipid_eligible_MM$`LPE(P-16:0) ES-`) <- "LPE P-16:0 (neg mode)"
label(lipid_eligible_MM$`LPE(P-18:0) ES-`) <- "LPE P-18:0 (neg mode)"
label(lipid_eligible_MM$`LPE(P-20:0) ES-`) <- "LPE P-20:0 (neg mode)"
label(lipid_eligible_MM$`LPE(P-18:1) ES-`) <- "LPE P-18:1 (neg mode)"
# LPI
label(lipid_eligible_MM$`LPI(18:0)`) <- "LPI 18:0 (neg mode)"
label(lipid_eligible_MM$`LPI(18:1)`) <- "LPI 18:1 (neg mode)"
label(lipid_eligible_MM$`LPI(18:2)`) <- "LPI 18:2 (neg mode)"
label(lipid_eligible_MM$`LPI(20:3)`) <- "LPI 20:3 (neg mode)"
label(lipid_eligible_MM$`LPI(20:4)`) <- "LPI 20:4 (neg mode)"
# PA
label(lipid_eligible_MM$`PA(16:0/16:0)`) <- "PA 16:0/16:0 (neg mode)"
label(lipid_eligible_MM$`PA(16:0/16:1)`) <- "PA 16:0/16:1 (neg mode)"
label(lipid_eligible_MM$`PA(16:0/18:0)`) <- "PA 16:0/18:0 (neg mode)"
label(lipid_eligible_MM$`PA(16:0/18:1)`) <- "PA 16:0/18:1 (neg mode)"
label(lipid_eligible_MM$`PA(17:0/14:1)`) <- "PA 17:0/14:1 (neg mode)"
# PC
label(lipid_eligible_MM$`PC (O-16:0/18:0)`) <- "PC O-16:0/18:0 (pos mode)"
label(lipid_eligible_MM$`PC (O-16:0/18:3)`) <- "PC O-16:0/18:3 (pos mode)"
label(lipid_eligible_MM$`PC (O-16:0/20:1)`) <- "PC O-16:0/20:1 (pos mode)"
label(lipid_eligible_MM$`PC (O-16:0/20:4)`) <- "PC O-16:0/20:4 (pos mode)"
label(lipid_eligible_MM$`PC (O-18:0/16:0)`) <- "PC O-18:0/16:0 (pos mode)"
label(lipid_eligible_MM$`PC (O-18:0/18:0)`) <- "PC O-18:0/18:0 (pos mode)"
label(lipid_eligible_MM$`PC (O-18:0/18:1)`) <- "PC O-18:0/18:1 (pos mode)"
label(lipid_eligible_MM$`PC (P-16:0/18:2)`) <- "PC P-16:0/18:2 (pos mode)"
label(lipid_eligible_MM$`PC (P-16:0/20:0)`) <- "PC P-16:0/20:0 (pos mode)"
label(lipid_eligible_MM$`PC(16:0/18:0).x`) <- "PC 16:0/18:0 (pos mode)"
label(lipid_eligible_MM$`PC(16:1/18:2).x`) <- "PC 16:1/18:2 (pos mode)"
label(lipid_eligible_MM$`PC(18:0/18:0).x`) <- "PC 18:0/18:0 (pos mode)"
label(lipid_eligible_MM$`PC(O-18:1/18:0)`) <- "PC O-18:1/18:0 (pos mode)"
label(lipid_eligible_MM$`PC(O-18:2/16:1)`) <- "PC O-18:2/16:1 (pos mode)"
label(lipid_eligible_MM$`PC(P-18:0/18:0)`) <- "PC P-18:0/18:0 (pos mode)"
label(lipid_eligible_MM$`PC(P-18:0/18:3)`) <- "PC P-18:0/18:3 (pos mode)"
label(lipid_eligible_MM$`PC(P-18:2/18:1)`) <- "PC P-18:2/18:1 (pos mode)"
label(lipid_eligible_MM$`PC(P-18:2/18:2)`) <- "PC P-18:2/18:2 (pos mode)"
label(lipid_eligible_MM$`PC(14:0/16:0)`) <- "PC 14:0/16:0 (neg mode)"
label(lipid_eligible_MM$`PC(14:1/16:0)`) <- "PC 14:1/16:0 (neg mode)"
label(lipid_eligible_MM$`PC(16:0/16:0).y`) <- "PC 16:0/16:0 (neg mode)"
label(lipid_eligible_MM$`PC(16:0/16:1).y`) <- "PC 16:0/16:1 (neg mode)"
label(lipid_eligible_MM$`PC(16:0/18:0).y`) <- "PC 16:0/18:0 (neg mode)"
label(lipid_eligible_MM$`PC(16:0/18:3)`) <- "PC 16:0/18:3 (neg mode)"
label(lipid_eligible_MM$`PC(16:0/20:2).y`) <- "PC 16:0/20:2 (neg mode)"
label(lipid_eligible_MM$`PC(16:0/20:3).y`) <- "PC 16:0/20:3 (neg mode)"
label(lipid_eligible_MM$`PC(16:0/20:4)`) <- "PC 16:0/20:4 (neg mode)"
label(lipid_eligible_MM$`PC(16:0/20:5)`) <- "PC 16:0/20:5 (neg mode)"
label(lipid_eligible_MM$`PC(16:1/18:0).y`) <- "PC 16:1/18:0 (neg mode)"
label(lipid_eligible_MM$`PC(16:1/18:1).y`) <- "PC 16:1/18:1 (neg mode)"
label(lipid_eligible_MM$`PC(16:1/18:2).y`) <- "PC 16:1/18:2 (neg mode)"
label(lipid_eligible_MM$`PC(18:0/18:2).y`) <- "PC 18:0/18:2 (neg mode)"
label(lipid_eligible_MM$`PC(18:0/20:3)`) <- "PC 18:0/20:3 (neg mode)"
label(lipid_eligible_MM$`PC(18:0/20:5)`) <- "PC 18:0/20:5 (neg mode)"
label(lipid_eligible_MM$`PC(18:0-22:5)`) <- "PC 18:0/22:5 (neg mode)"
label(lipid_eligible_MM$`PC(18:1/18:1)`) <- "PC 18:1/18:1 (neg mode)"
label(lipid_eligible_MM$`PC(18:1/18:2).y`) <- "PC 18:1/18:2 (neg mode)"
label(lipid_eligible_MM$`PC(18:1/20:3)`) <- "PC 18:1/20:3 (neg mode)"
label(lipid_eligible_MM$`PC(18:2/18:2)`) <- "PC 18:2/18:2 (neg mode)"
label(lipid_eligible_MM$`PC(18:0/18:1).y`) <- "PC 18:0/18:1 (neg mode)"
# PE
label(lipid_eligible_MM$`PE(16:0/18:0)`) <- "PE 16:0/18:0 (neg mode)"
label(lipid_eligible_MM$`PE(16:0/18:1)`) <- "PE 16:0/18:1 (neg mode)"
label(lipid_eligible_MM$`PE(16:0/18:2)`) <- "PE 16:0/18:2 (neg mode)"
label(lipid_eligible_MM$`PE(16:0/20:3)`) <- "PE 16:0/20:3 (neg mode)"
label(lipid_eligible_MM$`PE(16:0/20:4)`) <- "PE 16:0/20:4 (neg mode)"
label(lipid_eligible_MM$`PE(16:1/18:1)`) <- "PE 16:1/18:1 (neg mode)"
label(lipid_eligible_MM$`PE(16:1/18:2)`) <- "PE 16:1/18:2 (neg mode)"
label(lipid_eligible_MM$`PE(18:0/18:0)`) <- "PE 18:0/18:0 (neg mode)"
label(lipid_eligible_MM$`PE(18:0/18:1)`) <- "PE 18:0/18:1 (neg mode)"
label(lipid_eligible_MM$`PE(18:0/18:2)`) <- "PE 18:0/18:2 (neg mode)"
label(lipid_eligible_MM$`PE(18:0/18:3)`) <- "PE 18:0/18:3 (neg mode)"
label(lipid_eligible_MM$`PE(18:0/20:3)`) <- "PE 18:0/20:3 (neg mode)"
label(lipid_eligible_MM$`PE(18:0/20:4)`) <- "PE 18:0/20:4 (neg mode)"
label(lipid_eligible_MM$`PE(18:1/18:1)`) <- "PE 18:1/18:1 (neg mode)"
label(lipid_eligible_MM$`PE(18:1/18:2)`) <- "PE 18:1/18:2 (neg mode)"
label(lipid_eligible_MM$`PE(18:1/20:3)`) <- "PE 18:1/20:3 (neg mode)"
label(lipid_eligible_MM$`PE(18:1/20:4)`) <- "PE 18:1/20:4 (neg mode)"
# PG
label(lipid_eligible_MM$`PG(16:0/18:1)`) <- "PG 16:0/18:1 (neg mode)"
# PI
label(lipid_eligible_MM$`PI(16:0/16:1)`) <- "PI 16:0/16:1 (neg mode)"
label(lipid_eligible_MM$`PI(16:0/18:1)`) <- "PI 16:0/18:1 (neg mode)"
label(lipid_eligible_MM$`PI(16:0/18:2)`) <- "PI 16:0/18:2 (neg mode)"
label(lipid_eligible_MM$`PI(16:1/18:0)`) <- "PI 16:1/18:0 (neg mode)"
label(lipid_eligible_MM$`PI(18:0/18:1)`) <- "PI 18:0/18:1 (neg mode)"
label(lipid_eligible_MM$`PI(18:0/18:2)`) <- "PI 18:0/18:2 (neg mode)"
label(lipid_eligible_MM$`PI(18:0/20:2)`) <- "PI 18:0/20:2 (neg mode)"
label(lipid_eligible_MM$`PI(18:0/20:3)`) <- "PI 18:0/20:3 (neg mode)"
label(lipid_eligible_MM$`PI(18:1/18:1)`) <- "PI 18:1/18:1 (neg mode)"
label(lipid_eligible_MM$`PI(18:1/18:2)`) <- "PI 18:1/18:2 (neg mode)"
# PS
label(lipid_eligible_MM$`PS(18:0/18:2)`) <- "PS 18:0/18:2 (neg mode)"
label(lipid_eligible_MM$`PS(18:1/18:1)`) <- "PS 18:1/18:1 (neg mode)"
label(lipid_eligible_MM$`PS(18:0/18:1)`) <- "PS 18:0/18:1 (neg mode)"
label(lipid_eligible_MM$`PS(16:1/18:0)`) <- "PS 16:1/18:0 (neg mode)"
# SM
label(lipid_eligible_MM$`SM(d18:1/14:0)`) <- "SM d18:1/14:0 (pos mode)"
label(lipid_eligible_MM$`SM(d18:1/14:1)`) <- "SM d18:1/14:1 (pos mode)"
label(lipid_eligible_MM$`SM(d18:1/16:1)`) <- "SM d18:1/16:1 (pos mode)"
label(lipid_eligible_MM$`SM(d18:1/17:0)`) <- "SM d18:1/17:0 (pos mode)"
label(lipid_eligible_MM$`SM(d18:1/18:0)`) <- "SM d18:1/18:0 (pos mode)"
label(lipid_eligible_MM$`SM(d18:1/18:1)`) <- "SM d18:1/18:1 (pos mode)"
label(lipid_eligible_MM$`SM(d18:1/18:2)`) <- "SM d18:1/18:2 (pos mode)"
label(lipid_eligible_MM$`SM(d18:1/20:0)`) <- "SM d18:1/20:0 (pos mode)"
label(lipid_eligible_MM$`SM(d18:1/20:1)`) <- "SM d18:1/20:1 (pos mode)"
label(lipid_eligible_MM$`SM(d18:1/22:0)`) <- "SM d18:1/22:0 (pos mode)"
label(lipid_eligible_MM$`SM(d18:1/22:1)`) <- "SM d18:1/22:1 (pos mode)"
label(lipid_eligible_MM$`SM(d18:1/22:2)`) <- "SM d18:1/22:2 (pos mode)"
# SPH
label(lipid_eligible_MM$`Sphinganine (18:0)`) <- "Sphinganine 18:0 (pos mode)"
label(lipid_eligible_MM$`Sphingosine d18:1`) <- "Sphingosine d18:1 (pos mode)"
# S1P
label(lipid_eligible_MM$`Sphingosine-1:-Phosphate`) <- "Sphingosine-1-Phosphate (pos mode)"
# Check labels
# label(lipid_eligible_MM)
```
## Coefficient of variation
```{r}
# Calculate the coefficient of variation (CV, sample standard deviation / sample mean)
lipid_data_cv <- as.data.frame(matrix(data = NA,
nrow = 88,
ncol = length(names(lipid_eligible_MM))))
colnames(lipid_data_cv) <- label(lipid_eligible_MM)
patient_index = 0
for (type in c(2:length(lipid_eligible_MM))) { # Loop over each lipid type
patient_index = 0 # Clear patient index for next lipid
for (i in seq(from = 1, to = dim(lipid_eligible_MM)[1], by = 3)) { # Loop over each patient
patient_index <- patient_index + 1
lipid_data_cv[patient_index, type] = sd(c(pull(lipid_eligible_MM[i, type])[1],
pull(lipid_eligible_MM[i + 1, type])[1],
pull(lipid_eligible_MM[i + 2, type])[1]))/
mean(c(pull(lipid_eligible_MM[i, type])[1],
pull(lipid_eligible_MM[i + 1, type])[1],
pull(lipid_eligible_MM[i + 2, type]))[1])
}
}
lipid_data_cv <- lipid_data_cv %>% mutate(`Sample ID` = unique(lipid_eligible_MM$`Sample ID`))
# Show the CV for each lipid species across a total of 88 patients
lipid_mean_cv <- lipid_data_cv %>%
mutate(across(-c("Sample ID"), abs)) %>%
summarise(across(-c("Sample ID"), list(mean_cv = mean))) %>%
pivot_longer(cols = everything(), names_to = "Lipid Species", values_to = "Mean CV")
# Make a line plot (JCI supplementary figure 3)
lipid_mean_cv$group <- c(rep("SIL standard", 5),
rep("LPC", 8),
rep("LPE", 7),
rep("PC", 12),
rep("SM", 10),
rep("SIL standard", 7),
"LPA",
rep("LPC", 3),
rep("LPE", 7),
rep("LPI", 5),
rep("PA", 5),
rep("PC", 20),
rep("PE", 14),
rep("PI", 10),
rep("PS", 2),
rep("SPH", 2),
rep("S1P", 1))
lipid_mean_cv$`Lipid Species` <- colnames(lipid_data_cv)[-1] # Reassign the lipid species names
# Set the pallete
library(RColorBrewer)
getPalette <- colorRampPalette(brewer.pal(12, "Paired"))
mean_cv_plot <- ggplot(lipid_mean_cv, aes(x = `Lipid Species`, y = `Mean CV`, group = group)) +
geom_line(aes(color = group)) +
geom_point(aes(color = group)) +
geom_hline(yintercept = 0.2, linetype = "dashed", color = "red3") +
scale_y_continuous(breaks = c(0, 0.2, 0.5, 1.0), labels = c("0.0","0.2","0.5","1.0")) +
scale_color_manual(values = getPalette(length(unique(lipid_mean_cv$group)))) +
theme_bw() +
theme(legend.position = "bottom",
panel.grid.major = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(size = 4, angle = 90, hjust = 1, color = "black"),
axis.text.y = element_text(color = c("black", "red3", "black", "black"))) +
labs(color = "Lipid Class")
mean_cv_plot
ggsave("mean_cv_plot_MM.jpeg", width = 25, height = 12, units = "cm")
(197-14 + 1)/197
(197 - 3)/197
```
**Interpretation**: Except 15:0/18:1(d7) PS (neg mode), Sphingosine-1-Phosphate (pos mode), 17:0/14:1 PC (pos mode), all the remaining lipids have the mean CVs < 20% across a total of 88 patients
## Filter lipids - CV < 0.2
```{r eval=FALSE}
# Exclude 15:0-18:1_(d7)_PS macro
drop_lipid <- names(lipid_eligible_MM) %in% c("15:0-18:1_(d7)_PS macro", "Sphingosine-1:-Phosphate", "17:0:14:1 PC macro")
lipid_ffa_MM_final <- lipid_eligible_MM[!drop_lipid]
# Extract lipid species labels
lipid_label <- label(lipid_ffa_MM_final)
# Take the average of 3 replicates for each type of lipids per patient
lipid_ffa_MM_final <- lipid_ffa_MM_final %>% group_by(`Sample ID`) %>% summarise_all(mean)
# Label the variables at once
names(lipid_label) <- NULL
label(lipid_ffa_MM_final) <- as.list(lipid_label)
```
**Summary**:
* Using the criteria 1 and 2, we identify 139 types of lipid species which have 1) $\leq$ 50% patients who miss all 3 replicates and 2) mean CV < 20%.
## Integrate the final lipid dataset with the clinical dataset
```{r eval=FALSE, warning=FALSE}
# Extract unique sample id
lipid_sample_id <- unique(lipid_ffa_MM_final$`Sample ID`)
# Load and filter the clinical dataset using sample id affiliated to the lipid dataset
clinical_data <- read_csv("dat_DEXA_qPCR_Blood_20190711.csv") %>%
rename(`Sample ID` = StudyID) %>%
filter(`Sample ID` %in% lipid_sample_id)
# Merge the clinical dataset with the final lipid dataset
data_full <- merge(x = clinical_data,
y = lipid_ffa_MM_final,
by.x = "Sample ID",
by.y = "Sample ID")
```
## Standardize lipid species names [outdated]
```{r}
# Relabel lipid species names in accordance to standardized shorthand notation for lipid structures from the Journal of Lipid Research
# SIL standards
label(lipid_ffa_MM_final$`15:0:18:1 (d7) PC macro`) <- "15:0/18:1(d7) PC (pos mode)"
label(lipid_ffa_MM_final$`18:1 (d7) LPC macro.x`) <- "18:1(d7) LPC (pos mode)"
label(lipid_ffa_MM_final$`18:1 (d7) LPE macro.x`) <- "18:1(d7) LPE (pos mode)"
label(lipid_ffa_MM_final$`18:1 (d9) SM macro`) <- "18:1(d9) SM (pos mode)"
label(lipid_ffa_MM_final$`15:0-18:1_(d7)_PA macro`) <- "15:0/18:1(d7) PA (neg mode)"
label(lipid_ffa_MM_final$`15:0-18:1_(d7)_PE macro`) <- "15:0/18:1(d7) PE (neg mode)"
label(lipid_ffa_MM_final$`15:0-18:1_(d7)_PG macro`) <- "15:0/18:1(d7) PG (neg mode)"
label(lipid_ffa_MM_final$`15:0-18:1_(d7)_PI macro`) <- "15:0/18:1(d7) PI (neg mode)"
label(lipid_ffa_MM_final$`18:1 (d7) LPC macro.y`) <- "18:1(d7) LPC (neg mode)"
label(lipid_ffa_MM_final$`18:1 (d7) LPE macro.y`) <- "18:1(d7) LPE (neg mode)"
# LPA
label(lipid_ffa_MM_final$`LPA(18:2)`) <- "LPA 18:2 (neg mode)"
# LPC
label(lipid_ffa_MM_final$`LPC(16:1)`) <- "LPC 16:1 (pos mode)"
label(lipid_ffa_MM_final$`LPC(18:0)`) <- "LPC 18:0 (pos mode)"
label(lipid_ffa_MM_final$`LPC(18:1)`) <- "LPC 18:1 (pos mode)"
label(lipid_ffa_MM_final$`LPC(18:2)`) <- "LPC 18:2 (pos mode)"
label(lipid_ffa_MM_final$`LPC(20:3)`) <- "LPC 20:3 (pos mode)"
label(lipid_ffa_MM_final$`LPC(20:4)`) <- "LPC 20:4 (pos mode)"
label(lipid_ffa_MM_final$`LPC(22:6)`) <- "LPC 22:6 (pos mode)"
label(lipid_ffa_MM_final$`LPC(O-18:0)`) <- "LPC O-18:0 (pos mode)"
label(lipid_ffa_MM_final$`LPC (16:0) ES-`) <- "LPC 16:0 (neg mode)"
label(lipid_ffa_MM_final$`LPC(18:0) ES-`) <- "LPC 18:0 (neg mode)"
label(lipid_ffa_MM_final$`LPC(18:2) ES-`) <- "LPC 18:2 (neg mode)"
# LPE
label(lipid_ffa_MM_final$`LPE(16:1)`) <- "LPE 16:1 (pos mode)"
label(lipid_ffa_MM_final$`LPE(18:3)`) <- "LPE 18:3 (pos mode)"
label(lipid_ffa_MM_final$`LPE(20:3)`) <- "LPE 20:3 (pos mode)"
label(lipid_ffa_MM_final$`LPE(20:4)`) <- "LPE 20:4 (pos mode)"
label(lipid_ffa_MM_final$`LPE(20:5)`) <- "LPE 20:5 (pos mode)"
label(lipid_ffa_MM_final$`LPE(22:5)`) <- "LPE 22:5 (pos mode)"
label(lipid_ffa_MM_final$`LPE(22:6)`) <- "LPE 22:6 (pos mode)"
label(lipid_ffa_MM_final$`LPE(17:1) ES-`) <- "LPE 17:1 (neg mode)"
label(lipid_ffa_MM_final$`LPE(20:3) ES-`) <- "LPE 20:3 (neg mode)"
label(lipid_ffa_MM_final$`LPE(20:4) ES-`) <- "LPE 20:4 (neg mode)"
label(lipid_ffa_MM_final$`LPE(P-16:0) ES-`) <- "LPE P-16:0 (neg mode)"
label(lipid_ffa_MM_final$`LPE(P-18:0) ES-`) <- "LPE P-18:0 (neg mode)"
label(lipid_ffa_MM_final$`LPE(P-20:0) ES-`) <- "LPE P-20:0 (neg mode)"
# LPI
label(lipid_ffa_MM_final$`LPI(18:0)`) <- "LPI 18:0 (neg mode)"
label(lipid_ffa_MM_final$`LPI(18:1)`) <- "LPI 18:1 (neg mode)"
label(lipid_ffa_MM_final$`LPI(18:2)`) <- "LPI 18:2 (neg mode)"
label(lipid_ffa_MM_final$`LPI(20:3)`) <- "LPI 20:3 (neg mode)"
label(lipid_ffa_MM_final$`LPI(20:4)`) <- "LPI 20:4 (neg mode)"
# PA
label(lipid_ffa_MM_final$`PA(16:0/16:0)`) <- "PA 16:0/16:0 (neg mode)"
label(lipid_ffa_MM_final$`PA(16:0/16:1)`) <- "PA 16:0/16:1 (neg mode)"
label(lipid_ffa_MM_final$`PA(16:0/18:0)`) <- "PA 16:0/18:0 (neg mode)"
label(lipid_ffa_MM_final$`PA(16:0/18:1)`) <- "PA 16:0/18:1 (neg mode)"
label(lipid_ffa_MM_final$`PA(17:0/14:1)`) <- "PA 17:0/14:1 (neg mode)"
# PC
label(lipid_ffa_MM_final$`PC (O-16:0/18:0)`) <- "PC O-16:0/18:0 (pos mode)"
label(lipid_ffa_MM_final$`PC (O-16:0/20:1)`) <- "PC O-16:0/20:1 (pos mode)"
label(lipid_ffa_MM_final$`PC (O-16:0/20:4)`) <- "PC O-16:0/20:4 (pos mode)"
label(lipid_ffa_MM_final$`PC (O-18:0/16:0)`) <- "PC O-18:0/16:0 (pos mode)"
label(lipid_ffa_MM_final$`PC (O-18:0/18:1)`) <- "PC O-18:0/18:1 (pos mode)"
label(lipid_ffa_MM_final$`PC (P-16:0/20:0)`) <- "PC P-16:0/20:0 (pos mode)"
label(lipid_ffa_MM_final$`PC(16:1/18:2).x`) <- "PC 16:1/18:2 (pos mode)"
label(lipid_ffa_MM_final$`PC(18:0/18:0).x`) <- "PC 18:0/18:0 (pos mode)"
label(lipid_ffa_MM_final$`PC(O-18:1/18:0)`) <- "PC O-18:1/18:0 (pos mode)"
label(lipid_ffa_MM_final$`PC(P-18:0/18:0)`) <- "PC P-18:0/18:0 (pos mode)"
label(lipid_ffa_MM_final$`PC(P-18:0/18:3)`) <- "PC P-18:0/18:3 (pos mode)"
label(lipid_ffa_MM_final$`PC(P-18:2/18:1)`) <- "PC P-18:2/18:1 (pos mode)"
label(lipid_ffa_MM_final$`PC(14:0/16:0)`) <- "PC 14:0/16:0 (neg mode)"
label(lipid_ffa_MM_final$`PC(14:1/16:0)`) <- "PC 14:1/16:0 (neg mode)"
label(lipid_ffa_MM_final$`PC(16:0/16:0).y`) <- "PC 16:0/16:0 (neg mode)"
label(lipid_ffa_MM_final$`PC(16:0/16:1).y`) <- "PC 16:0/16:1 (neg mode)"
label(lipid_ffa_MM_final$`PC(16:0/18:0).y`) <- "PC 16:0/18:0 (neg mode)"
label(lipid_ffa_MM_final$`PC(16:0/18:3)`) <- "PC 16:0/18:3 (neg mode)"
label(lipid_ffa_MM_final$`PC(16:0/20:2).y`) <- "PC 16:0/20:2 (neg mode)"
label(lipid_ffa_MM_final$`PC(16:0/20:3).y`) <- "PC 16:0/20:3 (neg mode)"
label(lipid_ffa_MM_final$`PC(16:0/20:4)`) <- "PC 16:0/20:4 (neg mode)"
label(lipid_ffa_MM_final$`PC(16:0/20:5)`) <- "PC 16:0/20:5 (neg mode)"
label(lipid_ffa_MM_final$`PC(16:1/18:1).y`) <- "PC 16:1/18:1 (neg mode)"
label(lipid_ffa_MM_final$`PC(16:1/18:2).y`) <- "PC 16:1/18:2 (neg mode)"
label(lipid_ffa_MM_final$`PC(18:0/20:3)`) <- "PC 18:0/20:3 (neg mode)"
label(lipid_ffa_MM_final$`PC(18:0/20:5)`) <- "PC 18:0/20:5 (neg mode)"
label(lipid_ffa_MM_final$`PC(18:0-22:5)`) <- "PC 18:0/22:5 (neg mode)"
label(lipid_ffa_MM_final$`PC(18:1/18:1)`) <- "PC 18:1/18:1 (neg mode)"
label(lipid_ffa_MM_final$`PC(18:1/18:2).y`) <- "PC 18:1/18:2 (neg mode)"
label(lipid_ffa_MM_final$`PC(18:1/20:3)`) <- "PC 18:1/20:3 (neg mode)"
label(lipid_ffa_MM_final$`PC(18:2/18:2)`) <- "PC 18:2/18:2 (neg mode)"
# PE
label(lipid_ffa_MM_final$`PE(16:0/18:0)`) <- "PE 16:0/18:0 (neg mode)"
label(lipid_ffa_MM_final$`PE(16:0/18:1)`) <- "PE 16:0/18:1 (neg mode)"
label(lipid_ffa_MM_final$`PE(16:0/18:2)`) <- "PE 16:0/18:2 (neg mode)"
label(lipid_ffa_MM_final$`PE(16:0/20:3)`) <- "PE 16:0/20:3 (neg mode)"
label(lipid_ffa_MM_final$`PE(16:0/20:4)`) <- "PE 16:0/20:4 (neg mode)"
label(lipid_ffa_MM_final$`PE(18:0/18:0)`) <- "PE 18:0/18:0 (neg mode)"
label(lipid_ffa_MM_final$`PE(18:0/18:1)`) <- "PE 18:0/18:1 (neg mode)"
label(lipid_ffa_MM_final$`PE(18:0/18:2)`) <- "PE 18:0/18:2 (neg mode)"
label(lipid_ffa_MM_final$`PE(18:0/20:3)`) <- "PE 18:0/20:3 (neg mode)"
label(lipid_ffa_MM_final$`PE(18:0/20:4)`) <- "PE 18:0/20:4 (neg mode)"
label(lipid_ffa_MM_final$`PE(18:1/18:1)`) <- "PE 18:1/18:1 (neg mode)"
label(lipid_ffa_MM_final$`PE(18:1/18:2)`) <- "PE 18:1/18:2 (neg mode)"
label(lipid_ffa_MM_final$`PE(18:1/20:3)`) <- "PE 18:1/20:3 (neg mode)"
label(lipid_ffa_MM_final$`PE(18:1/20:4)`) <- "PE 18:1/20:4 (neg mode)"
# PI
label(lipid_ffa_MM_final$`PI(16:0/16:1)`) <- "PI 16:0/16:1 (neg mode)"
label(lipid_ffa_MM_final$`PI(16:0/18:1)`) <- "PI 16:0/18:1 (neg mode)"
label(lipid_ffa_MM_final$`PI(16:0/18:2)`) <- "PI 16:0/18:2 (neg mode)"
label(lipid_ffa_MM_final$`PI(16:1/18:0)`) <- "PI 16:1/18:0 (neg mode)"
label(lipid_ffa_MM_final$`PI(18:0/18:1)`) <- "PI 18:0/18:1 (neg mode)"
label(lipid_ffa_MM_final$`PI(18:0/18:2)`) <- "PI 18:0/18:2 (neg mode)"
label(lipid_ffa_MM_final$`PI(18:0/20:2)`) <- "PI 18:0/20:2 (neg mode)"
label(lipid_ffa_MM_final$`PI(18:0/20:3)`) <- "PI 18:0/20:3 (neg mode)"
label(lipid_ffa_MM_final$`PI(18:1/18:1)`) <- "PI 18:1/18:1 (neg mode)"
label(lipid_ffa_MM_final$`PI(18:1/18:2)`) <- "PI 18:1/18:2 (neg mode)"
# SM
label(lipid_ffa_MM_final$`SM(d18:1/14:0)`) <- "SM d18:1/14:0 (pos mode)"
label(lipid_ffa_MM_final$`SM(d18:1/16:1)`) <- "SM d18:1/16:1 (pos mode)"
label(lipid_ffa_MM_final$`SM(d18:1/17:0)`) <- "SM d18:1/17:0 (pos mode)"
label(lipid_ffa_MM_final$`SM(d18:1/18:0)`) <- "SM d18:1/18:0 (pos mode)"
label(lipid_ffa_MM_final$`SM(d18:1/18:1)`) <- "SM d18:1/18:1 (pos mode)"
label(lipid_ffa_MM_final$`SM(d18:1/20:0)`) <- "SM d18:1/20:0 (pos mode)"
label(lipid_ffa_MM_final$`SM(d18:1/20:1)`) <- "SM d18:1/20:1 (pos mode)"
label(lipid_ffa_MM_final$`SM(d18:1/22:0)`) <- "SM d18:1/22:0 (pos mode)"
label(lipid_ffa_MM_final$`SM(d18:1/22:1)`) <- "SM d18:1/22:1 (pos mode)"
label(lipid_ffa_MM_final$`SM(d18:1/22:2)`) <- "SM d18:1/22:2 (pos mode)"
# SPH
label(lipid_ffa_MM_final$`Sphinganine (18:0)`) <- "Sphinganine 18:0 (pos mode)"
label(lipid_ffa_MM_final$`Sphingosine d18:1`) <- "Sphingosine d18:1 (pos mode)"
```
# Data preprocess for free fatty acids and bile acids
## Import data
```{r}
ffa_data <- readxl::read_xlsx("MSK-WCMC Data FFA_Response_3MAR21.xlsx",
sheet = "FFA_Area_Response_Raw",
range = c("A2:BG266"),
col_names = TRUE) %>%
rename("Sample ID" = "...1") %>%
select(-c("...2", "Row Labels", "SampleList.File Text"))
```
## Relabel analytes
```{r}
label(ffa_data$`Sample ID`) <- "Sample ID"
## Bile acids
label(ffa_data$`Chenodeoxycholic Acid`) <- "Chenodeoxycholic acid"
label(ffa_data$`Cholic Acid`) <- "Cholic acid"
label(ffa_data$`Deoxycholic Acid`) <- "Deoxycholic acid"
label(ffa_data$`Glycochenodeoxycholic acid`) <- "Glycochenodeoxycholic acid"
label(ffa_data$`Glycocholic Acid`) <- "Glycocholic acid"
label(ffa_data$`Glycodeoxycholic acid`) <- "Glycodeoxycholic acid"
label(ffa_data$`Glycolithocholic Acid`) <- "Glycolithocholic acid"
label(ffa_data$`Glycoursodeoxycholic acid`) <- "Glycoursodeoxycholic acid"
label(ffa_data$`Lithocholic Acid`) <- "Lithocholic acid"
label(ffa_data$`Taurochenodeoxycholic acid`) <- "Taurochenodeoxycholic acid"
label(ffa_data$`Taurocholic Acid`) <- "Taurocholic acid"
label(ffa_data$`Taurocholic Acid (isobaric peak 1)`) <- "Taurocholic acid (isobaric peak 1)"
label(ffa_data$`Taurocholic Acid (isobaric peak 2)`) <- "Taurocholic acid (isobaric peak 2)"
label(ffa_data$`Taurocholic Acid (isobaric peak 3)`) <- "Taurocholic acid (isobaric peak 3)"
label(ffa_data$`Taurodeoxycholic acid`) <- "Taurodeoxycholic acid"
label(ffa_data$`Taurodeoxycholic acid (isobaric peak 1)`) <- "Taurodeoxycholic acid (isobaric peak 1)"
label(ffa_data$`Taurolithocholic acid`) <- "Taurolithocholic acid"
label(ffa_data$`Tauroursodeoxycholic acid`) <- "Tauroursodeoxycholic acid"
label(ffa_data$`Ursodeoxycholic Acid`) <- "Ursodeoxycholic acid"
## Free fatty acid
label(ffa_data$`FFA 24_0`) <- "FA 24:0"
label(ffa_data$`FFA 24_1`) <- "FA 24:1"
label(ffa_data$`FFA 24_2`) <- "FA 24:2"
label(ffa_data$`FFA 24_3`) <- "FA 24:3"
label(ffa_data$`FFA 24_4`) <- "FA 24:4"
label(ffa_data$`FFA 24_5`) <- "FA 24:5"
label(ffa_data$`FFA_10_0`) <- "FA 10:0"
label(ffa_data$`FFA_10_1`) <- "FA 10:1"
label(ffa_data$`FFA_10_2`) <- "FA 10:2"
label(ffa_data$`FFA_12_0`) <- "FA 12:0"
label(ffa_data$`FFA_12_1`) <- "FA 12:1"
label(ffa_data$`FFA_14_0`) <- "FA 14:0"
label(ffa_data$`FFA_14_1`) <- "FA 14:1"
label(ffa_data$`FFA_16_0`) <- "FA 16:0"
label(ffa_data$`FFA_16_1`) <- "FA 16:1"
label(ffa_data$`FFA_16_2`) <- "FA 16:2"
label(ffa_data$`FFA_17_0`) <- "FA 17:0"
label(ffa_data$`FFA_17_1`) <- "FA 17:1"
label(ffa_data$`FFA_18_0`) <- "FA 18:0"
label(ffa_data$`FFA_18_1`) <- "FA 18:1"
label(ffa_data$`FFA_18_2`) <- "FA 18:2"
label(ffa_data$`FFA_18_3`) <- "FA 18:3"
label(ffa_data$`FFA_18_4`) <- "FA 18:4"
label(ffa_data$`FFA_20_0`) <- "FA 20:0"
label(ffa_data$`FFA_20_1`) <- "FA 20:1"
label(ffa_data$`FFA_20_2`) <- "FA 20:2"
label(ffa_data$`FFA_20_3`) <- "FA 20:3"
label(ffa_data$`FFA_20_4`) <- "FA 20:4"
label(ffa_data$`FFA_20_5`) <- "FA 20:5"
label(ffa_data$`FFA_22_0`) <- "FA 22:0"
label(ffa_data$`FFA_22_1`) <- "FA 22:1"
label(ffa_data$`FFA_22_2`) <- "FA 22:2"
label(ffa_data$`FFA_22_3`) <- "FA 22:3"
label(ffa_data$`FFA_22_4`) <- "FA 22:4"
label(ffa_data$`FFA_22_5`) <- "FA 22:5"
label(ffa_data$`FFA_22_6`) <- "FA 22:6"
```
## Filter analytes - 50% rule
We use the following criteria 1 and 2 to select eligible analytes having reliable measurements.
* Criterion 1: Identify a specific type of analyte which has less than or equal to 50% (44 in count) patients who miss all 3 replicates of the analyte; we will keep this analyte in the dataset.
* Criterion 2: Identify a specific type of analyte which has less than or equal to 50% of patients whose inter-assay coefficient of variation (cv) are less than 0.2; we will keep this analyte in the dataset.
```{r eval=FALSE}
# Create a user-written function to select eligible lipids, using the frequency of patients who miss all 3 replicates of a lipid
select_ffa_OOR <- function(threshold_total_OOR) {
step = number_OOR = total_OOR = 0; type_OOR <- vector(length = 0)
for (type in c(2:length(ffa_data))) { # Loop over each ffa type
total_OOR = 0 # Clear total_OOR for next total_OOR
for (i in seq(from = 1, to = dim(ffa_data)[1], by = 3)) { # Loop over each group of triple replicates for a patient
while (step <= 2) {
if (ffa_data[i + step, type] == "OOR_L") {number_OOR = number_OOR + 1}
step = step + 1}
if (number_OOR == 3) {total_OOR <- total_OOR + 1}
step = number_OOR = 0 } # Clear step and number_OOR for next patient, under the same ffa column
if (total_OOR <= threshold_total_OOR) {type_OOR <- append(type_OOR, type)}}; return(type_OOR)} # Store eligible ffa in type_OOR
# Test the accuracy of the user-written function select_lipid()
# step = number_OOR = total_OOR = 0; type_OOR <- vector(length = 0)
# for (type in c(10:10)) { # Loop over the lipid 15:0:18:1 (d7) PC macro
# total_OOR = 0 # Clear total_OOR for next loop
# for (i in seq(from = 1, to = dim(ffa_data)[1], by = 3)) { # Loop over each group of triple replicates for a patient
# while (step <= 2) {
# if (ffa_data[i + step, type] == "OOR_L") {number_OOR = number_OOR + 1}
# step = step + 1}
# print(number_OOR)
# if (number_OOR == 3) {total_OOR <- total_OOR + 1}
# step = number_OOR = 0 } # Clear step and number_OOR for next patient, under the same lipid column
# print(total_OOR)
# if (total_OOR <= 44) {type_OOR <- append(type_OOR, type)}}
# Store the eligible lipid indices using the particular criterion
## Criterion 1: Identify a specific type of lipid which has less than or equal to 50% (44 in count) patients who miss all 3 replicates of the lipid; we will keep this lipid in the dataset.
ffa_50 <- select_ffa_OOR(threshold_total_OOR = 44)
## Create a analyte dataset containing sample id and eligible analytes
ffa_eligible <- ffa_data[, c(1, ffa_50)]
# Save eligible lipid data to disk
saveRDS(object = ffa_eligible, file = "ffa_eligible_50rule.RDS")
```
**Interpretation**:
* After filtering analytes using the criteria 1, 55 eligible analytes remain in our dataset.
```{r eval=FALSE}
# Transform OOR_L into NA
# Transform all character columns to numerical columns, except the column of sample id
library("naniar")
oor_fun <- function(dataset) {
newdata <- get(dataset) %>%
replace_with_na_all(condition = ~.x == "OOR_L") %>%
mutate(across(-c("Sample ID"), function(x) as.numeric(x))) %>%
mutate(across(-c("Sample ID"), function(x) x + 0.5))}
ffa_eligible <- oor_fun(dataset = "ffa_eligible")
# Save cleaned eligible lipid data to disk
saveRDS(object = ffa_eligible, file = "ffa_eligible.RDS")
```
## Imputation
```{r}
# Impute the missing portion of each lipid using the minimum value
ffa_eligible_MM <- ffa_eligible %>%
mutate(across(-c("Sample ID"), function(x) ifelse(is.na(x), min(x, na.rm = T), x)))
# Perform the log transform for normalization
ffa_eligible_MM[, -1] <- log(ffa_eligible_MM[, -1])
```
## Coefficient of variation
```{r}
# Calculate the coefficient of variation (CV, sample standard deviation / sample mean)
ffa_data_cv <- as.data.frame(matrix(data = NA,
nrow = 88,
ncol = ncol(ffa_eligible_MM)))
colnames(ffa_data_cv) <- label(ffa_data)
patient_index = 0
for (type in c(2:length(ffa_eligible_MM))) { # Loop over each ffa type
patient_index = 0 # Clear patient index for next ffa
for (i in seq(from = 1, to = dim(ffa_eligible_MM)[1], by = 3)) { # Loop over each patient
patient_index <- patient_index + 1
ffa_data_cv[patient_index, type] = sd(c(pull(ffa_eligible_MM[i, type])[1],
pull(ffa_eligible_MM[i + 1, type])[1],
pull(ffa_eligible_MM[i + 2, type])[1]))/
mean(c(pull(ffa_eligible_MM[i, type])[1],
pull(ffa_eligible_MM[i + 1, type])[1],
pull(ffa_eligible_MM[i + 2, type]))[1])
}
}
ffa_data_cv <- ffa_data_cv %>%
mutate(`Sample ID` = unique(ffa_eligible_MM$`Sample ID`))
# Show the CV for each ffa species across a total of 88 patients
ffa_mean_cv <- ffa_data_cv %>%
mutate(across(-c("Sample ID"), abs)) %>%
summarise(across(-c("Sample ID"), list(mean_cv = mean))) %>%
pivot_longer(cols = everything(), names_to = "Free Fatty Acids and Bile Acids", values_to = "Mean CV")
```
### CV for all lipid analytes
```{r}
# Make a line plot (JCI supplementary figure 3)
## Merge lipid_mean_cv with ffa_mean_cv
colnames(ffa_mean_cv)[1] <- "Lipid Species"
mean_cv_all <- rbind(lipid_mean_cv, ffa_mean_cv)
# Count the frequency of each lipid class
# str_starts(mean_cv_all$`Lipid Species`, "LPC", negate = FALSE)
mean_cv_all$Group <- c(rep("SIL standard", 5),
rep("LPC", 9),
rep("LPE", 10),
rep("PC", 18),
rep("SM", 12),
rep("SIL standard", 7),
"LPA",
rep("LPC", 4),
rep("LPE", 9),
rep("LPI", 5),
rep("PA", 5),
rep("PC", 22),
rep("PE", 17),
"PG",
rep("PI", 10),
rep("PS", 4),
rep("SPH", 2),
rep("S1P", 1),
rep("BA", 3),
rep("FA", 36),
rep("BA", 16))
mean_cv_all$`Lipid Species` <- c(colnames(lipid_data_cv)[-1], colnames(ffa_data_cv)[-1]) # Reassign the species names
# Set the pallete
library(RColorBrewer)
getPalette <- colorRampPalette(brewer.pal(16, "Paired"))
mean_cv_all_sort <- mean_cv_all %>%
group_by(Group) %>%
arrange("Lipid Sepcies", Group)
mean_cv_plot <- mean_cv_all_sort %>%
ggplot(aes(x = factor(`Lipid Species`, levels = `Lipid Species`), y = `Mean CV`, group = Group)) + # Data Science I Lab 6
geom_line(aes(color = Group)) +
geom_point(aes(color = Group)) +
geom_hline(yintercept = 0.2, linetype = "dashed", color = "red3") +
scale_y_continuous(breaks = c(0, 0.2, 0.5, 1.0), labels = c("0.0","0.2","0.5","1.0")) +
scale_color_manual(values = getPalette(length(unique(mean_cv_all$Group)))) +
theme_bw() +
theme(legend.position = "bottom",
panel.grid.major = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(size = 5, angle = 90, hjust = 1, color = "black"),
axis.text.y = element_text(color = c("black", "red3", "black", "black"))) +
labs(color = "Lipid Class",
x = "Lipid Analyte",
y = "Mean Coefficient of Variation")
mean_cv_plot
ggsave("mean_cv_plot_MM_Oct1.jpeg", width = 32, height = 15, units = "cm")
(197-14 + 1)/197
(197 - 3)/197
```
**Interpretation**:
* 93% of the lipid analyte had mean CVs of less than 10%.
* 98% of the lipid analyte had CVs of less than 20%.
## Further steps
```{r eval=FALSE}
# Extract lipid species labels
ffa_label <- label(ffa_data)
# Take the average of 3 replicates for each type of lipids per patient
ffa_eligible_MM_final <- ffa_eligible_MM %>%
group_by(`Sample ID`) %>%
summarise_all(mean)
# Label the variables at once
names(ffa_label) <- NULL
label(ffa_eligible_MM_final) <- as.list(ffa_label)
# Merge the final ffa data with the final lipid data
lipid_ffa_MM_final <- merge(x = lipid_ffa_MM_final,
y = ffa_eligible_MM_final,
by.x = "Sample ID",
by.y = "Sample ID")
# Merge the final ffa data with the final lipid and clinical data
data_full <- merge(x = data_full,
y = ffa_eligible_MM_final,
by.x = "Sample ID",
by.y = "Sample ID")
# Save cleaned data to disk
saveRDS(object = lipid_ffa_MM_final, file = "lipid_ffa.RDS")
saveRDS(object = data_full, file = "lipid_clinical_ffa.RDS")
```
**Summary**: Using the criteria 1 and 2, we identify 55 types of free fatty acids and bile acies which have 1) $\leq$ 50% patients who miss all 3 replicates and 2) mean CV < 20%.
# Data preprocess for transcriptomics
## Import FPKM data
```{r}
# Load and wrangle FPKM dataset
FPKM_origin <- read_csv("RNASeqReport_Dec28_2018_FPKM.csv")
# Check the original number of patients in the FPKM dataset
ncol(FPKM_origin) - 3 # There are 93 patients
# Check the original number of genes in the FPKM dataset
nrow(FPKM_origin) # There are 20422 genes
# Clean FPKM dataset
FPKM_data <- FPKM_origin %>%
select(-c(`...1`, Entrez.ID)) %>%
t() %>% # Gene name as column names, and sample id as row names
janitor::row_to_names(., row_number = 1) %>% # Transform first-row values as column names
data.frame(check.names = FALSE) %>%
mutate(across(everything(), function(x) as.numeric(x)))
# Extract the FPKM sample id and check the updated number of patients
FPKM_sample_id <- rownames(FPKM_data)
# length(FPKM_sample_id) # There are 93 patients
# Find the common sample id between the FPKM dataset and lipidomics dataset
sample_id_intersect <- intersect(FPKM_sample_id, lipid_sample_id)
# Use the common sample id to subset the transcriptomics dataset
FPKM_data_2 <- FPKM_data[sample_id_intersect, ]
# Check the updated number of patients
nrow(FPKM_data_2) # There are 81 patients
# Save cleaned data to disk
saveRDS(object = FPKM_data_2, file = "FPKM_data_2.RDS")
```
* Now we have a total of 81 patients who have both eligible transcriptomics and lipid analytes information.
## Check the distribution of gene expression in FPKM
### First 9 genes
```{r}
# Create a batch histogram plotting function
histogram <- function(gene_name, dataset){
ggplot(data = get(dataset),
mapping = aes(x = get(gene_name))) +
geom_histogram(bins = 30, fill = "#B31B1B") +
theme_bw() +
labs(
y = "Count",
x = paste0(gene_name, " FPKM")) +
theme(plot.title = element_text(hjust = 0.5))
}
# Plot the distribution of gene expression levels of first 9 genes
gene_name <- colnames(FPKM_data_2) # Extract gene names
library(gridExtra)
plot_array <- lapply(gene_name[1:9], histogram, "FPKM_data_2")
grid.arrange(do.call("arrangeGrob", c(plot_array, ncol = 3)))
```
**Interpretation:** Since some gene expression levels in FPKM are skewed, we use $log_2(FPKM + 0.5)$ to normalize the gene expression levels.
```{r eval=FALSE}
# Log2 transform FPKM for each gene
FPKM_data_log <- log2(FPKM_data_2 + 0.5)
saveRDS(object = FPKM_data_log, file = "FPKM_data_log.RDS")
```
## Check the distribution of gene expression in log2(FPKM + 0.5)
### First 9 genes
```{r}
# Create a batch histogram plotting function
histogram <- function(gene_name, dataset){
ggplot(data = get(dataset),
mapping = aes(x = get(gene_name))) +
geom_histogram(bins = 30, fill = "#B31B1B") +
theme_bw() +
labs(
y = "Count",
x = paste0(gene_name, " FPKM-T")) +
theme(plot.title = element_text(hjust = 0.5))
}
# Plot the distribution of gene expression levels of first 9 genes
gene_name <- colnames(FPKM_data_log
) # Extract gene names
library(gridExtra)
plot_array <- lapply(gene_name[1:9], histogram, "FPKM_data_log")
grid.arrange(do.call("arrangeGrob", c(plot_array, ncol = 3)))
```
## Import count data [Use this]
```{r}
# Load and wrangle count dataset
Count_origin <- read_csv("RNASeqReport_Dec28_2018_Counts.csv") %>%
rename("gene" = "...1")
# Check the original number of patients in the Count dataset
ncol(Count_origin) - 1 # There are 93 patients
# Check the original number of genes in the Count dataset
nrow(Count_origin) # There are 25369 genes
# Clean Count dataset
Count_data <- Count_origin %>%
t() %>% # Gene name as column names, and sample id as row names
janitor::row_to_names(., row_number = 1) %>% # Transform first-row values as column names
data.frame(check.names = FALSE) %>%
mutate(across(everything(), function(x) as.numeric(x)))
# Extract the Count sample id and check the updated number of patients
Count_sample_id <- rownames(Count_data)
# length(Count_sample_id) # There are 93 patients
# Find the common sample id between the Count dataset and lipidomics dataset
sample_id_intersect_count <- intersect(Count_sample_id, lipid_sample_id)