## ---------------------------
##
## Script name: GetPriorKnowledge
##
## Purpose of script: Create gene-metabolite sets for pathway enrichment analysis.
##
## Author: Christina Schmidt, Denes Turei and Macabe Daley
##
## Date Created: 2024-01-21
##
## Copyright (c) Christina Schmidt
## Email:
##
## ---------------------------
##
## Notes:
##
##
## ---------------------------
##########################################################################################
### ### ### Translate IDs to/from KEGG, PubChem, Chebi, HMDB ### ### ###
##########################################################################################
#' Translate IDs to/from KEGG, PubChem, Chebi, HMDB
#'
#' @param InputData Dataframe with at least one column with the target (e.g. metabolite), you can add other columns such as source (e.g. term). Must be "long" DF, meaning one ID per row.
#' @param SettingsInfo \emph{Optional: } Named vector with the column names of the metabolite ID and the grouping variable in InputData. \strong{Default = c(InputID="MetaboliteID", GroupingVariable="term")}
#' @param From ID type that is present in your data. Choose between "kegg", "pubchem", "chebi", "hmdb". \strong{Default = "kegg"}
#' @param To One or multiple ID types to which you want to translate your data. Choose between "kegg", "pubchem", "chebi", "hmdb". \strong{Default = c("pubchem","chebi","hmdb")}
#' @param Summary \emph{Optional: } If TRUE, long summary tables are created. \strong{Default = FALSE}
#' @param SaveAs_Table \emph{Optional: } File types for the analysis results are: "csv", "xlsx", "txt". \strong{Default = "csv"}
#' @param FolderPath \emph{Optional: } Path to the folder the results should be saved at. \strong{Default = NULL}
#'
#' @return List with at least three DFs: 1) Original data and the new column(s) of translated IDs separated by comma. 2) Mapping information between the original ID and the translated IDs. 3) Mapping summary between the original ID and the translated IDs.
#'
#' @examples
#' KEGG_Pathways <- MetaProViz::LoadKEGG()
#' Res <- MetaProViz::TranslateID(InputData= KEGG_Pathways, SettingsInfo = c(InputID="MetaboliteID", GroupingVariable="term"), From = c("kegg"), To = c("pubchem", "hmdb"))
#'
#' @keywords Translate metabolite IDs
#'
#' @importFrom dplyr mutate select group_by ungroup distinct filter across n
#' @importFrom tidyselect all_of starts_with
#' @importFrom rlang !!! !! := sym syms
#' @importFrom OmnipathR id_types translate_ids
#' @importFrom logger log_warn
#' @importFrom stringr str_to_lower
#' @importFrom purrr map_chr
#'
#' @export
#'
TranslateID <- function(InputData,
SettingsInfo = c(InputID="MetaboliteID", GroupingVariable="term"),
From = "kegg",
To = c("pubchem","chebi","hmdb"),
Summary=FALSE,
SaveAs_Table= "csv",
FolderPath=NULL
){# Add ability to also get metabolite names that are human readable from an ID type!
MetaProViz_Init()
## ------------------ Check Input ------------------- ##
# HelperFunction `CheckInput`
CheckInput(InputData=InputData,
InputData_Num=FALSE,
SaveAs_Table=SaveAs_Table)
# Specific checks:
if("InputID" %in% names(SettingsInfo)){
if(SettingsInfo[["InputID"]] %in% colnames(InputData)== FALSE){
message <- paste0("The ", SettingsInfo[["InputID"]], " column selected as InputID in SettingsInfo was not found in InputData. Please check your input.")
logger::log_trace(paste("Error ", message, sep=""))
stop(message)
}
}
if("GroupingVariable" %in% names(SettingsInfo)){
if(SettingsInfo[["GroupingVariable"]] %in% colnames(InputData)== FALSE){
message <- paste0("The ", SettingsInfo[["GroupingVariable"]], " column selected as GroupingVariable in SettingsInfo was not found in InputData. Please check your input.")
logger::log_trace(paste("Error ", message, sep=""))
stop(message)
}
}
if(is.logical(Summary) == FALSE){
message <- paste0("Check input. The Summary parameter should be either =TRUE or =FALSE.")
logger::log_trace(paste("Error ", message, sep=""))
stop(message)
}
unknown_types <- OmnipathR::id_types() %>%
dplyr::select(tidyselect::starts_with('in_')) %>%
unlist %>%
unique %>%
str_to_lower %>%
setdiff(union(From, To), .)
if (length(unknown_types) > 0L) {
msg <- sprintf('The following ID types are not recognized: %s', paste(unknown_types, collapse = ', '))
logger::log_warn(msg)
warning(msg)
}
# Check that SettingsInfo[['InputID']] has no duplicates within one group --> should not be the case --> warn the user, who may have forgotten to set the grouping column
doublons <- InputData %>%
dplyr::filter(!is.na(!!sym(SettingsInfo[['InputID']]))) %>%
dplyr::group_by(!!sym(SettingsInfo[['InputID']]), !!sym(SettingsInfo[['GroupingVariable']]))%>%
dplyr::filter(dplyr::n() > 1) %>%
dplyr::summarize()
if(nrow(doublons) > 0){
message <- sprintf(
'The following IDs are duplicated within one group: %s',
paste(doublons %>% dplyr::pull(SettingsInfo[['InputID']]), collapse = ', ')
)
logger::log_warn(message)
warning(message)
}
## ------------------ Create output folders and path ------------------- ##
if(is.null(SaveAs_Table)==FALSE ){
Folder <- SavePath(FolderName= "PriorKnowledge",
FolderPath=FolderPath)
SubFolder <- file.path(Folder, "ID_Translation")
if (!dir.exists(SubFolder)) {dir.create(SubFolder)}
}
########################################################################################################################################################
## ------------------ Translate To-From for each pair ------------------- ##
TranslatedDF <- OmnipathR::translate_ids(
InputData,
!!sym(SettingsInfo[['InputID']]) := !!sym(From),
!!!syms(To),#list of symbols, hence three !!!
ramp = TRUE,
expand = FALSE,
quantify_ambiguity = TRUE,
qualify_ambiguity = TRUE,
ambiguity_groups = SettingsInfo[['GroupingVariable']],#Checks within the groups, without it checks across groups
ambiguity_summary = TRUE
)
#TranslatedDF %>% attributes %>% names
#TranslatedDF%>% attr('ambiguity_MetaboliteID_hmdb')
## --------------- Create output DF -------------------- ##
ResList <- list()
## Create DF for TranslatedIDs only with the original data and the translatedID columns
DF_subset <- TranslatedDF %>%
dplyr::select(tidyselect::all_of(intersect(names(.), names(InputData))), tidyselect::all_of(To)) %>%
dplyr::mutate(across(all_of(To), ~ map_chr(., ~ paste(unique(.), collapse = ", ")))) %>%
dplyr::group_by(!!sym(SettingsInfo[['InputID']]), !!sym(SettingsInfo[['GroupingVariable']])) %>%
dplyr::mutate(across(tidyselect::all_of(To), ~ paste(unique(.), collapse = ", "), .names = "{.col}")) %>%
dplyr::ungroup() %>%
dplyr::distinct() %>%
dplyr::mutate(dplyr::across(tidyselect::all_of(To), ~ ifelse(. == "0", NA, .)))
ResList[["TranslatedDF"]] <- DF_subset
## Add DF with mapping information
ResList[["TranslatedDF_MappingInfo"]] <- TranslatedDF
## Also save the different mapping summaries!
for(item in To){
SummaryDF <- TranslatedDF%>% attr(paste0("ambiguity_", SettingsInfo[['InputID']], "_", item, sep=""))
ResList[[paste0("MappingSummary_", item, sep="")]] <- SummaryDF
}
## Create the long DF summary if Summary =TRUE
if(Summary==TRUE){
for(item in To){
Summary <- MetaProViz::MappingAmbiguity(InputData= TranslatedDF,
From = SettingsInfo[['InputID']],
To = item,
GroupingVariable = SettingsInfo[['GroupingVariable']],
Summary=TRUE)[["Summary"]]
ResList[[paste0("MappingSummary_Long_", From, "-to-", item, sep="")]] <- Summary
}
}
## ------------------ Save the results ------------------- ##
suppressMessages(suppressWarnings(
SaveRes(InputList_DF=ResList,
InputList_Plot= NULL,
SaveAs_Table=SaveAs_Table,
SaveAs_Plot=NULL,
FolderPath= SubFolder,
FileName= "TranslateID",
CoRe=FALSE,
PrintPlot=FALSE)))
#Return
return(invisible(ResList))
}
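## Illustrative usage sketch (comments only, not executed): how the returned list can be
## accessed, based on the roxygen example above. Element names follow the code above;
## "MappingSummary_hmdb" exists only if "hmdb" is part of `To`.
# KEGG_Pathways <- MetaProViz::LoadKEGG()
# Res <- MetaProViz::TranslateID(InputData = KEGG_Pathways,
#                                SettingsInfo = c(InputID = "MetaboliteID", GroupingVariable = "term"),
#                                From = "kegg", To = c("pubchem", "hmdb"))
# Res[["TranslatedDF"]]             # original columns plus comma-separated translated IDs
# Res[["TranslatedDF_MappingInfo"]] # full mapping output from OmnipathR::translate_ids
# Res[["MappingSummary_hmdb"]]      # ambiguity summary for the hmdb translation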
##########################################################################################
### ### ### Find additional potential IDs ### ### ###
##########################################################################################
#' Find additional potential IDs for "kegg", "pubchem", "chebi", "hmdb"
#'
#' @param InputData Dataframe with at least one column with the detected metabolite IDs (one ID per row).
#' @param SettingsInfo \emph{Optional: } Column name of the metabolite IDs in InputData. \strong{Default = c(InputID="MetaboliteID")}
#' @param From ID type that is present in your data. Choose between "kegg", "pubchem", "chebi", "hmdb". \strong{Default = "hmdb"}
#' @param SaveAs_Table \emph{Optional: } File types for the analysis results are: "csv", "xlsx", "txt". \strong{Default = "csv"}
#' @param FolderPath \emph{Optional: } Path to the folder the results should be saved at. \strong{Default = NULL}
#'
#' @return Input DF with additional column including potential additional IDs.
#'
#' @examples
#' DetectedIDs <- MetaProViz::ToyData(Data="Cells_MetaData")%>% tibble::rownames_to_column("TrivialName")%>%tidyr::drop_na()
#' Res <- MetaProViz::EquivalentIDs(InputData= DetectedIDs, SettingsInfo = c(InputID="HMDB"), From = "hmdb")
#'
#' @keywords Find potential additional IDs for one metabolite identifier
#'
#' @importFrom dplyr mutate select group_by ungroup distinct filter across rowwise case_when
#' @importFrom tidyr separate_rows unnest
#' @importFrom purrr map_chr
#' @importFrom tidyselect all_of starts_with
#' @importFrom rlang !!! !! := sym syms
#' @importFrom OmnipathR id_types translate_ids
#' @importFrom logger log_warn log_trace
#' @importFrom stringr str_to_lower str_split
#' @export
EquivalentIDs <- function(InputData,
SettingsInfo = c(InputID="MetaboliteID"),
From = "hmdb",
SaveAs_Table= "csv",
FolderPath=NULL){
# FUTURE: Once the structural similarity tool is available in OmniPath, we can start extending this function!
### 1)
# Check measured IDs in prior knowledge
### 2)
# A user has one HMDB ID per measured metabolite (one ID per measured peak) --> this is often the case because the user either received a trivial name and searched for the ID themselves, or because the facility only provides one ID at random
# We have mapped the HMDB IDs to the pathways and 20 do not map
# We want to check whether this is because the pathways do not include them, or because the user picked the wrong ID by chance (e.g. they picked D-Alanine, but the prior knowledge includes L-Alanine)
# Do this by using structural information via the structural DB in OmniPath!
# Output is a DF with the original ID column and a new column with additional possible IDs based on structure
# Is it possible to do this at the moment without structures, but by using other prior knowledge?
MetaProViz_Init()
## ------------------ Check Input ------------------- ##
# HelperFunction `CheckInput`
CheckInput(InputData=InputData,
InputData_Num=FALSE,
SaveAs_Table=SaveAs_Table)
# Specific checks:
if("InputID" %in% names(SettingsInfo)){
if(SettingsInfo[["InputID"]] %in% colnames(InputData)== FALSE){
message <- paste0("The ", SettingsInfo[["InputID"]], " column selected as InputID in SettingsInfo was not found in InputData. Please check your input.")
logger::log_trace(paste("Error ", message, sep=""))
stop(message)
}
}
unknown_types <- OmnipathR::id_types() %>%
dplyr::select(tidyselect::starts_with('in_')) %>%
unlist %>%
unique %>%
str_to_lower %>%
setdiff(From, .)
if (length(unknown_types) > 0L) {
msg <- sprintf('The following ID types are not recognized: %s', paste(unknown_types, collapse = ', '))
logger::log_warn(msg)
warning(msg)
}
# Check that SettingsInfo[['InputID']] has no duplicates --> should not be the case --> remove duplicates and inform the user, who may have forgotten to set a grouping column
doublons <- InputData[duplicated(InputData[[SettingsInfo[['InputID']]]]), ]
if(nrow(doublons) > 0){
InputData <- InputData %>%
dplyr::distinct(!!sym(SettingsInfo[['InputID']]), .keep_all = TRUE)
message <- sprintf('The following IDs are duplicated and removed: %s',paste(doublons[[SettingsInfo[['InputID']]]], collapse = ', '))
logger::log_warn(message)
warning(message)
}
## ------------------ Create output folders and path ------------------- ##
if(is.null(SaveAs_Table)==FALSE ){
Folder <- SavePath(FolderName= "PriorKnowledge",
FolderPath=FolderPath)
SubFolder <- file.path(Folder, "EquivalentIDs")
if (!dir.exists(SubFolder)) {dir.create(SubFolder)}
}
## ------------------ Set the ID type for To ----------------- ##
To <- case_when(
From == "chebi" ~ "pubchem", # If From is "chebi", use "pubchem" as the secondary ID type
TRUE ~ "chebi" # Otherwise, use "chebi" as the secondary ID type
)
message <- paste0(To, " is used to find additional potential IDs for ", From, ".", sep="")
logger::log_trace(message)
message(message)
## ------------------ Load manual table ----------------- ##
if((From == "kegg") == FALSE){
EquivalentFeatures <- MetaProViz::ToyData("EquivalentFeatures")%>%
dplyr::select(tidyselect::all_of(From))
}
## ------------------ Translate From-to-To ------------------- ##
TranslatedDF <- OmnipathR::translate_ids(
InputData,
!!sym(SettingsInfo[['InputID']]) := !!sym(From),
!!!syms(To),#list of symbols, hence three !!!
ramp = TRUE,
expand = FALSE,
quantify_ambiguity =FALSE,
qualify_ambiguity = TRUE, # Can not be set to FALSE!
ambiguity_groups = NULL,#Checks within the groups, without it checks across groups
ambiguity_summary = FALSE
)%>%
dplyr::select(tidyselect::all_of(intersect(names(.), names(InputData))), tidyselect::all_of(To)) %>%
dplyr::mutate(across(all_of(To), ~ purrr::map_chr(., ~ paste(unique(.), collapse = ", ")))) %>%
dplyr::group_by(!!sym(SettingsInfo[['InputID']])) %>%
dplyr::mutate(across(tidyselect::all_of(To), ~ paste(unique(.), collapse = ", "), .names = "{.col}")) %>%
dplyr::ungroup() %>%
dplyr::distinct() %>%
dplyr::mutate(dplyr::across(tidyselect::all_of(To), ~ ifelse(. == "0", NA, .)))
## ------------------ Translate To-to-From ------------------- ##
TranslatedDF_Long <- TranslatedDF%>%
dplyr::select(!!sym(SettingsInfo[['InputID']]), !!sym(To))%>%
dplyr::rename("InputID" = !!sym(SettingsInfo[['InputID']]))%>%
tidyr::separate_rows(!!sym(To), sep = ", ") %>%
dplyr::mutate(across(all_of(To), ~trimws(.))) %>% # Remove extra spaces
dplyr::filter(!!sym(To) != "") # Remove empty entries
OtherIDs <- OmnipathR::translate_ids(
TranslatedDF_Long ,
!!sym(To),
!!sym(From),# single symbol, hence !!
ramp = TRUE,
expand = FALSE,
quantify_ambiguity =FALSE,
qualify_ambiguity = TRUE, # Can not be set to FALSE!
ambiguity_groups = NULL,#Checks within the groups, without it checks across groups
ambiguity_summary = FALSE
)%>%
dplyr::select("InputID", !!sym(To), !!sym(From))%>%
dplyr::distinct(InputID, !!sym(From), .keep_all = TRUE) %>% # Remove duplicates based on InputID and From
dplyr::mutate(AdditionalID = dplyr::if_else(InputID == !!sym(From), FALSE, TRUE)) %>%
dplyr::select("InputID",!!sym(From), "AdditionalID")%>%
dplyr::filter(AdditionalID == TRUE) %>%
dplyr::mutate(across(all_of(From), ~ purrr::map_chr(., ~ paste(unique(.), collapse = ", "))))%>%
dplyr::rowwise() %>%
dplyr::mutate(
FromList = list(stringr::str_split(!!sym(From), ",\\s*")[[1]]), # Wrap in list
SameAsInput = ifelse(any(FromList == InputID), InputID, NA_character_), # Match InputID
PotentialAdditionalIDs = paste(FromList[FromList != InputID], collapse = ", ") # Combine other IDs
) %>%
dplyr::ungroup() %>%
dplyr::select(InputID, PotentialAdditionalIDs, !!sym(From))%>% # Final selection
dplyr::rename(AllIDs = !!sym(From))
## ------------------ Merge to Input ------------------- ##
OtherIDs <- merge(InputData, OtherIDs, by.x= SettingsInfo[['InputID']] , by.y= "InputID", all.x=TRUE)
##------------------- Add additional IDs -------------- ##
if (exists("EquivalentFeatures")) {
EquivalentFeatures$AllIDs <- EquivalentFeatures[[From]]
EquivalentFeatures_Long <- EquivalentFeatures %>%
separate_rows(!!sym(From), sep = ",")
OtherIDs <- merge(OtherIDs, EquivalentFeatures_Long, by.x= SettingsInfo[['InputID']] , by.y= From, all.x=TRUE)%>%
rowwise() %>%
mutate(AllIDs = paste(unique(na.omit(unlist(stringr::str_split(paste(na.omit(c(AllIDs.x, AllIDs.y)), collapse = ","), ",\\s*")))), collapse = ",")) %>%
ungroup()%>%
rowwise() %>%
mutate(
PotentialAdditionalIDs = paste(
setdiff(
unlist(stringr::str_split(AllIDs, ",\\s*")), # Split merged_column into individual IDs
as.character(!!sym(SettingsInfo[['InputID']])) # The input ID itself, to be excluded
),
collapse = ", " # Combine the remaining IDs back into a comma-separated string
)
) %>%
ungroup()%>%
select(-AllIDs.x, -AllIDs.y)
}
## ------------------ Create Output ------------------- ##
OutputDF <- OtherIDs
## ------------------ Save the results ------------------- ##
ResList <- list("EquivalentIDs" = OutputDF)
suppressMessages(suppressWarnings(
SaveRes(InputList_DF=ResList,
InputList_Plot= NULL,
SaveAs_Table=SaveAs_Table,
SaveAs_Plot=NULL,
FolderPath= SubFolder,
FileName= "EquivalentIDs",
CoRe=FALSE,
PrintPlot=FALSE)))
return(invisible(OutputDF))
}
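## Illustrative usage sketch (comments only, not executed): based on the roxygen example
## above. The output keeps the InputData columns and adds "PotentialAdditionalIDs" and
## "AllIDs" (see the column construction above).
# DetectedIDs <- MetaProViz::ToyData(Data="Cells_MetaData") %>%
#   tibble::rownames_to_column("TrivialName") %>% tidyr::drop_na()
# Res <- MetaProViz::EquivalentIDs(InputData = DetectedIDs,
#                                  SettingsInfo = c(InputID = "HMDB"), From = "hmdb")
# Res %>% dplyr::select(HMDB, PotentialAdditionalIDs, AllIDs)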
##########################################################################################
### ### ### Mapping Ambiguity ### ### ###
##########################################################################################
#' Create Mapping Ambiguities between two ID types
#'
#' @param InputData Translated DF from the MetaProViz::TranslateID results, or a dataframe with at least one column with the target metabolite ID and another metabolite ID type. One of the ID columns may only contain one ID per row; the other can contain IDs either separated by comma or as a list. Optional: add other columns such as source (e.g. term).
#' @param From Column name of the original metabolite identifier in InputData. This column should contain one ID per row.
#' @param To Column name of the secondary or translated metabolite identifier in InputData. This column can contain multiple IDs per row, either separated by comma ", " or as a list of IDs.
#' @param GroupingVariable \emph{Optional: } If NULL, no groups are used. Otherwise, provide the column name in InputData containing the grouping variable used to group features. \strong{Default = NULL}
#' @param Summary \emph{Optional: } If TRUE, long summary tables are created. \strong{Default = FALSE}
#' @param SaveAs_Table \emph{Optional: } File types for the analysis results are: "csv", "xlsx", "txt". \strong{Default = "csv"}
#' @param FolderPath \emph{Optional: } Path to the folder the results should be saved at. \strong{Default = NULL}
#'
#' @return List with at least four DFs: 1-3) From-to-To: mapping issues, mapping summary and long summary (if Summary = TRUE); 4-6) To-to-From: mapping issues, mapping summary and long summary (if Summary = TRUE); 7) combined summary table (if Summary = TRUE).
#'
#' @examples
#' KEGG_Pathways <- MetaProViz::LoadKEGG()
#' InputDF <- MetaProViz::TranslateID(InputData= KEGG_Pathways, SettingsInfo = c(InputID="MetaboliteID", GroupingVariable="term"), From = c("kegg"), To = c("pubchem"))[["TranslatedDF"]]
#' Res <- MetaProViz::MappingAmbiguity(InputData= InputDF, From = "MetaboliteID", To = "pubchem", GroupingVariable = "term", Summary=TRUE)
#'
#' @keywords Mapping ambiguity
#'
#' @importFrom dplyr mutate bind_cols bind_rows filter group_by ungroup distinct case_when n
#' @importFrom tidyr unnest unite separate_rows separate replace_na
#' @importFrom tidyselect all_of
#' @importFrom rlang !!! !! := sym syms
#' @importFrom OmnipathR ambiguity
#'
#' @export
#'
MappingAmbiguity <- function(InputData,
From,
To,
GroupingVariable = NULL,
Summary=FALSE,
SaveAs_Table= "csv",
FolderPath=NULL
) {
MetaProViz_Init()
## ------------------ Check Input ------------------- ##
# HelperFunction `CheckInput`
CheckInput(InputData=InputData,
InputData_Num=FALSE,
SaveAs_Table=SaveAs_Table)
# Specific checks:
if(From %in% colnames(InputData)== FALSE){
message <- paste0(From, " column was not found in InputData. Please check your input.")
logger::log_trace(paste("Error ", message, sep=""))
stop(message)
}
if(To %in% colnames(InputData)== FALSE){
message <- paste0(To, " column was not found in InputData. Please check your input.")
logger::log_trace(paste("Error ", message, sep=""))
stop(message)
}
if(is.null(GroupingVariable)==FALSE){
if(GroupingVariable %in% colnames(InputData)== FALSE){
message <- paste0(GroupingVariable, " column was not found in InputData. Please check your input.")
logger::log_trace(paste("Error ", message, sep=""))
stop(message)
}
}
if(is.logical(Summary) == FALSE){
message <- paste0("Check input. The Summary parameter should be either =TRUE or =FALSE.")
logger::log_trace(paste("Error ", message, sep=""))
stop(message)
}
## ------------------ General checks of wrong occurrences ------------------- ##
# Task 1: Check that From has no duplicates within one group --> should not be the case --> remove duplicates and inform the user, who may have forgotten to set the grouping column
# Task 2: Check that From maps to the same items in To across the different entries (these would be in different groupings, otherwise there should not be any duplicates) --> list of mis-mappings across terms
# FYI: The above cannot happen if our TranslateID function was used, but may be the case when the user has done something manually beforehand
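# Minimal sketch of Task 1 (illustrative only, kept as comments and not executed; it reuses
# the duplicate check pattern from TranslateID and assumes GroupingVariable is not NULL):
# duplicated_from <- InputData %>%
#   dplyr::filter(!is.na(!!sym(From))) %>%
#   dplyr::group_by(!!sym(From), !!sym(GroupingVariable)) %>%
#   dplyr::filter(dplyr::n() > 1) %>%
#   dplyr::ungroup()
# if(nrow(duplicated_from) > 0){
#   logger::log_warn("Duplicated From IDs within one group detected. Please check your input.")
# }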
## ------------------ Create output folders and path ------------------- ##
if(is.null(SaveAs_Table)==FALSE ){
Folder <- SavePath(FolderName= "PriorKnowledge",
FolderPath=FolderPath)
SubFolder <- file.path(Folder, "MappingAmbiguities")
if (!dir.exists(SubFolder)) {dir.create(SubFolder)}
}
#####################################################################################################################################################################################
## ------------------ Prepare Input data ------------------- ##
#If the user provides a DF where the To column is a list of IDs, then we can use it right away
#If the To column is not a list of IDs, but a character column, we need to convert it into a list of IDs
if(is.character(InputData[[To]])==TRUE){
InputData[[To]] <- InputData[[To]]%>%
strsplit(", ")%>%
lapply(as.character)
}
## ------------------ Perform ambiguity mapping ------------------- ##
#1. From-to-To: OriginalID-to-TranslatedID
#2. From-to-To: TranslatedID-to-OriginalID
Comp <- list(
list(From = From, To = To),
list(From = To, To = From)
)
ResList <- list()
for(comp in seq_along(Comp)){
#Run Omnipath ambiguity
ResList[[paste0(Comp[[comp]]$From, "-to-", Comp[[comp]]$To , sep="")]] <- InputData %>%
tidyr::unnest(cols = all_of(Comp[[comp]]$From))%>% # unlist the columns in case they are not expanded
filter(!is.na(!!sym(Comp[[comp]]$From)))%>%#Remove NA values, otherwise they are counted as column is character
OmnipathR::ambiguity(
from_col = !!sym(Comp[[comp]]$From),
to_col = !!sym(Comp[[comp]]$To),
groups = GroupingVariable,
quantify = TRUE,
qualify = TRUE,
global = TRUE,#across groups will be done additionally --> suffix _AcrossGroup
summary=TRUE, #summary of the mapping column
expand=TRUE)
#Extract summary table:
ResList[[paste0(Comp[[comp]]$From, "-to-", Comp[[comp]]$To, "_Summary", sep="")]] <-
ResList[[paste0(Comp[[comp]]$From, "-to-", Comp[[comp]]$To , sep="")]]%>%
attr(paste0("ambiguity_", Comp[[comp]]$From , "_",Comp[[comp]]$To, sep=""))
############################################################################################################
if(Summary==TRUE){
if(is.null(GroupingVariable)==FALSE){
# Add further information we need to summarise the table and combine Original-to-Translated and Translated-to-Original
# If we have a GroupingVariable we need to combine it with the MetaboliteID before merging
ResList[[paste0(Comp[[comp]]$From, "-to-", Comp[[comp]]$To, "_Long", sep="")]] <- ResList[[paste0(Comp[[comp]]$From, "-to-", Comp[[comp]]$To , sep="")]]%>%
tidyr::unnest(cols = all_of(Comp[[comp]]$From))%>%
mutate(!!sym(paste0("AcrossGroupMappingIssue(", Comp[[comp]]$From, "_to_", Comp[[comp]]$To, ")", sep="")) := case_when(
!!sym(paste0(Comp[[comp]]$From, "_", Comp[[comp]]$To, "_ambiguity_bygroup", sep="")) != !!sym(paste0(Comp[[comp]]$From, "_", Comp[[comp]]$To, "_ambiguity", sep="")) ~ "TRUE",
TRUE ~ "FALSE" ))%>%
group_by(!!sym(Comp[[comp]]$From), !!sym(GroupingVariable))%>%
mutate(!!sym(Comp[[comp]]$To) := ifelse(!!sym(Comp[[comp]]$From) == 0, NA, # Or another placeholder
paste(unique(!!sym(Comp[[comp]]$To)), collapse = ", ")
)) %>%
mutate( !!sym(paste0("Count(", Comp[[comp]]$From, "_to_", Comp[[comp]]$To, ")")) := ifelse(all(!!sym(Comp[[comp]]$To) == 0), 0, n()))%>%
ungroup()%>%
distinct() %>%
unite(!!sym(paste0(Comp[[comp]]$From, "_to_", Comp[[comp]]$To)), c(Comp[[comp]]$From, Comp[[comp]]$To), sep=" --> ", remove=FALSE)%>%
separate_rows(!!sym(Comp[[comp]]$To), sep = ", ") %>%
unite(UniqueID, c(From, To, GroupingVariable), sep="_", remove=FALSE)%>%
distinct()
}else{
ResList[[paste0(Comp[[comp]]$From, "-to-", Comp[[comp]]$To, "_Long", sep="")]] <- ResList[[paste0(Comp[[comp]]$From, "-to-", Comp[[comp]]$To , sep="")]]%>%
tidyr::unnest(cols = all_of(Comp[[comp]]$From))%>%
group_by(!!sym(Comp[[comp]]$From))%>%
mutate(!!sym(Comp[[comp]]$To) := ifelse(!!sym(Comp[[comp]]$From) == 0, NA, # Or another placeholder
paste(unique(!!sym(Comp[[comp]]$To)), collapse = ", ")
)) %>%
mutate( !!sym(paste0("Count(", Comp[[comp]]$From, "_to_", Comp[[comp]]$To, ")")) := ifelse(all(!!sym(Comp[[comp]]$To) == 0), 0, n()))%>%
ungroup()%>%
distinct() %>%
unite(!!sym(paste0(Comp[[comp]]$From, "_to_", Comp[[comp]]$To)), c(Comp[[comp]]$From, Comp[[comp]]$To), sep=" --> ", remove=FALSE)%>%
separate_rows(!!sym(Comp[[comp]]$To), sep = ", ") %>%
unite(UniqueID, c(From, To), sep="_", remove=FALSE)%>%
distinct()%>%
mutate(!!sym(paste0("AcrossGroupMappingIssue(", Comp[[comp]]$From, "_to_", Comp[[comp]]$To, ")", sep="")) := NA)
}
}
# Add NA metabolite maps back if they do exist:
Removed <- InputData %>%
tidyr::unnest(cols = all_of(Comp[[comp]]$From))%>% # unlist the columns in case they are not expanded
filter(is.na(!!sym(Comp[[comp]]$From)))
if(nrow(Removed)>0){
ResList[[paste0(Comp[[comp]]$From, "-to-", Comp[[comp]]$To , sep="")]] <- dplyr::bind_rows(ResList[[paste0(Comp[[comp]]$From, "-to-", Comp[[comp]]$To , sep="")]],
Removed%>%
dplyr::bind_cols(setNames(as.list(rep(NA, length(setdiff(names(ResList[[paste0(Comp[[comp]]$From, "-to-", Comp[[comp]]$To , sep="")]]), names(Removed))))),
setdiff(names(ResList[[paste0(Comp[[comp]]$From, "-to-", Comp[[comp]]$To , sep="")]]), names(Removed))))
)
}
}
## ------------------ Create SummaryTable ------------------- ##
if(Summary==TRUE){
# Combine the two tables
Summary <- merge(x= ResList[[paste0(From, "-to-", To, "_Long", sep="")]][,c("UniqueID", paste0(From, "_to_", To), paste0("Count(", From, "_to_", To, ")"), paste0("AcrossGroupMappingIssue(", From, "_to_", To, ")", sep=""))],
y= ResList[[paste0(To, "-to-", From, "_Long", sep="")]][,c("UniqueID", paste0(To, "_to_", From), paste0("Count(", To, "_to_", From, ")"), paste0("AcrossGroupMappingIssue(", To, "_to_", From, ")", sep=""))],
by = "UniqueID",
all = TRUE)%>%
separate(UniqueID, into = c(From, To, GroupingVariable), sep="_", remove=FALSE)%>%
distinct()
# Add relevant mapping information
Summary <- Summary %>%
mutate(Mapping = case_when(
!!sym(paste0("Count(", From, "_to_", To, ")")) == 1 & !!sym(paste0("Count(", To, "_to_", From, ")")) == 1 ~ "one-to-one",
!!sym(paste0("Count(", From, "_to_", To, ")")) > 1 & !!sym(paste0("Count(", To, "_to_", From, ")")) == 1 ~ "one-to-many",
!!sym(paste0("Count(", From, "_to_", To, ")")) > 1 & !!sym(paste0("Count(", To, "_to_", From, ")")) > 1 ~ "many-to-many",
!!sym(paste0("Count(", From, "_to_", To, ")")) == 1 & !!sym(paste0("Count(", To, "_to_", From, ")")) > 1 ~ "many-to-one",
!!sym(paste0("Count(", From, "_to_", To, ")")) >= 1 & is.na(!!sym(paste0("Count(", To, "_to_", From, ")"))) ~ "one-to-none",
is.na(!!sym(paste0("Count(", From, "_to_", To, ")"))) & !!sym(paste0("Count(", To, "_to_", From, ")")) >= 1 ~ "none-to-one",
TRUE ~ NA_character_ )) %>%
mutate( !!sym(paste0("Count(", From, "_to_", To, ")")) := replace_na( !!sym(paste0("Count(", From, "_to_", To, ")")), 0)) %>%
mutate( !!sym(paste0("Count(", To, "_to_", From, ")")) := replace_na( !!sym(paste0("Count(", To, "_to_", From, ")")), 0))
ResList[["Summary"]] <- Summary
}
## ------------------ Save the results ------------------- ##
suppressMessages(suppressWarnings(
SaveRes(InputList_DF=ResList,
InputList_Plot= NULL,
SaveAs_Table=SaveAs_Table,
SaveAs_Plot=NULL,
FolderPath= SubFolder,
FileName= "MappingAmbiguity",
CoRe=FALSE,
PrintPlot=FALSE)))
#Return
return(invisible(ResList))
}
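## Illustrative usage sketch (comments only, not executed): element names of the returned
## list follow the From/To arguments, shown here for the roxygen example above (InputDF as
## produced there).
# Res <- MetaProViz::MappingAmbiguity(InputData = InputDF, From = "MetaboliteID", To = "pubchem",
#                                     GroupingVariable = "term", Summary = TRUE)
# Res[["MetaboliteID-to-pubchem"]]          # per-ID ambiguity annotation, From-to-To
# Res[["MetaboliteID-to-pubchem_Summary"]]  # ambiguity summary, From-to-To
# Res[["pubchem-to-MetaboliteID_Summary"]]  # ambiguity summary, To-to-From
# Res[["Summary"]]                          # combined long summary (only if Summary = TRUE)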
##########################################################################################
### ### ### Check Measured ID's in prior knowledge ### ### ###
##########################################################################################
#' Check and summarize PriorKnowledge-to-MeasuredFeatures relationship
#'
#' @param InputData Dataframe with at least one column with the detected metabolite IDs (e.g. HMDB). If there are multiple IDs per detected peak, please separate them by comma ("," or ", " or chr list). If there is a main ID and additional IDs, please provide them in separate columns.
#' @param PriorKnowledge Dataframe with at least one column with the metabolite IDs (e.g. HMDB) that need to match the InputData metabolite IDs, and a source column (e.g. term). If there are multiple IDs, e.g. because the original pathway IDs (e.g. KEGG) were translated (e.g. to HMDB), please separate them by comma ("," or ", " or chr list).
#' @param SettingsInfo Column names of the metabolite IDs in InputData and PriorKnowledge as well as the column name of the GroupingVariable in PriorKnowledge. \strong{Default = c(InputID="HMDB", PriorID="HMDB", GroupingVariable="term")}
#' @param SaveAs_Table \emph{Optional: } File types for the analysis results are: "csv", "xlsx", "txt". \strong{Default = "csv"}
#' @param FolderPath \emph{Optional: } Path to the folder the results should be saved at. \strong{Default = NULL}
#'
#' @importFrom dplyr mutate filter distinct group_by ungroup select across everything case_when
#' @importFrom tidyr separate_rows
#' @importFrom rlang !!! !! := sym syms
#'
#' @examples
#' DetectedIDs <- MetaProViz::ToyData(Data="Cells_MetaData")%>% tibble::rownames_to_column("Metabolite") %>%dplyr::select("Metabolite", "HMDB")%>%tidyr::drop_na()
#' PathwayFile <- MetaProViz::TranslateID(InputData= MetaProViz::LoadKEGG(), SettingsInfo = c(InputID="MetaboliteID", GroupingVariable="term"), From = c("kegg"), To = c("hmdb"))[["TranslatedDF"]]%>%tidyr::drop_na()
#' Res <- MetaProViz::CheckMatchID(InputData= DetectedIDs, PriorKnowledge= PathwayFile, SettingsInfo = c(InputID="HMDB", PriorID="hmdb", GroupingVariable="term"))
#'
#' @noRd
#'
CheckMatchID <- function(InputData,
PriorKnowledge,
SettingsInfo = c(InputID="HMDB", PriorID="HMDB", GroupingVariable="term"),
SaveAs_Table= "csv",
FolderPath=NULL
){
## ------------ Create log file ----------- ##
MetaProViz_Init()
## ------------ Check Input files ----------- ##
## InputData:
if("InputID" %in% names(SettingsInfo)){
if(SettingsInfo[["InputID"]] %in% colnames(InputData)== FALSE){
message <- paste0("The ", SettingsInfo[["InputID"]], " column selected as InputID in SettingsInfo was not found in InputData. Please check your input.")
logger::log_trace(paste("Error ", message, sep=""))
stop(message)
}
}else{
message <- paste0("No InputID provided in SettingsInfo. Please check your input.")
logger::log_trace(paste("Error ", message, sep=""))
stop(message)
}
### This is after the main input checks (before NA removal), so we will save original df here for later merging to get the Null and duplicates back.
InputData_Original <- InputData
if(sum(is.na(InputData[[SettingsInfo[["InputID"]]]])) >=1){#remove NAs:
message <- paste0(sum(is.na(InputData[[SettingsInfo[["InputID"]]]])), " NA values were removed from column ", SettingsInfo[["InputID"]])
logger::log_trace(paste("Warning: ", message, sep=""))
InputData <- InputData %>%
filter(!is.na(.data[[SettingsInfo[["InputID"]]]]))
warning(message)
}
if(nrow(InputData) - nrow(distinct(InputData, .data[[SettingsInfo[["InputID"]]]])) >= 1){# Remove duplicate IDs
message <- paste0(nrow(InputData) - nrow(distinct(InputData, .data[[SettingsInfo[["InputID"]]]])), " duplicated IDs were removed from column ", SettingsInfo[["InputID"]])
logger::log_trace(paste("Warning: ", message, sep=""))
InputData <- InputData %>%
distinct(.data[[SettingsInfo[["InputID"]]]], .keep_all = TRUE)
warning(message)
}
InputData_MultipleIDs <- any(
grepl(",\\s*", InputData[[SettingsInfo[["InputID"]]]]) | # Comma-separated
sapply(InputData[[SettingsInfo[["InputID"]]]] , function(x) {
if (grepl("^c\\(|^list\\(", x)) {
parsed <- tryCatch(eval(parse(text = x)), error = function(e) NULL)
return(is.list(parsed) && length(parsed) > 1 || is.vector(parsed) && length(parsed) > 1)
}
FALSE
})
)
## PriorKnowledge:
if("PriorID" %in% names(SettingsInfo)){
if(SettingsInfo[["PriorID"]] %in% colnames(PriorKnowledge)== FALSE){
message <- paste0("The ", SettingsInfo[["PriorID"]], " column selected as PriorID in SettingsInfo was not found in PriorKnowledge. Please check your input.")
logger::log_trace(paste("Error ", message, sep=""))
stop(message)
}
}else{
message <- paste0("No PriorID provided in SettingsInfo. Please check your input.")
logger::log_trace(paste("Error ", message, sep=""))
stop(message)
}
### This is after the main input checks (before NA removal), so we will save original df here for later merging to get the Null and duplicates back.
PriorKnowledge_Original <- PriorKnowledge
if(sum(is.na(PriorKnowledge[[SettingsInfo[["PriorID"]]]])) >=1){#remove NAs:
message <- paste0(sum(is.na(PriorKnowledge[[SettingsInfo[["PriorID"]]]])), " NA values were removed from column ", SettingsInfo[["PriorID"]])
logger::log_trace(paste("Warning: ", message, sep=""))
PriorKnowledge <- PriorKnowledge %>%
filter(!is.na(.data[[SettingsInfo[["PriorID"]]]]))
warning(message)
}
if("GroupingVariable" %in% names(SettingsInfo)){#Add GroupingVariable
if(SettingsInfo[["GroupingVariable"]] %in% colnames(PriorKnowledge)== FALSE){
message <- paste0("The ", SettingsInfo[["GroupingVariable"]], " column selected as GroupingVariable in SettingsInfo was not found in PriorKnowledge. Please check your input.")
logger::log_trace(paste("Error ", message, sep=""))
stop(message)
}
}else{
#Add GroupingVariable
SettingsInfo["GroupingVariable"] <- "GroupingVariable"
PriorKnowledge["GroupingVariable"] <- "None"
message <- paste0("No GroupingVariable provided in SettingsInfo. If this was not intentional, please check your input.")
logger::log_trace(message)
message(message)
}
if(nrow(PriorKnowledge) - nrow(distinct(PriorKnowledge, .data[[SettingsInfo[["PriorID"]]]], .data[[SettingsInfo[["GroupingVariable"]]]])) >= 1){# Remove duplicate IDs
message <- paste0(nrow(PriorKnowledge) - nrow(distinct(PriorKnowledge, .data[[SettingsInfo[["PriorID"]]]], .data[[SettingsInfo[["GroupingVariable"]]]])) , " duplicated IDs were removed from column ", SettingsInfo[["PriorID"]])
logger::log_trace(paste("Warning: ", message, sep=""))
PriorKnowledge <- PriorKnowledge %>%
distinct(.data[[SettingsInfo[["PriorID"]]]], !!sym(SettingsInfo[["GroupingVariable"]]), .keep_all = TRUE)%>%
group_by(!!sym(SettingsInfo[["PriorID"]])) %>%
mutate(across(everything(), ~ if (is.character(.)) paste(unique(.), collapse = ", ")))%>%
ungroup()%>%
distinct(.data[[SettingsInfo[["PriorID"]]]], .keep_all = TRUE)
warning(message)
}
PK_MultipleIDs <- any(# Check if multiple IDs are present:
grepl(",\\s*", PriorKnowledge[[SettingsInfo[["PriorID"]]]]) | # Comma-separated
sapply(PriorKnowledge[[SettingsInfo[["PriorID"]]]] , function(x) {
if (grepl("^c\\(|^list\\(", x)) {
parsed <- tryCatch(eval(parse(text = x)), error = function(e) NULL)
return(is.list(parsed) && length(parsed) > 1 || is.vector(parsed) && length(parsed) > 1)
}
FALSE
})
)
## ------------ Create Results output folder ----------- ##
if(is.null(SaveAs_Table)==FALSE){
Folder <- SavePath(FolderName= "PriorKnowledgeChecks",
FolderPath=FolderPath)
SubFolder <- file.path(Folder, "CheckMatchID_Detected-to-PK")
if (!dir.exists(SubFolder)) {dir.create(SubFolder)}
}
######################################################################################################################################
## ------------ Check how IDs match and if needed remove unmatched IDs ----------- ##
# 1. Create long DF
create_long_df <- function(df, id_col, df_name) {
df %>%
mutate(row_id = dplyr::row_number()) %>%
mutate(!!paste0("OriginalEntry_", df_name, sep="") := !!sym(id_col)) %>% # Store original values
separate_rows(!!sym(id_col), sep = ",\\s*") %>%
group_by(row_id) %>%
mutate(!!(paste0("OriginalGroup_", df_name, sep="")) := paste0(df_name, "_", dplyr::cur_group_id())) %>%
ungroup()
}
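# Minimal sketch of what create_long_df does (comments only, not executed; `toy` is a hypothetical input):
# toy <- data.frame(HMDB = c("HMDB0000123, HMDB0000456", "HMDB0000789"))
# create_long_df(toy, "HMDB", "InputData")
# # -> three rows; the first two share the same OriginalEntry_InputData
# #    ("HMDB0000123, HMDB0000456") and the same OriginalGroup_InputData identifier.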
if(InputData_MultipleIDs){
InputData_long <- create_long_df(InputData, SettingsInfo[["InputID"]], "InputData")%>%
select(SettingsInfo[["InputID"]],"OriginalEntry_InputData", OriginalGroup_InputData)
}else{
InputData_long <- InputData %>%
mutate(OriginalGroup_InputData := paste0("InputData_", dplyr::row_number()))%>%
select(SettingsInfo[["InputID"]], OriginalGroup_InputData)
}
if(PK_MultipleIDs){
PK_long <- create_long_df(PriorKnowledge, SettingsInfo[["PriorID"]], "PK")%>%
select(SettingsInfo[["PriorID"]], "OriginalEntry_PK", OriginalGroup_PK, SettingsInfo[["GroupingVariable"]])
}else{
PK_long <- PriorKnowledge %>%
mutate(OriginalGroup_PK := paste0("PK_", dplyr::row_number()))%>%
select(SettingsInfo[["PriorID"]],OriginalGroup_PK, SettingsInfo[["GroupingVariable"]])
}
# 2. Merge DF
merged_df <- merge(PK_long, InputData_long, by.x= SettingsInfo[["PriorID"]], by.y= SettingsInfo[["InputID"]], all=TRUE)%>%
distinct(!!sym(SettingsInfo[["PriorID"]]), OriginalGroup_InputData, .keep_all = TRUE)
#3. Add information to summarize and describe problems
merged_df <- merged_df %>%
# num_PK_entries
group_by(OriginalGroup_PK, !!sym(SettingsInfo[["GroupingVariable"]])) %>%
mutate(
num_PK_entries = sum(!is.na(OriginalGroup_PK)),
num_PK_entries_groups = dplyr::n_distinct(OriginalGroup_PK, na.rm = TRUE)) %>% # count the times we have the same PK_entry match with multiple InputData entries --> extend below!
ungroup()%>%
# num_Input_entries
group_by(OriginalGroup_InputData, !!sym(SettingsInfo[["GroupingVariable"]])) %>%
mutate(
num_Input_entries = sum(!is.na(OriginalGroup_InputData)),
num_Input_entries_groups = dplyr::n_distinct(OriginalGroup_InputData, na.rm = TRUE))%>%
ungroup()%>%
mutate(
ActionRequired = case_when(
num_Input_entries ==1 & num_Input_entries_groups == 1 & num_PK_entries_groups == 1 ~ "None",
num_Input_entries ==1 & num_Input_entries_groups == 1 & num_PK_entries_groups >= 2 ~ "Check",
num_Input_entries ==1 & num_Input_entries_groups >= 2 & num_PK_entries_groups == 1 ~ "Check",
num_Input_entries > 1 & num_Input_entries_groups == 1 & num_PK_entries_groups >= 2 ~ "Check",
num_Input_entries > 1 & num_Input_entries_groups >= 2 & num_PK_entries_groups >= 2 ~ "Check",
num_Input_entries == 0 ~ "None", # ID(s) of PK not measured
TRUE ~ NA_character_
)
)%>%
mutate(
Detection = case_when(
num_Input_entries ==1 & num_Input_entries_groups == 1 & num_PK_entries_groups == 1 ~ "One input ID of the same group maps to at least ONE PK ID of ONE group",
num_Input_entries ==1 & num_Input_entries_groups == 1 & num_PK_entries_groups >= 2 ~ "One input ID of the same group maps to at least ONE PK ID of MANY groups",
num_Input_entries ==1 & num_Input_entries_groups >= 2 & num_PK_entries_groups == 1 ~ "One input ID of MANY groups maps to at least ONE PK ID of ONE group",
num_Input_entries > 1 & num_Input_entries_groups == 1 & num_PK_entries_groups >= 2 ~ "MANY input IDs of the same group map to at least ONE PK ID of MANY PK groups",
num_Input_entries > 1 & num_Input_entries_groups >= 2 & num_PK_entries_groups >= 2 ~ "MANY input IDs of MANY groups map to at least ONE PK ID of MANY PK groups",
num_Input_entries == 0 ~ "Not Detected", # ID(s) of PK not measured
TRUE ~ NA_character_
)
)
# Handle "Detected-to-PK" (When PK has multiple IDs)
#group_by(OriginalGroup_PK, !!sym(SettingsInfo[["GroupingVariable"]])) %>%
#mutate(
# `Detected-to-PK` = case_when(
# num_Input_entries == 0 & num_PK_entries == 1 ~ "none-to-one", # No match & OriginalGroup_PK appears once
# num_Input_entries == 0 & num_PK_entries > 1 ~ "none-to-many", # No match & OriginalGroup_PK appears multiple times
# num_Input_entries == 1 & num_PK_entries == 1 ~ "one-to-one", # One unique match in InputData, one PK
# num_Input_entries == 1 & num_PK_entries > 1 ~ "one-to-many", # One unique match in InputData, multiple PKs
# num_Input_entries > 1 & num_PK_entries == 1 ~ "many-to-one", # Multiple matches in InputData, one PK
# num_Input_entries > 1 & num_PK_entries > 1 ~ "many-to-many", # Multiple matches in InputData, multiple PKs
# num_Input_entries == 1 & num_PK_entries == 0 ~ "one-to-none",
# num_Input_entries > 1 & num_PK_entries == 0 ~ "many-to-none",
# TRUE ~ NA_character_
# )
#) %>%
#ungroup() %>%
# Handle "PK-to-Detected" (When InputData has multiple IDs)
#group_by(OriginalGroup_InputData, !!sym(SettingsInfo[["GroupingVariable"]])) %>%
#mutate(
# `PK-to-Detected` = case_when(
# num_PK_entries == 0 & num_Input_entries == 1 ~ "none-from-one",
# num_PK_entries == 0 & num_Input_entries > 1 ~ "none-from-many",
# num_PK_entries == 1 & num_Input_entries == 1 ~ "one-from-one",
# num_PK_entries == 1 & num_Input_entries > 1 ~ "one-from-many",
# num_PK_entries > 1 & num_Input_entries == 1 ~ "many-from-one",
# num_PK_entries > 1 & num_Input_entries > 1 ~ "many-from-many",
# num_PK_entries > 1 & num_Input_entries == 0 ~ "many-from-none",
# num_PK_entries == 1 & num_Input_entries == 0 ~ "one-from-none",
# TRUE ~ NA_character_
# )
#) %>%
#ungroup() %>%
# Assign ActionRequired (If many-to-many in either case)
#mutate(
# ActionRequired = case_when(
# `Detected-to-PK` == "many-to-many" | `PK-to-Detected` == "many-from-many" ~ "Check",
# TRUE ~ "None"
# )
#)
# 4. Create summary table
Values_InputData <- unique(InputData[[SettingsInfo[["InputID"]]]])
Values_PK <- unique(PK_long[[SettingsInfo[["PriorID"]]]])
summary_df <- tibble::tibble(
!!sym(SettingsInfo[["InputID"]]) := Values_InputData,
found_match_in_PK = NA,
matches = NA_character_,
match_overlap_percentage = NA_real_,
original_count = NA_integer_,
matches_count = NA_integer_
)
# Populate the summary data frame
for(i in seq_along(Values_InputData)) {
# Handle NA case explicitly
if (is.na(Values_InputData[i])) {
summary_df$original_count[i] <- 0
summary_df$matches_count[i] <- 0
summary_df$match_overlap_percentage[i] <- NA
summary_df$found_match_in_PK[i] <- NA # could also set it to FALSE but making NA for now for plotting
summary_df$matches[i] <- NA
} else {
# Split each cell into individual entries and trim whitespace
entries <- trimws(unlist(strsplit(as.character(Values_InputData[i]), ",\\s*"))) # delimiter = "," or ", "
# Identify which entries are in the lookup set
matched <- entries[entries %in% Values_PK]
# Determine if any match was found
summary_df$found_match_in_PK[i] <- length(matched) > 0
# Concatenate matched entries into a single string
summary_df$matches[i] <- paste(matched, collapse = ", ")
# Calculate and store counts
summary_df$original_count[i] <- length(entries)
summary_df$matches_count[i] <- length(matched)
# Calculate fraction: matched entries / total entries
if(length(entries) > 0) {
summary_df$match_overlap_percentage[i] <- (length(matched) / length(entries))*100
} else {
summary_df$match_overlap_percentage[i] <- NA
}
}
}
summary_df <- merge(x= summary_df,
y= merged_df%>%
dplyr::select(-c(OriginalGroup_PK, OriginalGroup_InputData))%>%
distinct(!!sym(SettingsInfo[["PriorID"]]), .keep_all = TRUE),
by.x= SettingsInfo[["InputID"]] ,
by.y= SettingsInfo[["PriorID"]],
all.x=TRUE)
if(PK_MultipleIDs){
summary_df <- merge(x= summary_df, y= InputData, by=SettingsInfo[["InputID"]], all.x=TRUE)%>%
distinct(!!sym(SettingsInfo[["InputID"]]), OriginalEntry_PK, .keep_all = TRUE)
}else{
summary_df <- merge(x= summary_df, y= InputData, by=SettingsInfo[["InputID"]], all.x=TRUE)%>%
distinct(!!sym(SettingsInfo[["InputID"]]), .keep_all = TRUE)
}
# 5. Merge back on input data to retain Nulls and duplications in case the user wants this (e.g. for plotting or inspecting further)
# Function: add_NA_to_table
#
# Description:
# This function takes two data frames:
# - table_with_NA: the original table that may contain rows where the key column is NA.
# - table_without_NA: a processed table (e.g. from a join) that contains extra columns and excludes rows where the key is NA.
#
# The function extracts rows from table_with_NA where the key is NA, extends these rows by adding any extra columns
# (present in table_without_NA but not in table_with_NA) with NA values, and then binds these extended rows to table_without_NA.
#
# Parameters:
# table_with_NA: Data frame containing the original data (e.g. FeatureMetadata_Biocrates).
# table_without_NA: Data frame containing the processed data with extra columns (e.g. tempnew).
# key: The column name (as a string) used as the key for matching (e.g. "HMDB").
#
# Returns:
# A combined data frame that includes the rows from table_without_NA along with the extended NA rows from table_with_NA.
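#
# Example (illustrative sketch only; `original_df` and `joined_df` are hypothetical objects):
#   combined <- add_NA_to_table(table_with_NA = original_df,
#                               table_without_NA = joined_df,
#                               key = "HMDB")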
add_NA_to_table <- function(table_with_NA, table_without_NA, key) {
# Subset rows from the original table where the key column is NA
na_rows <- dplyr::filter(table_with_NA, is.na(.data[[key]]))
# Identify extra columns present in table_without_NA that are not in the original table
extra_cols <- setdiff(names(table_without_NA), names(table_with_NA))