@@ -399,10 +399,10 @@ parse_mitcr <- function(.filename, .mode) {
399
399
)
400
400
}
401
401
402
- parse_mixcr <- function (.filename , .mode ) {
402
+ parse_mixcr <- function (.filename , .mode , .count = c( " clonecount " , " readcount " ) ) {
403
403
.filename <- .filename
404
404
.id <- " cloneid"
405
- .count <- " clonecount "
405
+ .count % <> % tolower()
406
406
.sep <- " \t "
407
407
.vd.insertions <- " VD.insertions"
408
408
.dj.insertions <- " DJ.insertions"
@@ -677,16 +677,21 @@ parse_mixcr <- function(.filename, .mode) {
677
677
df [[pos_extra_headers [[" j3del" ]]]] <- sapply(df [[" refpoints" ]], get_ref_point_position , 18 )
678
678
}
679
679
680
- if (! (.count %in% table.colnames )) {
680
+ if (! any (.count %in% table.colnames )) {
681
681
warn_msg <- c(" [!] Warning: can't find a column with clonal counts. Setting all clonal counts to 1." )
682
682
warn_msg <- c(warn_msg , " \n Did you apply repLoad to MiXCR file *_alignments.txt?" )
683
683
warn_msg <- c(warn_msg , " If so please consider moving all *.clonotypes.*.txt MiXCR files to" )
684
684
warn_msg <- c(warn_msg , " a separate folder and apply repLoad to the folder." )
685
685
warn_msg <- c(warn_msg , " \n Note: The *_alignments.txt file IS NOT a repertoire file suitable for any analysis." )
686
686
message(warn_msg )
687
687
688
+ .count <- .count [1 ]
688
689
df [[.count ]] <- 1
690
+ } else if (length(.count ) > 1 ) {
691
+ # if multiple column name options are specified for .count, keep only the first one found in table.colnames
692
+ .count <- .count [.count %in% table.colnames ][1 ]
689
693
}
694
+
690
695
.freq <- " Proportion"
691
696
df $ Proportion <- df [[.count ]] / sum(df [[.count ]], na.rm = TRUE )
692
697
@@ -829,8 +834,6 @@ parse_tcr <- function(.filename, .mode) {
829
834
}
830
835
831
836
parse_vdjtools <- function (.filename , .mode ) {
832
- skip <- 0
833
-
834
837
# Check for different VDJtools outputs
835
838
f <- file(.filename , " r" )
836
839
l <- readLines(f , 1 )
@@ -959,19 +962,28 @@ parse_airr <- function(.filename, .mode) {
959
962
.as_tsv() %> %
960
963
airr :: read_rearrangement()
961
964
962
- df <- df %> %
963
- select(
964
- sequence , v_call , d_call , j_call , junction , junction_aa ,
965
- contains(" v_germline_end" ), contains(" d_germline_start" ), contains(" d_germline_end" ),
966
- contains(" j_germline_start" ), contains(" np1_length" ), contains(" np2_length" ),
967
- contains(" duplicate_count" )
965
+ df %<> %
966
+ select_(
967
+ " sequence" , " v_call" , " d_call" , " j_call" , " junction" , " junction_aa" ,
968
+ ~ contains(" v_germline_end" ), ~ contains(" d_germline_start" ),
969
+ ~ contains(" d_germline_end" ), ~ contains(" j_germline_start" ),
970
+ ~ contains(" np1_length" ), ~ contains(" np2_length" ),
971
+ ~ contains(" duplicate_count" ),
972
+ " cdr1" , " cdr2" , " cdr1_aa" , " cdr2_aa" , " fwr1" , " fwr2" , " fwr3" , " fwr4" ,
973
+ " fwr1_aa" , " fwr2_aa" , " fwr3_aa" , " fwr4_aa"
968
974
)
969
975
970
976
namekey <- c(
971
977
duplicate_count = IMMCOL $ count , junction = IMMCOL $ cdr3nt , junction_aa = IMMCOL $ cdr3aa ,
972
978
v_call = IMMCOL $ v , d_call = IMMCOL $ d , j_call = IMMCOL $ j , v_germline_end = IMMCOL $ ve ,
973
979
d_germline_start = IMMCOL $ ds , d_germline_end = IMMCOL $ de , j_germline_start = IMMCOL $ js ,
974
- np1_length = " unidins" , np2_length = IMMCOL $ dnj , sequence = IMMCOL $ seq
980
+ np1_length = " unidins" , np2_length = IMMCOL $ dnj , sequence = IMMCOL $ seq ,
981
+ cdr1 = IMMCOL_EXT $ cdr1nt , cdr2 = IMMCOL_EXT $ cdr2nt ,
982
+ cdr1_aa = IMMCOL_EXT $ cdr1aa , cdr2_aa = IMMCOL_EXT $ cdr2aa ,
983
+ fwr1 = IMMCOL_EXT $ fr1nt , fwr2 = IMMCOL_EXT $ fr2nt ,
984
+ fwr3 = IMMCOL_EXT $ fr3nt , fwr4 = IMMCOL_EXT $ fr4nt ,
985
+ fwr1_aa = IMMCOL_EXT $ fr1aa , fwr2_aa = IMMCOL_EXT $ fr2aa ,
986
+ fwr3_aa = IMMCOL_EXT $ fr3aa , fwr4_aa = IMMCOL_EXT $ fr4aa
975
987
)
976
988
977
989
names(df ) <- namekey [names(df )]
@@ -993,13 +1005,15 @@ parse_airr <- function(.filename, .mode) {
993
1005
}
994
1006
}
995
1007
996
- for (column in IMMCOL $ order ) {
1008
+ order <- c(IMMCOL $ order , IMMCOL_EXT $ order [IMMCOL_EXT $ order %in% namekey ])
1009
+
1010
+ for (column in order ) {
997
1011
if (! (column %in% colnames(df ))) {
998
1012
df [column ] <- NA
999
1013
}
1000
1014
}
1001
1015
1002
- df <- df [IMMCOL $ order ]
1016
+ df <- df [order ]
1003
1017
total <- sum(df $ Clones )
1004
1018
df [IMMCOL $ prop ] <- df [IMMCOL $ count ] / total
1005
1019
df [IMMCOL $ seq ] <- stringr :: str_remove_all(df [[IMMCOL $ seq ]], " N" )
@@ -1039,21 +1053,50 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
1039
1053
.vgenes = " v_gene" , .jgenes = " j_gene" , .dgenes = " d_gene" ,
1040
1054
.vend = NA , .jstart = NA , .dstart = NA , .dend = NA ,
1041
1055
.vd.insertions = NA , .dj.insertions = NA , .total.insertions = NA ,
1042
- .skip = 0 , .sep = " ," , # .add = c("chain", "raw_clonotype_id", "raw_consensus_id", "barcode", "contig_id")
1043
- .add = c(" chain" , " barcode" , " raw_clonotype_id" , " contig_id" , " c_gene" )
1056
+ .skip = 0 , .sep = " ," ,
1057
+ .add = c(
1058
+ " chain" , " barcode" , " raw_clonotype_id" , " contig_id" , " c_gene" ,
1059
+ " cdr1_nt" , " cdr1" , " cdr2_nt" , " cdr2" ,
1060
+ " fwr1_nt" , " fwr1" , " fwr2_nt" , " fwr2" , " fwr3_nt" , " fwr3" , " fwr4_nt" , " fwr4"
1061
+ )
1044
1062
)
1045
1063
1064
+ setnames(df , " cdr1_nt" , IMMCOL_EXT $ cdr1nt )
1065
+ setnames(df , " cdr2_nt" , IMMCOL_EXT $ cdr2nt )
1066
+ setnames(df , " cdr1" , IMMCOL_EXT $ cdr1aa )
1067
+ setnames(df , " cdr2" , IMMCOL_EXT $ cdr2aa )
1068
+ setnames(df , " fwr1_nt" , IMMCOL_EXT $ fr1nt )
1069
+ setnames(df , " fwr2_nt" , IMMCOL_EXT $ fr2nt )
1070
+ setnames(df , " fwr3_nt" , IMMCOL_EXT $ fr3nt )
1071
+ setnames(df , " fwr4_nt" , IMMCOL_EXT $ fr4nt )
1072
+ setnames(df , " fwr1" , IMMCOL_EXT $ fr1aa )
1073
+ setnames(df , " fwr2" , IMMCOL_EXT $ fr2aa )
1074
+ setnames(df , " fwr3" , IMMCOL_EXT $ fr3aa )
1075
+ setnames(df , " fwr4" , IMMCOL_EXT $ fr4aa )
1076
+
1046
1077
# Process 10xGenomics filtered contigs files - count barcodes, merge consensus ids, clonotype ids and contig ids
1047
1078
df <- df [order(df $ chain ), ]
1048
1079
setDT(df )
1049
1080
1050
1081
if (.mode == " paired" ) {
1051
- df <- df % > %
1082
+ df % < > %
1052
1083
lazy_dt() %> %
1053
- group_by( barcode , raw_clonotype_id ) %> %
1084
+ group_by_colnames( " barcode" , " raw_clonotype_id" ) %> %
1054
1085
summarise(
1086
+ CDR1.nt = paste0(get(" CDR1.nt" ), collapse = IMMCOL_ADD $ scsep ),
1087
+ CDR1.aa = paste0(get(" CDR1.aa" ), collapse = IMMCOL_ADD $ scsep ),
1088
+ CDR2.nt = paste0(get(" CDR2.nt" ), collapse = IMMCOL_ADD $ scsep ),
1089
+ CDR2.aa = paste0(get(" CDR2.aa" ), collapse = IMMCOL_ADD $ scsep ),
1055
1090
CDR3.nt = paste0(get(" CDR3.nt" ), collapse = IMMCOL_ADD $ scsep ),
1056
1091
CDR3.aa = paste0(get(" CDR3.aa" ), collapse = IMMCOL_ADD $ scsep ),
1092
+ FR1.nt = paste0(get(" FR1.nt" ), collapse = IMMCOL_ADD $ scsep ),
1093
+ FR1.aa = paste0(get(" FR1.aa" ), collapse = IMMCOL_ADD $ scsep ),
1094
+ FR2.nt = paste0(get(" FR2.nt" ), collapse = IMMCOL_ADD $ scsep ),
1095
+ FR2.aa = paste0(get(" FR2.aa" ), collapse = IMMCOL_ADD $ scsep ),
1096
+ FR3.nt = paste0(get(" FR3.nt" ), collapse = IMMCOL_ADD $ scsep ),
1097
+ FR3.aa = paste0(get(" FR3.aa" ), collapse = IMMCOL_ADD $ scsep ),
1098
+ FR4.nt = paste0(get(" FR4.nt" ), collapse = IMMCOL_ADD $ scsep ),
1099
+ FR4.aa = paste0(get(" FR4.aa" ), collapse = IMMCOL_ADD $ scsep ),
1057
1100
V.name = paste0(get(" V.name" ), collapse = IMMCOL_ADD $ scsep ),
1058
1101
J.name = paste0(get(" J.name" ), collapse = IMMCOL_ADD $ scsep ),
1059
1102
D.name = paste0(get(" D.name" ), collapse = IMMCOL_ADD $ scsep ),
@@ -1067,23 +1110,46 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
1067
1110
as.data.table()
1068
1111
}
1069
1112
1070
- df <- df % > %
1113
+ df % < > %
1071
1114
lazy_dt() %> %
1072
- group_by(CDR3.nt , V.name , J.name ) %> %
1115
+ mutate(
1116
+ CDR3.nt.sorted = sort_string(get(" CDR3.nt" ), IMMCOL_ADD $ scsep ),
1117
+ V.name.sorted = sort_string(get(" V.name" ), IMMCOL_ADD $ scsep ),
1118
+ J.name.sorted = sort_string(get(" J.name" ), IMMCOL_ADD $ scsep )
1119
+ ) %> %
1120
+ group_by_colnames(" CDR3.nt.sorted" , " V.name.sorted" , " J.name.sorted" ) %> %
1073
1121
summarise(
1074
1122
Clones = length(unique(get(" barcode" ))),
1123
+ CDR3.nt = first(get(" CDR3.nt" )),
1075
1124
CDR3.aa = first(get(" CDR3.aa" )),
1125
+ V.name = first(get(" V.name" )),
1076
1126
D.name = first(get(" D.name" )),
1127
+ J.name = first(get(" J.name" )),
1077
1128
chain = first(get(" chain" )),
1078
1129
barcode = paste0(unique(get(" barcode" )), collapse = IMMCOL_ADD $ scsep ),
1079
1130
raw_clonotype_id = gsub(
1080
1131
" clonotype|None" , " " ,
1081
1132
paste0(unique(get(" raw_clonotype_id" )), collapse = IMMCOL_ADD $ scsep )
1082
1133
),
1083
1134
contig_id = paste0(get(" contig_id" ), collapse = IMMCOL_ADD $ scsep ),
1084
- c_gene = first(get(" c_gene" ))
1135
+ c_gene = first(get(" c_gene" )),
1136
+ CDR1.nt = first(get(IMMCOL_EXT $ cdr1nt )),
1137
+ CDR2.nt = first(get(IMMCOL_EXT $ cdr2nt )),
1138
+ CDR1.aa = first(get(IMMCOL_EXT $ cdr1aa )),
1139
+ CDR2.aa = first(get(IMMCOL_EXT $ cdr2aa )),
1140
+ FR1.nt = first(get(IMMCOL_EXT $ fr1nt )),
1141
+ FR2.nt = first(get(IMMCOL_EXT $ fr2nt )),
1142
+ FR3.nt = first(get(IMMCOL_EXT $ fr3nt )),
1143
+ FR4.nt = first(get(IMMCOL_EXT $ fr4nt )),
1144
+ FR1.aa = first(get(IMMCOL_EXT $ fr1aa )),
1145
+ FR2.aa = first(get(IMMCOL_EXT $ fr2aa )),
1146
+ FR3.aa = first(get(IMMCOL_EXT $ fr3aa )),
1147
+ FR4.aa = first(get(IMMCOL_EXT $ fr4aa ))
1085
1148
) %> %
1086
- as.data.table()
1149
+ as.data.table() %> %
1150
+ subset(
1151
+ select = - c(get(" CDR3.nt.sorted" ), get(" V.name.sorted" ), get(" J.name.sorted" ))
1152
+ )
1087
1153
1088
1154
df $ V.end <- NA
1089
1155
df $ J.start <- NA
0 commit comments