Skip to content

Commit 0ed2e49

Browse files
authored
Merge pull request #80 from mschertzer/refactor/filenames
refactor: standardize file naming
2 parents 57a047d + f0a0079 commit 0ed2e49

42 files changed

Lines changed: 261 additions & 307 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ rsconnect/
6262
log_files/*
6363

6464
results/*
65+
fragpipe_tools/*
6566

6667
jurkat_test_results/*
6768
<<<<<<< HEAD

bin/00_generate_hashids.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
#' - SQANTI classification file
1818
#'
1919
#' Outputs:
20-
#' - *_hashids_mapping.txt
20+
#' - *.transcriptome.hashids_mapping.txt
2121
#'
2222

2323
# =============================================================================
@@ -184,7 +184,7 @@ extract_junction_hash = function(hash_id) {
184184

185185
cat("\nSTEP 1: Generating hash ids for all transcripts")
186186

187-
psl = file.path(output_dir, paste0(basename, "_corrected.psl")) # output psl
187+
psl = file.path(output_dir, paste0(basename, ".transcriptome.psl")) # output psl
188188
hashid_file = file.path(output_dir, paste0(basename, "_hashids_raw.txt"))
189189

190190
convert_gtf_to_psl(gtf_input_path = sample_gtf,
@@ -355,7 +355,7 @@ output_mapping %<>%
355355
any_of("transcript_name"),
356356
structural_category)
357357

358-
mapping_output = file.path(output_dir, paste0(basename, "_hashids_mapping.txt"))
358+
mapping_output = file.path(output_dir, paste0(basename, ".transcriptome.hashids_mapping.txt"))
359359
write_tsv(output_mapping, mapping_output)
360360

361361
n_total = nrow(output_mapping)

bin/02_filter_sqanti_transcripts.R

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -375,8 +375,8 @@ cat("\nWriting output files")
375375
# Build lookup from new isoform_id to original transcript_id
376376
id_lookup = mapping %>% select(isoform_id, original_transcript_id)
377377

378-
write_tsv(sqanti_df_full, file.path(output_dir, paste0(basename, ".transcriptome.classification_all.txt")))
379-
write_tsv(sqanti_df, file.path(output_dir, paste0(basename, ".transcriptome.classification_filtered.txt")))
378+
#write_tsv(sqanti_df_full, file.path(output_dir, paste0(basename, ".transcriptome.classification_all.txt")))
379+
write_tsv(sqanti_df, file.path(output_dir, paste0(basename, ".transcriptome.filtered_SQANTI_classification.txt")))
380380

381381
# FASTA — filter to kept transcripts and rename headers to new isoform_id
382382
sequences = readDNAStringSet(sqanti_fasta)
@@ -406,8 +406,8 @@ if (any(is.na(names(filtered_sequences)))) {
406406
stop("ERROR: Found NA values in sequence names after mapping. This should not happen.")
407407
}
408408

409-
writeXStringSet(filtered_sequences, file.path(output_dir, paste0(basename, ".transcriptome.corrected_filtered.fasta")))
410-
cat("\nSaved ", length(filtered_sequences), " sequences to ", basename, ".transcriptome.corrected_filtered.fasta")
409+
writeXStringSet(filtered_sequences, file.path(output_dir, paste0(basename, ".transcriptome.filtered.fasta")))
410+
cat("\nSaved ", length(filtered_sequences), " sequences to ", basename, ".transcriptome.filtered.fasta")
411411

412412

413413
# sample gtf
@@ -438,17 +438,17 @@ filtered_gtf %<>%
438438
mutate(name = paste0(transcript_id, "|", avg_ratio))
439439

440440
gr_updated = makeGRangesFromDataFrame(filtered_gtf, keep.extra.columns = TRUE)
441-
gtf_output = file.path(output_dir, paste0(basename, ".transcriptome.corrected_filtered.gtf"))
441+
gtf_output = file.path(output_dir, paste0(basename, ".transcriptome.filtered.gtf"))
442442
export(gr_updated, gtf_output, format = "gtf")
443443

444444
# convert to bed12
445445
gtf_to_bed12(gtf_path = gtf_output,
446-
output_bed = file.path(output_dir, paste0(basename, ".transcriptome.corrected_filtered.bed")),
446+
output_bed = file.path(output_dir, paste0(basename, ".transcriptome.filtered.bed")),
447447
color_by = "avg_ratio")
448448

449449
# Filtered hashid and cpm table
450450
all_ids %>% filter(isoform_id %in% kept_ids) %>%
451-
write_tsv(file.path(output_dir, paste0(basename, ".transcriptome.hashids_with_cpm_filtered.txt")))
451+
write_tsv(file.path(output_dir, paste0(basename, ".transcriptome.filtered_hashids_with_cpm.txt")))
452452

453453
# Print summary
454454
cat("\n=== FILTERING SUMMARY ===\n")

bin/03_filter_cpat.R

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616
#' - Hashids mapping file (for restoring correct ID casing)
1717
#'
1818
#' Outputs:
19-
#' - *.predicted_proteome.all_orfs_mapped.tsv: all ORFs with genomic coordinates and quality scores
20-
#' - *.predicted_proteome.corrected_filtered_CDS.gtf: GTF with exon and CDS features for best ORF per transcript
19+
#' - *.predicted_proteome.CPAT_ORFs_mapped.tsv: all ORFs with genomic coordinates and quality scores
20+
#' - *.predicted_proteome.best_ORF.gtf: GTF with exon and CDS features for best ORF per transcript
2121
#'
2222

2323
# =============================================================================
@@ -353,7 +353,7 @@ plausible_orfs = mapped_orfs_classified %>% filter(orf_quality == "Clear Best OR
353353
best_orfs = mapped_orfs_classified %>% filter(orf_quality == "Clear Best ORF")
354354

355355
#write_tsv(best_orfs, file.path(cpat_dir, paste0(basename, "_best_orfs_mapped.tsv")))
356-
write_tsv(all_orfs, file.path(cpat_dir, paste0(basename, ".predicted_proteome.all_orfs_mapped.tsv")))
356+
write_tsv(all_orfs, file.path(cpat_dir, paste0(basename, ".predicted_proteome.CPAT_ORFs_mapped.tsv")))
357357

358358
# === STEP 5: Write gtf for best ORFs, including CDS and exon types, no collapsing here ===
359359
cat("\nSTEP 5: Writing GTF of best ORFs with CDS and exon types")
@@ -366,7 +366,7 @@ all_cds_exons %<>%
366366
left_join(distinct(sample_gtf, transcript_id, gene_id), by = c("isoform_id" = "transcript_id"))
367367
updated_gtf = write_gtf_with_cds(sample_gtf, all_cds_exons)
368368

369-
write.table(updated_gtf, file.path(cpat_dir, paste0(basename, ".predicted_proteome.corrected_filtered_CDS.gtf")),
369+
write.table(updated_gtf, file.path(cpat_dir, paste0(basename, ".predicted_proteome.best_ORF.gtf")),
370370
sep = "\t",
371371
quote = FALSE,
372372
row.names = FALSE,
@@ -400,4 +400,4 @@ cat(paste0("- After further filtering, a 'Clear Best ORF' was identified for ",
400400
cat("\n=== CPAT FILTER OUTPUT FILES ===\n")
401401
cat(paste0("All CPAT ORFs with Quality Metrics: ", basename, ".predicted_proteome.CPAT_ORFs_mapped.tsv\n"))
402402
#cat(paste0("The single best plausible ORF per transcript: ", basename, "_best_cpat_orfs_mapped.tsv"))
403-
cat(paste0("GTF contains exon type for all transcripts and CDS type for transcripts with a best plausible ORF: ", basename, ".predicted_proteome.corrected_filtered_CDS.gtf"))
403+
cat(paste0("GTF contains exon type for all transcripts and CDS type for transcripts with a best plausible ORF: ", basename, ".predicted_proteome.best_ORF.gtf"))

bin/04_protein_classification.R

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,13 @@
5252
#'
5353
#' Outputs:
5454
#' One row per transcript (not collapsed)
55-
#' - *predicted_proteome.protein_classification_all_isoforms.txt
56-
#' - *.predicted.proteome.all_best_orfs.fa (for proteome reference building)
55+
#' - *.predicted_proteome.best_ORF_summary.txt
56+
#' - *.predicted_proteome.best_ORF.fa (for proteome reference building)
5757
#'
5858
#' One row per unique ORF (collapsed)
59-
#' - *predicted.proteome.high_confidence_ORF_cpm.txt
60-
#' - *predicted.proteome.high_confidence_ORF.gtf
61-
#' - *predicted.proteome.high_confidence_ORF.bed
59+
#' - *.predicted_proteome.collapsed_high_confidence_ORF_hashids_with_cpm.txt
60+
#' - *.predicted_proteome.collapsed_high_confidence_ORF.gtf
61+
#' - *.predicted_proteome.collapsed_high_confidence_ORF.bed
6262
#'
6363

6464

@@ -1017,7 +1017,7 @@ full_protein %<>%
10171017
num_junc_after_stop_codon, num_nt_after_stop_codon, num_5utr_exons, tss_in_gc_exons, utr_cat,
10181018
tclass = tx_cat, pclass = protein_classification_base, psubclass = protein_classification_subset, psubclass_short, filter_status)
10191019

1020-
write_tsv(full_protein, file.path(output_dir, paste0(basename, ".predicted.proteome.protein_classification_all_isoforms.txt")))
1020+
write_tsv(full_protein, file.path(output_dir, paste0(basename, ".predicted_proteome.best_ORF_summary.txt")))
10211021

10221022
# Generate a matching amino acid fasta for the mass spec reference; this will include lower confidence sequences such as NMD
10231023
full = readDNAStringSet(sample_dna_fasta_path) # dna fasta
@@ -1053,7 +1053,7 @@ orf_groups %<>%
10531053

10541054
protein_seqs = AAStringSet(orf_groups$orf_aa_sequence)
10551055
names(protein_seqs) = orf_groups$header
1056-
writeXStringSet(protein_seqs, file.path(output_dir, paste0(basename, ".predicted.proteome.all_best_orfs.fa")))
1056+
writeXStringSet(protein_seqs, file.path(output_dir, paste0(basename, ".predicted_proteome.best_ORF.fa")))
10571057

10581058
# =======================================================================================
10591059
# STEP 7: Write high confidence, ORF centric GTF and count file
@@ -1075,7 +1075,7 @@ hc_collapsed = hc_orf_groups %>%
10751075
left_join(name_map, by = "orf_base_id") %>%
10761076
select(orf_base_id, orf_all_isoform_id, gene_id, reference_gene_name, everything())
10771077

1078-
write_tsv(hc_collapsed, file.path(output_dir, paste0(basename, ".predicted.proteome.high_confidence_ORF_cpm.txt")))
1078+
write_tsv(hc_collapsed, file.path(output_dir, paste0(basename, ".predicted_proteome.collapsed_high_confidence_ORF_hashids_with_cpm.txt")))
10791079

10801080
# write a corresponding ORF centric gtf
10811081
gtf = import(sample_cds_gtf_path) %>%
@@ -1119,10 +1119,10 @@ hc_gtf %<>%
11191119

11201120
# calculate ratio
11211121
gr_updated = makeGRangesFromDataFrame(hc_gtf, keep.extra.columns = TRUE)
1122-
export(gr_updated, file.path(output_dir, paste0(basename, ".predicted.proteome.high_confidence_ORF.gtf")), format = "gtf")
1122+
export(gr_updated, file.path(output_dir, paste0(basename, ".predicted_proteome.collapsed_high_confidence_ORF.gtf")), format = "gtf")
11231123

11241124
# convert to bed12
1125-
gtf_to_bed12(gtf_path = file.path(output_dir, paste0(basename, ".predicted.proteome.high_confidence_ORF.gtf")),
1126-
output_bed = file.path(output_dir, paste0(basename, ".predicted.proteome.high_confidence_ORF.bed")),
1125+
gtf_to_bed12(gtf_path = file.path(output_dir, paste0(basename, ".predicted_proteome.collapsed_high_confidence_ORF.gtf")),
1126+
output_bed = file.path(output_dir, paste0(basename, ".predicted_proteome.collapsed_high_confidence_ORF.bed")),
11271127
color_by = "avg_orf_ratio",
11281128
track_name = paste0(basename, "_predicted_proteome"))

bin/04_sqanti3_protein.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1395,7 +1395,7 @@ def write_output(outfile, feature_map, feature_name):
13951395

13961396
output_dir = args.output_dir
13971397
output_prefix = args.output_prefix
1398-
output_filename = os.path.join(output_dir, output_prefix+'.sqanti_protein_classification.tsv')
1398+
output_filename = os.path.join(output_dir, output_prefix+'.predicted_proteome.best_ORF_SQANTI_classification.tsv')
13991399

14001400
if os.path.exists(output_dir):
14011401
if not os.path.isdir(output_dir):
@@ -1570,4 +1570,4 @@ def write_output(outfile, feature_map, feature_name):
15701570
os.remove(args.cds_annotation_gtf)
15711571
os.remove(os.path.join(args.output_dir, "refAnnotation_" + args.output_prefix + ".genePred"))
15721572
os.remove(os.path.splitext(args.isoform_gff)[0] + ".genePred")
1573-
os.remove(os.path.splitext(args.cds_isoform_gff)[0] + ".genePred")
1573+
os.remove(os.path.splitext(args.cds_isoform_gff)[0] + ".genePred")

bin/05_lr_leafcutter.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ subisoform = by_isoform %>%
371371
mutate(subisoform_id = paste0(chr, ":", intron_starts, ":", intron_ends, ":clu_", cluster_idx)) %>%
372372
relocate((subisoform_id))
373373

374-
write_tsv(subisoform, file = file.path(leafcutter_analysis_dir, paste0(basename, "_lr_leafcutter_subisoform_clusters.txt")), col_names = TRUE)
374+
write_tsv(subisoform, file = file.path(leafcutter_analysis_dir, paste0(basename, ".lr_leafcutter.subisoform_clusters.txt")), col_names = TRUE)
375375

376376
# write out space delimited file with row names for diff splicing script
377377
out = subisoform %>%
@@ -380,7 +380,7 @@ out = subisoform %>%
380380
column_to_rownames("subisoform_id")
381381

382382
write.table(out,
383-
file = gzfile(file.path(leafcutter_analysis_dir, paste0(basename, "_lr_leafcutter_perind_numers.counts.gz"))),
383+
file = gzfile(file.path(leafcutter_analysis_dir, paste0(basename, ".lr_leafcutter.perind_numers.counts.gz"))),
384384
sep = " ",
385385
row.names = TRUE,
386386
col.names = TRUE,
@@ -398,4 +398,4 @@ if ("condition" %in% colnames(sample_metadata) && !("group" %in% colnames(sample
398398
}
399399

400400
sample_metadata %>% select(name, group) %>%
401-
write_tsv(file.path(leafcutter_analysis_dir, paste0(basename, "_groups_file.txt")), col_names = FALSE)
401+
write_tsv(file.path(leafcutter_analysis_dir, paste0(basename, ".lr_leafcutter.groups_file.txt")), col_names = FALSE)

bin/build_mass_spec_reference.R

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414
#' always retained in full.
1515
#'
1616
#' Outputs:
17-
#' - {prefix}_reference.fasta Standardized headers: transcript_id|gene_id|pclass|status|reference_type
18-
#' - {prefix}_reference.tsv Matching metadata table
17+
#' - {prefix}.proteomics.reference.fasta Standardized headers: transcript_id|gene_id|pclass|status|reference_type
18+
#' - {prefix}.proteomics.reference.tsv Matching metadata table
1919
#'
2020
#' Usage:
2121
#' Rscript build_mass_spec_reference.R --sample_name sample --lrp_fasta lrp_orfs.fasta --gencode_version 47 --counts counts.tsv --outdir results/
@@ -276,7 +276,7 @@ out_prefix = paste(prefix_parts, collapse = ".")
276276
cat("\nOutput prefix:", out_prefix, "\n")
277277

278278
# Combined reference table (without sequences)
279-
write_tsv(select(combined_ref, -sequence), file.path(opt$outdir, paste0(out_prefix, "_reference.tsv")))
279+
write_tsv(select(combined_ref, -sequence), file.path(opt$outdir, paste0(out_prefix, ".proteomics.reference.tsv")))
280280

281281
# Combined FASTA with standardized headers: transcript_id|gene_id|pclass|status|reference_type
282282
combined_ref %<>%
@@ -285,7 +285,7 @@ combined_ref %<>%
285285
fasta_out = AAStringSet(combined_ref$sequence)
286286
names(fasta_out) = combined_ref$new_header
287287

288-
writeXStringSet(fasta_out, file.path(opt$outdir, paste0(out_prefix, "_reference.fasta")))
288+
writeXStringSet(fasta_out, file.path(opt$outdir, paste0(out_prefix, ".proteomics.reference.fasta")))
289289

290290
cat("\n--- Summary ---\n")
291291
cat("Combined reference:", nrow(combined_ref), "total entries\n")
-33 Bytes
Binary file not shown.
-33 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)