sheynkman-lab
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎bin/00_generate_hashids.R‎
Lines changed: 3 additions & 3 deletions b/‎bin/00_generate_hashids.R‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎bin/02_filter_sqanti_transcripts.R‎
Lines changed: 7 additions & 7 deletions b/‎bin/02_filter_sqanti_transcripts.R‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎bin/03_filter_cpat.R‎
Lines changed: 5 additions & 5 deletions b/‎bin/03_filter_cpat.R‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎bin/04_protein_classification.R‎
Lines changed: 11 additions & 11 deletions b/‎bin/04_protein_classification.R‎
Lines changed: 11 additions & 11 deletions
diff --git a/‎bin/04_sqanti3_protein.py‎
Lines changed: 2 additions & 2 deletions b/‎bin/04_sqanti3_protein.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎bin/05_lr_leafcutter.R‎
Lines changed: 3 additions & 3 deletions b/‎bin/05_lr_leafcutter.R‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎bin/build_mass_spec_reference.R‎
Lines changed: 4 additions & 4 deletions b/‎bin/build_mass_spec_reference.R‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎bin/leafcutter/__pycache__/__init__.cpython-311.pyc‎
-33 Bytes b/‎bin/leafcutter/__pycache__/__init__.cpython-311.pyc‎
-33 Bytes
diff --git a/‎bin/leafcutter/__pycache__/utils.cpython-311.pyc‎
-33 Bytes b/‎bin/leafcutter/__pycache__/utils.cpython-311.pyc‎
-33 Bytes
@@ -62,6 +62,7 @@ rsconnect/
 log_files/*
 
 results/*
+fragpipe_tools/*
 
 jurkat_test_results/*
 <<<<<<< HEAD
 
@@ -17,7 +17,7 @@
 #' - SQANTI classification file
 #' 
 #' Outputs:
-#' - *_hashids_mapping.txt
+#' - *.transcriptome.hashids_mapping.txt
 #' 
 
 # =============================================================================
@@ -184,7 +184,7 @@ extract_junction_hash = function(hash_id) {
 
 cat("\nSTEP 1: Generating hash ids for all transcripts")
 
-psl          = file.path(output_dir, paste0(basename, "_corrected.psl")) # output psl
+psl          = file.path(output_dir, paste0(basename, ".transcriptome.psl")) # output psl
 hashid_file  = file.path(output_dir, paste0(basename, "_hashids_raw.txt"))
 
 convert_gtf_to_psl(gtf_input_path  = sample_gtf, 
@@ -355,7 +355,7 @@ output_mapping %<>%
          any_of("transcript_name"),
          structural_category)
 
-mapping_output = file.path(output_dir, paste0(basename, "_hashids_mapping.txt"))
+mapping_output = file.path(output_dir, paste0(basename, ".transcriptome.hashids_mapping.txt"))
 write_tsv(output_mapping, mapping_output)
 
 n_total = nrow(output_mapping)
 
@@ -375,8 +375,8 @@ cat("\nWriting output files")
 # Build lookup from new isoform_id to original transcript_id
 id_lookup = mapping %>% select(isoform_id, original_transcript_id)
 
-write_tsv(sqanti_df_full, file.path(output_dir, paste0(basename, ".transcriptome.classification_all.txt")))
-write_tsv(sqanti_df, file.path(output_dir, paste0(basename, ".transcriptome.classification_filtered.txt")))
+#write_tsv(sqanti_df_full, file.path(output_dir, paste0(basename, ".transcriptome.classification_all.txt")))
+write_tsv(sqanti_df, file.path(output_dir, paste0(basename, ".transcriptome.filtered_SQANTI_classification.txt")))
 
 # FASTA — filter to kept transcripts and rename headers to new isoform_id
 sequences = readDNAStringSet(sqanti_fasta)
@@ -406,8 +406,8 @@ if (any(is.na(names(filtered_sequences)))) {
   stop("ERROR: Found NA values in sequence names after mapping. This should not happen.")
 }
 
-writeXStringSet(filtered_sequences, file.path(output_dir, paste0(basename, ".transcriptome.corrected_filtered.fasta")))
-cat("\nSaved ", length(filtered_sequences), " sequences to ", basename, ".transcriptome.corrected_filtered.fasta")
+writeXStringSet(filtered_sequences, file.path(output_dir, paste0(basename, ".transcriptome.filtered.fasta")))
+cat("\nSaved", length(filtered_sequences), "sequences to", paste0(basename, ".transcriptome.filtered.fasta"))  # paste0(): cat() separates its arguments with a space, which would split basename from the suffix
 
 
 # sample gtf
@@ -438,17 +438,17 @@ filtered_gtf %<>%
   mutate(name = paste0(transcript_id, "|", avg_ratio))
 
 gr_updated = makeGRangesFromDataFrame(filtered_gtf, keep.extra.columns = TRUE)
-gtf_output = file.path(output_dir, paste0(basename, ".transcriptome.corrected_filtered.gtf"))
+gtf_output = file.path(output_dir, paste0(basename, ".transcriptome.filtered.gtf"))
 export(gr_updated, gtf_output, format = "gtf")
 
 # convert to bed12
 gtf_to_bed12(gtf_path = gtf_output,
-             output_bed = file.path(output_dir, paste0(basename, ".transcriptome.corrected_filtered.bed")),
+             output_bed = file.path(output_dir, paste0(basename, ".transcriptome.filtered.bed")),
              color_by = "avg_ratio")
 
 # Filtered hashid and cpm table
 all_ids %>% filter(isoform_id %in% kept_ids) %>%
-  write_tsv(file.path(output_dir, paste0(basename, ".transcriptome.hashids_with_cpm_filtered.txt")))
+  write_tsv(file.path(output_dir, paste0(basename, ".transcriptome.filtered_hashids_with_cpm.txt")))
 
 # Print summary
 cat("\n=== FILTERING SUMMARY ===\n")
 
@@ -16,8 +16,8 @@
 #' - Hashids mapping file (for restoring correct ID casing)
 #'
 #' Outputs:
-#' - *.predicted_proteome.all_orfs_mapped.tsv: all ORFs with genomic coordinates and quality scores
-#' - *.predicted_proteome.corrected_filtered_CDS.gtf: GTF with exon and CDS features for best ORF per transcript
+#' - *.predicted_proteome.CPAT_ORFs_mapped.tsv: all ORFs with genomic coordinates and quality scores
+#' - *.predicted_proteome.best_ORF.gtf: GTF with exon and CDS features for best ORF per transcript
 #' 
 
 # =============================================================================
@@ -353,7 +353,7 @@ plausible_orfs = mapped_orfs_classified %>% filter(orf_quality == "Clear Best OR
 best_orfs      = mapped_orfs_classified %>% filter(orf_quality == "Clear Best ORF")
 
 #write_tsv(best_orfs, file.path(cpat_dir, paste0(basename, "_best_orfs_mapped.tsv")))
-write_tsv(all_orfs, file.path(cpat_dir, paste0(basename, ".predicted_proteome.all_orfs_mapped.tsv")))
+write_tsv(all_orfs, file.path(cpat_dir, paste0(basename, ".predicted_proteome.CPAT_ORFs_mapped.tsv")))
 
 # === STEP 5: Write gtf for best ORFs, including CDS and exon types, no collapsing here ===
 cat("\nSTEP 5: Writing GTF of best ORFs with CDS and exon types")
@@ -366,7 +366,7 @@ all_cds_exons %<>%
   left_join(distinct(sample_gtf, transcript_id, gene_id), by = c("isoform_id" = "transcript_id"))
 updated_gtf = write_gtf_with_cds(sample_gtf, all_cds_exons)
 
-write.table(updated_gtf, file.path(cpat_dir, paste0(basename, ".predicted_proteome.corrected_filtered_CDS.gtf")), 
+write.table(updated_gtf, file.path(cpat_dir, paste0(basename, ".predicted_proteome.best_ORF.gtf")), 
             sep = "\t", 
             quote = FALSE, 
             row.names = FALSE, 
@@ -400,4 +400,4 @@ cat(paste0("- After further filtering, a 'Clear Best ORF' was identified for ",
 cat("\n=== CPAT FILTER OUTPUT FILES ===\n")
-cat(paste0("All CPAT ORFs with Quality Metrics: ", basename, ".predicted_proteome.all_cpat_orfs_mapped.tsv\n"))
+cat(paste0("All CPAT ORFs with Quality Metrics: ", basename, ".predicted_proteome.CPAT_ORFs_mapped.tsv\n"))  # keep in sync with the write_tsv(all_orfs, ...) file name
 #cat(paste0("The single best plausible ORF per transcript: ", basename, "_best_cpat_orfs_mapped.tsv"))
-cat(paste0("GTF contains exon type for all transcripts and CDS type for transcripts with a best plausible ORF: ", basename, ".predicted_proteome.corrected_filtered_CDS.gtf"))
+cat(paste0("GTF contains exon type for all transcripts and CDS type for transcripts with a best plausible ORF: ", basename, ".predicted_proteome.best_ORF.gtf"))
@@ -52,13 +52,13 @@
 #'  
 #' Outputs: 
 #' One row per transcript (not collapsed)
-#'  - *predicted_proteome.protein_classification_all_isoforms.txt
-#'  - *.predicted.proteome.all_best_orfs.fa (for proteome reference building)
+#'  - *.predicted_proteome.best_ORF_summary.txt
+#'  - *.predicted_proteome.best_ORF.fa (for proteome reference building)
 #' 
 #' One row per unique ORF (collapsed)
-#'  - *predicted.proteome.high_confidence_ORF_cpm.txt
-#'  - *predicted.proteome.high_confidence_ORF.gtf
-#'  - *predicted.proteome.high_confidence_ORF.bed 
+#'  - *.predicted_proteome.collapsed_high_confidence_ORF_hashids_with_cpm.txt
+#'  - *.predicted_proteome.collapsed_high_confidence_ORF.gtf
+#'  - *.predicted_proteome.collapsed_high_confidence_ORF.bed
 #'
 
 
@@ -1017,7 +1017,7 @@ full_protein %<>%
                 num_junc_after_stop_codon, num_nt_after_stop_codon, num_5utr_exons, tss_in_gc_exons, utr_cat, 
                 tclass = tx_cat, pclass = protein_classification_base, psubclass = protein_classification_subset, psubclass_short, filter_status)
 
-write_tsv(full_protein, file.path(output_dir, paste0(basename, ".predicted.proteome.protein_classification_all_isoforms.txt")))
+write_tsv(full_protein, file.path(output_dir, paste0(basename, ".predicted_proteome.best_ORF_summary.txt")))
 
 # Generate a matching amino acid fasta for mass spec reference- this will include lower confidence sequences such as NMD
 full = readDNAStringSet(sample_dna_fasta_path) # dna fasta
@@ -1053,7 +1053,7 @@ orf_groups %<>%
 
 protein_seqs        = AAStringSet(orf_groups$orf_aa_sequence)
 names(protein_seqs) = orf_groups$header
-writeXStringSet(protein_seqs, file.path(output_dir, paste0(basename, ".predicted.proteome.all_best_orfs.fa")))
+writeXStringSet(protein_seqs, file.path(output_dir, paste0(basename, ".predicted_proteome.best_ORF.fa")))
 
 # =======================================================================================
 # STEP 7: Write high confidence, ORF centric GTF and count file
@@ -1075,7 +1075,7 @@ hc_collapsed = hc_orf_groups %>%
   left_join(name_map, by = "orf_base_id") %>%
   select(orf_base_id, orf_all_isoform_id, gene_id, reference_gene_name, everything())
 
-write_tsv(hc_collapsed, file.path(output_dir, paste0(basename, ".predicted.proteome.high_confidence_ORF_cpm.txt")))
+write_tsv(hc_collapsed, file.path(output_dir, paste0(basename, ".predicted_proteome.collapsed_high_confidence_ORF_hashids_with_cpm.txt")))
 
 # write a corresponding ORF centric gtf
 gtf = import(sample_cds_gtf_path) %>%
@@ -1119,10 +1119,10 @@ hc_gtf %<>%
 
 # calculate ratio
 gr_updated = makeGRangesFromDataFrame(hc_gtf, keep.extra.columns = TRUE)
-export(gr_updated, file.path(output_dir, paste0(basename, ".predicted.proteome.high_confidence_ORF.gtf")), format = "gtf")
+export(gr_updated, file.path(output_dir, paste0(basename, ".predicted_proteome.collapsed_high_confidence_ORF.gtf")), format = "gtf")
 
 # convert to bed12
-gtf_to_bed12(gtf_path = file.path(output_dir, paste0(basename, ".predicted.proteome.high_confidence_ORF.gtf")),
-             output_bed = file.path(output_dir, paste0(basename, ".predicted.proteome.high_confidence_ORF.bed")),
+gtf_to_bed12(gtf_path = file.path(output_dir, paste0(basename, ".predicted_proteome.collapsed_high_confidence_ORF.gtf")),
+             output_bed = file.path(output_dir, paste0(basename, ".predicted_proteome.collapsed_high_confidence_ORF.bed")),
              color_by = "avg_orf_ratio",
              track_name = paste0(basename, "_predicted_proteome"))
@@ -1395,7 +1395,7 @@ def write_output(outfile, feature_map, feature_name):
 
     output_dir = args.output_dir
     output_prefix = args.output_prefix
-    output_filename = os.path.join(output_dir, output_prefix+'.sqanti_protein_classification.tsv')
+    output_filename = os.path.join(output_dir, output_prefix+'.predicted_proteome.best_ORF_SQANTI_classification.tsv')
 
     if os.path.exists(output_dir):
         if not os.path.isdir(output_dir):
@@ -1570,4 +1570,4 @@ def write_output(outfile, feature_map, feature_name):
 os.remove(args.cds_annotation_gtf)
 os.remove(os.path.join(args.output_dir, "refAnnotation_" + args.output_prefix + ".genePred"))
 os.remove(os.path.splitext(args.isoform_gff)[0] + ".genePred")
-os.remove(os.path.splitext(args.cds_isoform_gff)[0] + ".genePred")
+os.remove(os.path.splitext(args.cds_isoform_gff)[0] + ".genePred")
@@ -371,7 +371,7 @@ subisoform = by_isoform %>%
   mutate(subisoform_id = paste0(chr, ":", intron_starts, ":", intron_ends, ":clu_", cluster_idx)) %>%
   relocate((subisoform_id))
 
-write_tsv(subisoform, file = file.path(leafcutter_analysis_dir, paste0(basename, "_lr_leafcutter_subisoform_clusters.txt")), col_names = TRUE)
+write_tsv(subisoform, file = file.path(leafcutter_analysis_dir, paste0(basename, ".lr_leafcutter.subisoform_clusters.txt")), col_names = TRUE)
 
 # write out space delimited file with row names for diff splicing script
 out = subisoform %>%
@@ -380,7 +380,7 @@ out = subisoform %>%
   column_to_rownames("subisoform_id")
 
 write.table(out, 
-            file = gzfile(file.path(leafcutter_analysis_dir, paste0(basename, "_lr_leafcutter_perind_numers.counts.gz"))),
+            file = gzfile(file.path(leafcutter_analysis_dir, paste0(basename, ".lr_leafcutter.perind_numers.counts.gz"))),
             sep = " ",
             row.names = TRUE,
             col.names = TRUE,
@@ -398,4 +398,4 @@ if ("condition" %in% colnames(sample_metadata) && !("group" %in% colnames(sample
 }
 
 sample_metadata %>% select(name, group) %>%
-  write_tsv(file.path(leafcutter_analysis_dir, paste0(basename, "_groups_file.txt")), col_names = FALSE)
+  write_tsv(file.path(leafcutter_analysis_dir, paste0(basename, ".lr_leafcutter.groups_file.txt")), col_names = FALSE)
@@ -14,8 +14,8 @@
 #' always retained in full.
 #'
 #' Outputs:
-#'   - {prefix}_reference.fasta  Standardized headers: transcript_id|gene_id|pclass|status|reference_type
-#'   - {prefix}_reference.tsv    Matching metadata table
+#'   - {prefix}.proteomics.reference.fasta  Standardized headers: transcript_id|gene_id|pclass|status|reference_type
+#'   - {prefix}.proteomics.reference.tsv    Matching metadata table
 #'
 #' Usage:
 #'   Rscript build_mass_spec_reference.R --sample_name sample --lrp_fasta lrp_orfs.fasta --gencode_version 47 --counts counts.tsv --outdir results/
@@ -276,7 +276,7 @@ out_prefix = paste(prefix_parts, collapse = ".")
 cat("\nOutput prefix:", out_prefix, "\n")
 
 # Combined reference table (without sequences)
-write_tsv(select(combined_ref, -sequence), file.path(opt$outdir, paste0(out_prefix, "_reference.tsv")))
+write_tsv(select(combined_ref, -sequence), file.path(opt$outdir, paste0(out_prefix, ".proteomics.reference.tsv")))
 
 # Combined FASTA with standardized headers: transcript_id|gene_id|pclass|status|reference_type
 combined_ref %<>%
@@ -285,7 +285,7 @@ combined_ref %<>%
 fasta_out        = AAStringSet(combined_ref$sequence)
 names(fasta_out) = combined_ref$new_header
 
-writeXStringSet(fasta_out, file.path(opt$outdir, paste0(out_prefix, "_reference.fasta")))
+writeXStringSet(fasta_out, file.path(opt$outdir, paste0(out_prefix, ".proteomics.reference.fasta")))
 
 cat("\n--- Summary ---\n")
 cat("Combined reference:", nrow(combined_ref), "total entries\n")