Skip to content

Commit a49fcf8

Browse files
authored
Merge pull request #415 from sanger-tol/dp24_fofn_function
Dp24 fofn function
2 parents 9f4e17a + 6a73f51 commit a49fcf8

25 files changed

+277
-121
lines changed

CHANGELOG.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,21 @@ Our 7th release for sanger-tol/treeval.
1616
- Addition of `run_hires` flag (boolean) to control use of the hires pretext modules.
1717
- Removal of unused modules such as `avgcov`, the average coverage module.
1818
- Addition of the `mode` parameter, to replace the deprecated `-entry` flag.
19+
- Mode lists are include lists, listing the processes needed per run.
1920
- Removing entry points has significantly simplified the pipeline; `mode` now controls a set of `steps` for pipeline execution.
2021
- Addition of specified reads.
2122
- Longreads and HiC are now specified in the input yaml file. Details in the usage document.
22-
- Longread and HiC data can optionally take a fofn (file of file names) where each line contains one file.
23-
- Corrected input values by @DLBPointon
23+
- Longread and HiC data can optionally take a fofn (file of file names) where each line contains one file; see `YAML_INPUT` function `fn_get_validated_channel`.
24+
- NOTE: in cases where the same file is in both the fofn and a supplied list, the pipeline will exit with an error which will be detailed in the nextflow.log file.
25+
- Corrected input values.
2426
- SummaryStats Code has been removed, this will be replaced by a plugin in the future!
2527
- Replace 5 modules with GAWK instead of cat | sed pattern modules.
2628
- Simplification of the `YAML_INPUT` subworkflow.
2729
- Removal of `GrabFiles` and replacement with the newer `.resolve()`.
2830
- Cleaning up of `it -> it[0]` into the more correct `_meta, file -> file` pattern.
31+
- Moved GAWK commands into their own awk script files; this cleans up modules.config.
32+
- Added the `my_abs` function to `GAWK_REFORMAT_INTERSECT` to calculate the difference between start and end.
33+
- Added `[TreeVal:Error]` and `[TreeVal:Info]` into intentional print statements to make it easier to see on the CLI and to search for in logs.
2934

3035
### Parameters
3136

assets/github_testing/TreeValTinyFullTest.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ hic_data:
2020
kmer_profile:
2121
# kmer_length will act as input for kmer_read_cov fastk and as the name of folder in profile_dir
2222
kmer_length: 31
23-
dir: /home/runner/work/treeval/treeval/TreeValTinyData/
23+
profile: /home/runner/work/treeval/treeval/TreeValTinyData/
2424
alignment:
2525
genesets:
2626
- /home/runner/work/treeval/treeval/TreeValTinyData/gene_alignment_data/fungi/csv_data/LaetiporusSulphureus.gfLaeSulp1-data.csv
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
assembly:
2+
assem_level: scaffold
3+
assem_version: 1
4+
sample_id: grTriPseu1
5+
latin_name: to_provide_taxonomic_rank
6+
defined_class: nematode
7+
project_id: DTOL
8+
reference_file: /nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/TreeValTinyData/assembly/draft/grTriPseu1.fa
9+
map_order: length
10+
assem_reads:
11+
read_type: hifi
12+
read_data:
13+
- /nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/assets/local_testing/grTriPseu1_reads.fofn
14+
supplementary_data: path
15+
hic_data:
16+
hic_cram:
17+
- /nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/assets/local_testing/grTriPseu1_hic.fofn
18+
hic_aligner: minimap2
19+
kmer_profile:
20+
# kmer_length will act as input for kmer_read_cov fastk and as the name of folder in profile_dir
21+
kmer_length: 31
22+
profile: /lustre/scratch122/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/pacbio/
23+
alignment:
24+
genesets:
25+
- /nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/TreeValTinyData/gene_alignment_data/fungi/csv_data/LaetiporusSulphureus.gfLaeSulp1-data.csv
26+
self_comp:
27+
motif_len: 0
28+
intron:
29+
size: "50k"
30+
telomere:
31+
teloseq: TTAGGG
32+
synteny:
33+
- /nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/TreeValTinyData/synteny/fungi/LaetiporusSulphureus.fasta
34+
busco:
35+
lineages_path: /nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/TreeValTinyData/busco/subset/
36+
lineage: fungi_odb10
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/TreeValTinyData/genomic_data/hic-arima/SUBSET-1000.cram
2+
/nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/TreeValTinyData/genomic_data/hic-arima/SUBSET-2000.cram
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
/nfs/treeoflife-01/teams/tola/users/dp24/new/new-new/treeval/TreeValTinyData/genomic_data/pacbio/seqkitPacbio50000.fasta.gz

assets/local_testing/nxOscDF5033.yaml

Lines changed: 0 additions & 39 deletions
This file was deleted.

assets/local_testing/nxOscDF5033_hic.fofn

Lines changed: 0 additions & 2 deletions
This file was deleted.

assets/local_testing/nxOscDF5033_reads.fofn

Lines changed: 0 additions & 2 deletions
This file was deleted.

assets/local_testing/nxOscSUBSET.yaml

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,23 +5,24 @@ assembly:
55
latin_name: to_provide_taxonomic_rank
66
defined_class: nematode
77
project_id: DTOL
8-
reference_file: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_SUBSET/assembly/draft/SUBSET_genome/Oscheius_SUBSET.fasta
8+
reference_file: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/assembly/draft/SUBSET_genome/Oscheius_SUBSET.fasta
99
map_order: length
1010
assem_reads:
1111
read_type: hifi
1212
read_data:
13-
- /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_SUBSET/genomic_data/pacbio/subset.fasta.gz
13+
- /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/genomic_data/pacbio/subset.fasta.gz
1414
supplementary_data: path
1515
hic_data:
16-
hic_cram: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/hic-arima2/subset/
17-
hic_aligner: minimap2
16+
hic_cram:
17+
- /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/genomic_data/hic-arima2/SUBSET-1000.cram
18+
- /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/genomic_data/hic-arima2/SUBSET-2000.cram
1819
kmer_profile:
1920
# kmer_length will act as input for kmer_read_cov fastk and as the name of folder in profile_dir
2021
kmer_length: 31
21-
dir: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/Oscheius_DF5033/genomic_data/nxOscSpes1/pacbio/
22+
dir: /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/genomic_data/nxOscSpes1/pacbio/
2223
alignment:
2324
genesets:
24-
- /lustre/scratch123/tol/resources/treeval/gene_alignment_data/nematode/csv_data/OscheiusTipulae.ASM1342590v1-data.csv
25+
- /lustre/scratch123/tol/resources/treeval/treeval-testdata/TreeValSmallData/gene_set/nematode/csv_data/CaenorhabditisElegans.WBcel235-data.csv
2526
self_comp:
2627
motif_len: 0
2728
intron:

assets/schema_input.json

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,11 +48,15 @@
4848
"properties": {
4949
"read_type": {
5050
"type": "string",
51-
"errorMessage": "Long read data type"
51+
"errorMessage": "Long read data type",
52+
"enum": ["hifi", "clr", "ont", "illumina"]
5253
},
5354
"read_data": {
5455
"type": "array",
55-
"errorMessage": "Paths to folder containing fasta.gz files or FOFN of files"
56+
"errorMessage": "Paths to fasta.gz files and/or FOFN of fasta.gz files",
57+
"pattern": "(^\\S+\\.(fa|fasta|fna).gz)$|(^\\S+\\.fofn)$",
58+
"format": "file-path",
59+
"exists": true
5660
},
5761
"supplementary_data": {
5862
"type": "string",
@@ -65,11 +69,15 @@
6569
"properties": {
6670
"hic_cram": {
6771
"type": "array",
68-
"errorMessage": "Paths to folder containing cram files of FOFN of cram"
72+
"errorMessage": "Paths to an array of cram files and/or FOFN of cram files",
73+
"pattern": "(^\\S+\\.cram)$|(^\\S+\\.fofn)$",
74+
"format": "file-path",
75+
"exists": true
6976
},
7077
"hic_aligner": {
7178
"type": "string",
72-
"errorMessage": "HiC Aligner - minimap2"
79+
"errorMessage": "HiC Aligner - minimap2 (default), bwamem2",
80+
"enum": ["minimap2", "bwamem2"]
7381
}
7482
}
7583
},
@@ -80,9 +88,9 @@
8088
"type": "integer",
8189
"errorMessage": "K-mer length"
8290
},
83-
"dir": {
91+
"profile": {
8492
"type": "string",
85-
"errorMessage": "Path to folder containing fasta.gz files"
93+
"errorMessage": "Path to *.hist file"
8694
}
8795
}
8896
},
@@ -91,7 +99,8 @@
9199
"properties": {
92100
"genesets": {
93101
"type": "array",
94-
"errorMessage": "Path for gene alignment file."
102+
"errorMessage": "Path for gene alignment file.",
103+
"pattern": "^\\S+\\.csv$"
95104
}
96105
}
97106
},
@@ -123,7 +132,8 @@
123132
}
124133
},
125134
"synteny": {
126-
"type": "array"
135+
"type": "array",
136+
"pattern": "^\\S+\\.(fa|fasta|fna)$"
127137
},
128138
"busco": {
129139
"type": "object",

0 commit comments

Comments
 (0)