refresh-bio
diff --git a/‎.github/workflows/large.yml‎
Lines changed: 115 additions & 4 deletions b/‎.github/workflows/large.yml‎
Lines changed: 115 additions & 4 deletions
diff --git a/‎3rd_party/ref-utils‎ b/‎3rd_party/ref-utils‎
diff --git a/‎README.md‎
Lines changed: 4 additions & 2 deletions b/‎README.md‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎example/README.txt‎
Lines changed: 11 additions & 0 deletions b/‎example/README.txt‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎example/datasets/README.txt‎
Lines changed: 13 additions & 0 deletions b/‎example/datasets/README.txt‎
Lines changed: 13 additions & 0 deletions
@@ -4,7 +4,118 @@ on:
   workflow_dispatch:
 
 jobs:
-  dummy:
-    name: dummy
-    runs-on: echo
-  
+
+ ########################################################################################
+  # Prepares the workspace on the self-hosted runner: removes the previous
+  # contents, checks out the repository with all submodules, and fetches the full
+  # history of the nested igraph submodule so that its tags can be listed.
+  checkout:
+    name: Checkout
+    runs-on: [self-hosted, vclust, x64_linux, large]
+
+    steps:
+    - name: clean
+      # NOTE(review): the `*` glob does not match dot-files, so hidden entries
+      # in the workspace are left in place - confirm this is intended.
+      run: rm -rf ${{ github.workspace }}/*
+    - uses: actions/checkout@v4
+      with:
+        submodules: recursive
+    - name: Get tags
+      # Best effort: a failure here is reported but does not fail the job.
+      continue-on-error: true
+      run: |
+        cd ./3rd_party/clusty/libs/igraph
+        # The default shell runs with `-e`, so a failing fetch would end the
+        # script before the exit code is printed. Capture the status first, then
+        # report it and still list whatever tags are available.
+        fetch_status=0
+        git fetch --prune --unshallow || fetch_status=$?
+        echo "exit code ${fetch_status}"
+        git tag --list
+        # Propagate the fetch result so the step outcome is unchanged.
+        exit "${fetch_status}"
+
+ ########################################################################################
+  # Despite its name, this job currently compiles the project with gmake and then
+  # runs `vclust.py info`; the release-download steps are kept below, disabled.
+  # NOTE(review): there is no checkout step here, so the job presumably relies on
+  # the workspace left behind by the `checkout` job on the same runner - confirm.
+  download-release:
+    name: Download release
+    runs-on: [self-hosted, vclust, x64_linux, large]
+    needs: checkout
+    strategy:
+      matrix:
+        # Version suffix appended to both g++- and gcc- in the make step.
+        compiler:
+        - 14
+
+    steps:
+    # Disabled release-download steps, retained for reference.
+#    - name: clean
+#      run: rm -rf ${{ github.workspace }}/*
+#    - uses: robinraju/[email protected]
+#      with:
+#        latest: true
+#        tarBall: true
+#        extract: true
+#        token: ${{ secrets.MY_TOKEN }}
+#    - name: download
+#      run: ./.github/workflows/github-release-downloader.sh refresh-bio vclust-dev "x64_linux.tar.gz"
+    - name: make
+      run: gmake -j32 CXX=g++-${{ matrix.compiler }} CC=gcc-${{ matrix.compiler }} PLATFORM=avx2 LEIDEN=true STATIC_LINK=true
+    - name: print info
+      run: python3 vclust.py info
+
+ ######################################################################################## 
+  # Runs the prefilter and align stages of vclust.py for every dataset and prints
+  # MD5 checksums of the files each stage writes to TEMP_DIR.
+  ani:
+    name: ANI calculation
+    needs: download-release
+    runs-on: [self-hosted, vclust, x64_linux, large]
+    strategy:
+      # Let the remaining datasets finish even if one of them fails.
+      fail-fast: false
+      matrix:
+        # Every dataset configured under `include` is listed here, matching the
+        # dataset list of the `clustering` job, so that each include entry
+        # extends an existing combination instead of implicitly creating one.
+        dataset: [ICTV, IMGVR, IMGVR_HQ]
+        include:
+        # Per-dataset settings: output variant name plus the extra arguments
+        # passed to the prefilter and align commands.
+        - dataset: ICTV
+          variant_name: full
+          prefilter_args: '-k 25 --min-ident 0.7 --min-kmers 20'
+          align_args: '--out-tani 0.70'
+        - dataset: IMGVR_HQ
+          variant_name: full
+          prefilter_args: '-k 25 --min-ident 0.95 --min-kmers 20 --batch-size 1000000'
+          align_args: '--out-ani 0.95 --out-qcov 0.85'
+        - dataset: IMGVR
+          variant_name: fraction_02
+          prefilter_args: '-k 25 --min-ident 0.95 --min-kmers 4 --kmers-fraction 0.2 --batch-size 2000000'
+          align_args: '--out-ani 0.95 --out-qcov 0.85'
+    env:
+      # Relative paths to the input and scratch directories.
+      # NOTE(review): presumably resolved from the job's working directory on
+      # the runner - confirm.
+      INPUT_DIR: ../../../../vclust/input
+      TEMP_DIR: ../../../../vclust/temp
+
+    steps:
+    - name: prefilter
+      run: /usr/bin/time -v ./vclust.py prefilter -t 32 -i ${INPUT_DIR}/${{ matrix.dataset }}.fna.gz -o ${TEMP_DIR}/${{ matrix.dataset }}.${{ matrix.variant_name }}.filter ${{ matrix.prefilter_args }}
+    - name: prefilter md5
+      run: md5sum ${TEMP_DIR}/${{ matrix.dataset }}.${{ matrix.variant_name }}.filter
+    - name: align
+      run: /usr/bin/time -v ./vclust.py align -t 32 -i ${INPUT_DIR}/${{ matrix.dataset }}.fna.gz -o ${TEMP_DIR}/${{ matrix.dataset }}.${{ matrix.variant_name }}.ani.tsv --filter ${TEMP_DIR}/${{ matrix.dataset }}.${{ matrix.variant_name }}.filter ${{ matrix.align_args }}
+    - name: align md5
+      run: md5sum ${TEMP_DIR}/${{ matrix.dataset }}.${{ matrix.variant_name }}.ani.tsv ${TEMP_DIR}/${{ matrix.dataset }}.${{ matrix.variant_name }}.ani.ids.tsv
+ 
+ ######################################################################################## 
+  # Clusters the ANI results written by the `ani` job: one matrix job per
+  # (dataset, algorithm) pair, each followed by an MD5 checksum of its output.
+  clustering:
+    name: clustering
+    needs: ani
+    runs-on: [self-hosted, vclust, x64_linux, large]
+    env:
+      INPUT_DIR: ../../../../vclust/input
+      TEMP_DIR: ../../../../vclust/temp
+    strategy:
+      fail-fast: false
+      matrix:
+        dataset: [ICTV, IMGVR, IMGVR_HQ]
+        algo_name: [single, complete, set-cover, uclust, cd-hit, leiden_07, leiden_10]
+        include:
+        # Per-dataset settings: which ANI variant to read and the thresholds
+        # passed to the cluster command.
+        - dataset: ICTV
+          variant_name: full
+          args: '--metric tani --tani 0.95'
+        - dataset: IMGVR
+          variant_name: fraction_02
+          args: '--metric ani --ani 0.95 --qcov 0.85'
+        - dataset: IMGVR_HQ
+          variant_name: full
+          args: '--metric ani --ani 0.95 --qcov 0.85'
+        # Per-algorithm settings: algo_name is used in the output file name,
+        # algo_cmd is the value given to --algorithm (plus extra options).
+        - algo_name: single
+          algo_cmd: single
+        - algo_name: complete
+          algo_cmd: complete
+        - algo_name: set-cover
+          algo_cmd: set-cover
+        - algo_name: uclust
+          algo_cmd: uclust
+        - algo_name: cd-hit
+          algo_cmd: cd-hit
+        - algo_name: leiden_07
+          algo_cmd: 'leiden --leiden-resolution 0.7'
+        - algo_name: leiden_10
+          algo_cmd: 'leiden --leiden-resolution 1.0'
+
+    steps:
+    - name: cluster
+      run: /usr/bin/time -v ./vclust.py cluster -i ${TEMP_DIR}/${{ matrix.dataset }}.${{ matrix.variant_name }}.ani.tsv --ids ${TEMP_DIR}/${{ matrix.dataset }}.${{ matrix.variant_name }}.ani.ids.tsv -o ${TEMP_DIR}/${{ matrix.dataset }}.${{ matrix.variant_name }}.${{ matrix.algo_name }}.clusty --algorithm ${{ matrix.algo_cmd }} ${{ matrix.args }}
+    - name: md5
+      run: md5sum ${TEMP_DIR}/${{ matrix.dataset }}.${{ matrix.variant_name }}.${{ matrix.algo_name }}.clusty
+        
@@ -80,15 +80,17 @@ The Vclust documentation is available on the [GitHub Wiki](https://github.com/re
    2. Prefilter
    3. Align
    4. Cluster
+   5. Deduplicate
 5. [Optimizing sensitivity and resource usage](https://github.com/refresh-bio/vclust/wiki/5-Optimizing-sensitivity-and-resource-usage)
 6. [Use cases](https://github.com/refresh-bio/vclust/wiki/6-Use-cases)
    1. Classify viruses into species and genera following ICTV standards
    2. Assign viral contigs into vOTUs following MIUViG standards
    3. Dereplicate viral contigs into representative genomes
-   4. Calculate pairwise similarities between all-versus-all genomes
-   5. Process large dataset of diverse virus genomes (IMG/VR)
+   4. Process large dataset of diverse virus genomes (IMG/VR)
+   5. Deduplicate (remove duplicate sequences) between and within multiple datasets
    6. Process large dataset of highly redundant virus genomes
    7. Cluster plasmid genomes into pOTUs
+   8. Calculate pairwise similarities between all-versus-all genomes
 7. [FAQ: Frequently Asked Questions](https://github.com/refresh-bio/vclust/wiki/7-FAQ:-Frequently-Asked-Questions)
 
 
 
@@ -0,0 +1,11 @@
+This dataset comprises bacteriophage genome sequences with simulated mutations relative to the reference sequence. Mutations include substitutions (sn), deletions (del), insertions (ins), duplications (dup), inversions (inv), and translocations (tl). These modified sequences (.alt*) have known true total ANI (tANI) values compared to the reference.
+
+ref_id      alt_id          ref_len alt_len tani    alt_summary
+NC_010807   NC_010807.alt1  38815   38815   0.99753 sn;inv;tl
+NC_010807   NC_010807.alt2  38815   40555   0.98985 sn;dup
+NC_010807   NC_010807.alt3  38815   39891   0.98414 sn;ins;tl
+NC_005091   NC_005091.alt1  57455   57455   0.97161 sn;inv;tl
+NC_005091   NC_005091.alt2  57455   63696   0.96707 sn;dup;tl
+NC_025457   NC_025457.alt1  42654   41066   0.80607 sn;del;ins;dup;inv
+NC_025457   NC_025457.alt2  42654   64164   0.75921 sn;del;ins;dup;inv;tl
+NC_002486   NC_002486.alt   45636   45636   1.00000 tl
@@ -0,0 +1,13 @@
+Duplicate sequences (identical sequences, including reverse complements)
+
+refseq.fna      genbank.fna    other
+NC_002486.1  =  AB044554.1     
+NC_005091.2  =  AY357582.2  =  AY357582.2_duplicate
+NC_010807.1  =  EU547803.1  =  NC_010807.1_duplicate
+NC_025457.1  =  KJ473423.1     
+                MN428048.1  =  MN428048.1_revcomp
+                MK937595.1     
+                               Mushuvirus = Mushuvirus_copy
+
+
+7 unique sequences