diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deprecated/deploy.yaml similarity index 100% rename from .github/workflows/deploy.yaml rename to .github/workflows/deprecated/deploy.yaml diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml deleted file mode 100644 index a9a4dcc..0000000 --- a/.github/workflows/python-app.yml +++ /dev/null @@ -1,36 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a single version of Python -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - -name: diploshic - -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] - -jobs: - build: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.10 - uses: actions/setup-python@v2 - with: - python-version: "3.10" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install flake8 pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest - run: | - pytest diff --git a/.github/workflows/python-build.yml b/.github/workflows/python-build.yml new file mode 100644 index 0000000..aeda45b --- /dev/null +++ b/.github/workflows/python-build.yml @@ -0,0 +1,37 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Build + +on: push + +jobs: + Build_Package: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Setup conda + uses: s-weigand/setup-conda@v1 + with: + activate-conda: true + update-conda: true + python-version: ${{ matrix.python-version }} + conda-channels: conda-forge + - name: Install dependencies + run: | + conda install pip setuptools + pip install --upgrade pip + - name: Install diploSHIC + run: | + pip install . + - name: List installed + run: | + conda list \ No newline at end of file diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 2f80821..3469abf 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -6,12 +6,18 @@ # separate terms of service, privacy policy, and support # documentation. -name: Upload Python Package +name: Publish -on: [push, pull_request] +on: + push: + branches: + - main + - master + tags: + - v* jobs: - manylinux: + Build_Wheel: runs-on: ubuntu-latest steps: - name: Checkout diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0c335f1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +build +diploSHIC.egg-info +work \ No newline at end of file diff --git a/README.md b/README.md index fed2df7..8f888a5 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,6 @@ +[![Build](https://github.com/kr-colab/diploSHIC/actions/workflows/python-build.yml/badge.svg)](https://github.com/kr-colab/diploSHIC/actions/workflows/python-build.yml) +[![PyPI version](https://badge.fury.io/py/diploSHIC.svg)](https://badge.fury.io/py/diploSHIC) + # diploS/HIC This repo contains the implementation for `diploS/HIC` as described in Kern and Schrider (2018; https://doi.org/10.1534/g3.118.200262), along with its associated support scripts. `diploS/HIC` uses a deep convolutional neural network to identify diff --git a/diploshic/__init__.py b/diploshic/__init__.py index 4b4f13d..cb13fc1 100644 --- a/diploshic/__init__.py +++ b/diploshic/__init__.py @@ -1,3 +1,7 @@ from diploshic.fvTools import * from diploshic.msTools import * from diploshic.shicstats import * +from . import network +from . import dataloader +from . import misc +from . import parser \ No newline at end of file diff --git a/diploshic/dataloader.py b/diploshic/dataloader.py new file mode 100644 index 0000000..abc8485 --- /dev/null +++ b/diploshic/dataloader.py @@ -0,0 +1,101 @@ +from keras.utils import Sequence +import numpy as np +import gc + + +def load_fvecs_from_directory(directory, n_subwin=11): + hard = np.loadtxt(directory + "hard.fvec", skiprows=1) + nDims = int(hard.shape[1] / n_subwin) + h1 = np.reshape(hard, (hard.shape[0], nDims, n_subwin)) + neut = np.loadtxt(directory + "neut.fvec", skiprows=1) + n1 = np.reshape(neut, (neut.shape[0], nDims, n_subwin)) + soft = np.loadtxt(directory + "soft.fvec", skiprows=1) + s1 = np.reshape(soft, (soft.shape[0], nDims, n_subwin)) + lsoft = np.loadtxt(directory + "linkedSoft.fvec", skiprows=1) + ls1 = np.reshape(lsoft, (lsoft.shape[0], nDims, n_subwin)) + lhard = np.loadtxt(directory + "linkedHard.fvec", skiprows=1) + lh1 = np.reshape(lhard, (lhard.shape[0], nDims, n_subwin)) + both = np.concatenate((h1, n1, s1, ls1, lh1)) + y = np.concatenate((np.repeat(0, len(h1)), + np.repeat(1, len(n1)), + np.repeat(2, len(s1)), + np.repeat(3, len(ls1)), + np.repeat(4, len(lh1)),)) + return both.reshape(both.shape[0], nDims, n_subwin, 1), y + + +def load_empirical_fvecs_from_directory(directory, n_subwin=11): + nDims = int(emp.shape[1] / n_subwin) + emp = np.loadtxt(directory + "empirical.fvec", skiprows=1) + emp = np.reshape(emp, (emp.shape[0], nDims, n_subwin)) + return emp.reshape(emp, emp.shape[0], nDims, n_subwin, 1) + + +class DADiploSHICDataLoader(Sequence): + def __init__(self, X_src, X_tgt, Y_pred, batch_size): + self.tgt_data = X_tgt + self.src_data = X_src + self.y_pred = Y_pred + + self.batch_size = batch_size + src_size = self.src_data.shape[0] + tgt_size = self.tgt_data.shape[0] + + self.no_batch = int(np.floor(np.minimum(src_size, tgt_size) / self.batch_size)) # model sees training sample at most once per epoch + self.src_pred_idx = np.arange(src_size) + self.src_discr_idx = np.arange(src_size) + self.tgt_discr_idx = np.arange(tgt_size) + + np.random.shuffle(self.src_pred_idx) + np.random.shuffle(self.src_discr_idx) + np.random.shuffle(self.tgt_discr_idx) + + def __len__(self): + return self.no_batch + + def on_epoch_end(self): + np.random.shuffle(self.src_pred_idx) + np.random.shuffle(self.src_discr_idx) + np.random.shuffle(self.tgt_discr_idx) + gc.collect() + + def __getitem__(self, idx): + pred_batch_idx = self.src_pred_idx[idx*self.batch_size:(idx+1)*self.batch_size] + discrSrc_batch_idx = self.src_discr_idx[idx*(self.batch_size//2):(idx+1)*(self.batch_size//2)] + discrTgt_batch_idx = self.tgt_discr_idx[idx*(self.batch_size//2):(idx+1)*(self.batch_size//2)] + batch_X = np.concatenate((self.src_data[pred_batch_idx], + self.src_data[discrSrc_batch_idx], + self.tgt_data[discrTgt_batch_idx])) + batch_Y_pred = np.concatenate((self.y_pred[pred_batch_idx], + -1*np.ones((len(discrSrc_batch_idx), self.y_pred.shape[1])), + -1*np.ones((len(discrTgt_batch_idx), self.y_pred.shape[1])))) + batch_Y_discr = np.concatenate((-1*np.ones(len(pred_batch_idx)), + np.zeros(len(discrSrc_batch_idx)), + np.ones(len(discrTgt_batch_idx)))) + assert batch_X.shape[0] == self.batch_size*2, (batch_X.shape, self.batch_size*2) + assert batch_Y_pred.shape[0] == batch_Y_discr.shape[0], (batch_Y_pred.shape, batch_Y_discr.shape) + return batch_X, {"predictor":batch_Y_pred, "discriminator":batch_Y_discr} + + +class DiploSHICDataLoader(Sequence): + def __init__(self, X_src, Y_pred, batch_size): + self.data = X_src + self.y_pred = Y_pred + self.batch_size = batch_size + size = self.data.shape[0] + self.no_batch = int(np.floor(size/ self.batch_size)) + self.pred_idx = np.arange(size) + np.random.shuffle(self.pred_idx) + + def __len__(self): + return self.no_batch + + def on_epoch_end(self): + np.random.shuffle(self.pred_idx) + gc.collect() + + def __getitem__(self, idx): + pred_batch_idx = self.pred_idx[idx*self.batch_size:(idx+1)*self.batch_size] + batch_X = self.data[pred_batch_idx] + batch_Y_pred = self.y_pred[pred_batch_idx] + return batch_X, batch_Y_pred diff --git a/diploshic/diploSHIC b/diploshic/diploSHIC index f0134bc..918567e 100644 --- a/diploshic/diploSHIC +++ b/diploshic/diploSHIC @@ -1,6 +1,13 @@ #!/usr/bin/env python +import os, json +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' -import argparse, time, sys, subprocess +import time, sys, subprocess +from diploshic.parser import load_arg_dict + +TEST_FRAC = 0.1 +VALID_FRAC = 0.5 +BATCH_SIZE = 100 pyExec = sys.executable print(pyExec) @@ -8,538 +15,199 @@ if "/" in sys.argv[0]: diploShicDir = "/".join(sys.argv[0].split("/")[:-1]) + "/" else: diploShicDir = "" + +argsDict = load_arg_dict() -################# use argparse to get CL args and make sure we are kosher -parser = argparse.ArgumentParser( - description="calculate feature vectors, train, or predict with diploSHIC" -) -parser._positionals.title = "possible modes (enter 'python diploSHIC.py modeName -h' for modeName's help message" -subparsers = parser.add_subparsers(help="sub-command help") -parser_c = subparsers.add_parser( - "fvecSim", help="Generate feature vectors from simulated data" -) -parser_e = subparsers.add_parser( - "makeTrainingSets", - help="Combine feature vectors from muliple fvecSim runs into 5 balanced training sets", -) -parser_a = subparsers.add_parser("train", help="train and test a shic CNN") -parser_d = subparsers.add_parser( - "fvecVcf", help="Generate feature vectors from data in a VCF file" -) -parser_b = subparsers.add_parser( - "predict", help="perform prediction using an already-trained SHIC CNN" -) -# parser_a.add_argument('nDims', metavar='nDims', type=int, -# help='dimensionality of the feature vector') -parser_a.add_argument("trainDir", help="path to training set files") -parser_a.add_argument( - "testDir", help="path to test set files, can be same as trainDir" -) -parser_a.add_argument( - "outputModel", - help="file name for output model, will create two files one with structure one with weights", -) -parser_a.add_argument( - "--epochs", - type=int, - help="max epochs for training CNN (default = 100)", - default=100, -) -parser_a.add_argument( - "--numSubWins", - type=int, - help="number of subwindows that our chromosome is divided into (default = 11)", - default=11, -) -parser_a.add_argument( - "--confusionFile", - help="optional file to which confusion matrix plot will be written (default = None)", - default=None, -) -parser_a.set_defaults(mode="train") -parser_a._positionals.title = "required arguments" - -# parser_b.add_argument('nDims', metavar='nDims', type=int, -# help='dimensionality of the feature vector') -parser_b.add_argument( - "modelStructure", help="path to CNN structure .json file" -) -parser_b.add_argument("modelWeights", help="path to CNN weights .h5 file") -parser_b.add_argument("predictFile", help="input file to predict") -parser_b.add_argument("predictFileOutput", help="output file name") -parser_b.add_argument( - "--numSubWins", - type=int, - help="number of subwindows that our chromosome is divided into (default = 11)", - default=11, -) -parser_b.add_argument( - "--simData", - help="Are we using simulated input data wihout coordinates?", - action="store_true", -) -parser_b.set_defaults(mode="predict") -parser_b._positionals.title = "required arguments" - -parser_c.add_argument( - "shicMode", - help="specifies whether to use original haploid SHIC (use 'haploid') or diploSHIC ('diploid')", - default="diploid", -) -parser_c.add_argument( - "msOutFile", - help="path to simulation output file (must be same format used by Hudson's ms)", -) -parser_c.add_argument( - "fvecFileName", help="path to file where feature vectors will be written" -) -parser_c.add_argument( - "--totalPhysLen", - type=int, - help="Length of simulated chromosome for converting infinite sites ms output to finite sites (default=1100000)", - default=1100000, -) -parser_c.add_argument( - "--numSubWins", - type=int, - help="The number of subwindows that our chromosome will be divided into (default=11)", - default=11, -) -parser_c.add_argument( - "--maskFileName", - help=( - "Path to a fasta-formatted file that contains masking information (marked by 'N'). " - "If specified, simulations will be masked in a manner mirroring windows drawn from this file." - ), - default="None", -) -parser_c.add_argument( - "--vcfForMaskFileName", - help=( - "Path to a VCF file that contains genotype information. This will be used to mask genotypes " - "in a manner that mirrors how the true data are masked." - ), - default=None, -) -parser_c.add_argument( - "--popForMask", - help="The label of the population for which we should draw genotype information from the VCF for masking purposes.", - default=None, -) -parser_c.add_argument( - "--sampleToPopFileName", - help=( - "Path to tab delimited file with population assignments (used for genotype masking); format: " - "SampleID\tpopID" - ), - default="None", -) -parser_c.add_argument( - "--unmaskedGenoFracCutoff", - type=float, - help="Fraction of unmasked genotypes required to retain a site (default=0.75)", - default=0.75, -) -parser_c.add_argument( - "--chrArmsForMasking", - help=( - "A comma-separated list (no spaces) of chromosome arms from which we want to draw masking " - "information (or 'all' if we want to use all arms. Ignored if maskFileName is not specified." - ), - default="all", -) -parser_c.add_argument( - "--unmaskedFracCutoff", - type=float, - help="Minimum fraction of unmasked sites, if masking simulated data (default=0.25)", - default=0.25, -) -parser_c.add_argument( - "--outStatsDir", - help="Path to a directory where values of each statistic in each subwindow are recorded for each rep", - default="None", -) -parser_c.add_argument( - "--ancFileName", - help=( - "Path to a fasta-formatted file that contains inferred ancestral states ('N' if unknown)." - " This is used for masking, as sites that cannot be polarized are masked, and we mimic this in the simulted data." - " Ignored in diploid mode which currently does not use ancestral state information" - ), - default="None", -) -parser_c.add_argument( - "--pMisPol", - type=float, - help="The fraction of sites that will be intentionally polarized to better approximate real data (default=0.0)", - default=0.0, -) -parser_c.set_defaults(mode="fvecSim") -parser_c._positionals.title = "required arguments" - -parser_d.add_argument( - "shicMode", - help="specifies whether to use original haploid SHIC (use 'haploid') or diploSHIC ('diploid')", -) -parser_d.add_argument( - "chrArmVcfFile", - help="VCF format file containing data for our chromosome arm (other arms will be ignored)", -) -parser_d.add_argument( - "chrArm", - help="Exact name of the chromosome arm for which feature vectors will be calculated", -) -parser_d.add_argument("chrLen", type=int, help="Length of the chromosome arm") -parser_d.add_argument( - "fvecFileName", help="path to file where feature vectors will be written" -) -parser_d.add_argument( - "--targetPop", - help="Population ID of samples we wish to include", - default="None", -) -parser_d.add_argument( - "--sampleToPopFileName", - help=( - "Path to tab delimited file with population assignments; format: " - "SampleID\tpopID" - ), - default="None", -) -parser_d.add_argument( - "--winSize", - type=int, - help="Length of the large window (default=1100000)", - default=1100000, -) -parser_d.add_argument( - "--numSubWins", - type=int, - help="Number of sub-windows within each large window (default=11)", - default=11, -) -parser_d.add_argument( - "--maskFileName", - help=( - "Path to a fasta-formatted file that contains masking information (marked by 'N'); " - "must have an entry with title matching chrArm" - ), - default="None", -) -parser_d.add_argument( - "--unmaskedFracCutoff", - type=float, - help="Fraction of unmasked sites required to retain a subwindow (default=0.25)", - default=0.25, -) -parser_d.add_argument( - "--unmaskedGenoFracCutoff", - type=float, - help="Fraction of unmasked genotypes required to retain a site (default=0.75)", - default=0.75, -) -parser_d.add_argument( - "--ancFileName", - help=( - "Path to a fasta-formatted file that contains inferred ancestral states ('N' if unknown); " - "must have an entry with title matching chrArm. Ignored for diploid mode which currently does not use ancestral " - "state information." - ), - default="None", -) -parser_d.add_argument( - "--statFileName", - help="Path to a file where statistics will be written for each subwindow that is not filtered out", - default="None", -) -parser_d.add_argument( - "--segmentStart", - help="Left boundary of region in which feature vectors are calculated (whole arm if omitted)", - default="None", -) -parser_d.add_argument( - "--segmentEnd", - help="Right boundary of region in which feature vectors are calculated (whole arm if omitted)", - default="None", -) -parser_d.set_defaults(mode="fvecVcf") -parser_d._positionals.title = "required arguments" - -parser_e.add_argument( - "neutTrainingFileName", help="Path to our neutral feature vectors" -) -parser_e.add_argument( - "softTrainingFilePrefix", - help=( - "Prefix (including higher-level path) of files containing soft training examples" - "; files must end with '_$i.$ext' where $i is the subwindow index of the sweep and $ext is any extension." - ), -) -parser_e.add_argument( - "hardTrainingFilePrefix", - help=( - "Prefix (including higher-level path) of files containing hard training examples" - "; files must end with '_$i.$ext' where $i is the subwindow index of the sweep and $ext is any extension." - ), -) -parser_e.add_argument( - "sweepTrainingWindows", - type=int, - help=( - "comma-separated list of windows to classify as sweeps (usually just '5'" - " but without the quotes)" - ), -) -parser_e.add_argument( - "linkedTrainingWindows", - help=( - "list of windows to treat as linked to sweeps (usually '0,1,2,3,4,6,7,8,9,10' but" - " without the quotes)" - ), -) -parser_e.add_argument( - "outDir", help="path to directory where the training sets will be written" -) -parser_e.set_defaults(mode="makeTrainingSets") -parser_e._positionals.title = "required arguments" -if len(sys.argv) == 1: - parser.print_help() - sys.exit(1) -args = parser.parse_args() -argsDict = vars(args) if argsDict["mode"] in ["train", "predict"]: ########################################################### # Import a bunch of libraries if everything checks out import matplotlib - matplotlib.use("Agg") import numpy as np - import tensorflow as tf - from keras.models import Sequential, Model - from keras import optimizers - from keras.layers import Dense, Dropout, Activation, Flatten, Input - from keras.layers import Conv2D, MaxPooling2D, concatenate + from sklearn.model_selection import train_test_split - from keras.preprocessing.image import ImageDataGenerator + from keras.utils import to_categorical from keras.callbacks import EarlyStopping, ModelCheckpoint - import keras.backend as K - import fnmatch - # nDims = argsDict['nDims'] + from diploshic.network import construct_model + from diploshic.dataloader import DADiploSHICDataLoader, DiploSHICDataLoader, load_fvecs_from_directory, load_empirical_fvecs_from_directory + numSubWins = argsDict["numSubWins"] if argsDict["mode"] == "train": + empiricalDir = argsDict["empiricalDir"] trainingDir = argsDict["trainDir"] testingDir = argsDict["testDir"] epochOption = argsDict["epochs"] outputModel = argsDict["outputModel"] confusionFile = argsDict["confusionFile"] - # nCores = 12 + lossAccFile = argsDict["lossAccFile"] + print("loading data now...") - # training data - hard = np.loadtxt(trainingDir + "hard.fvec", skiprows=1) - nDims = int(hard.shape[1] / numSubWins) - h1 = np.reshape(hard, (hard.shape[0], nDims, numSubWins)) - neut = np.loadtxt(trainingDir + "neut.fvec", skiprows=1) - n1 = np.reshape(neut, (neut.shape[0], nDims, numSubWins)) - soft = np.loadtxt(trainingDir + "soft.fvec", skiprows=1) - s1 = np.reshape(soft, (soft.shape[0], nDims, numSubWins)) - lsoft = np.loadtxt(trainingDir + "linkedSoft.fvec", skiprows=1) - ls1 = np.reshape(lsoft, (lsoft.shape[0], nDims, numSubWins)) - lhard = np.loadtxt(trainingDir + "linkedHard.fvec", skiprows=1) - lh1 = np.reshape(lhard, (lhard.shape[0], nDims, numSubWins)) - - both = np.concatenate((h1, n1, s1, ls1, lh1)) - y = np.concatenate( - ( - np.repeat(0, len(h1)), - np.repeat(1, len(n1)), - np.repeat(2, len(s1)), - np.repeat(3, len(ls1)), - np.repeat(4, len(lh1)), - ) - ) - - # reshape both to explicitly set depth image. need for theanno not sure with tensorflow - both = both.reshape(both.shape[0], nDims, numSubWins, 1) + both, y = load_fvecs_from_directory(trainingDir, numSubWins) + Y_test_emp = None if trainingDir == testingDir: - X_train, X_test, y_train, y_test = train_test_split( - both, y, test_size=0.2 - ) + X_train, X_test, y_train, y_test = train_test_split(both, y, test_size=TEST_FRAC) else: X_train = both y_train = y - # testing data - hard = np.loadtxt(testingDir + "hard.fvec", skiprows=1) - h1 = np.reshape(hard, (hard.shape[0], nDims, numSubWins)) - neut = np.loadtxt(testingDir + "neut.fvec", skiprows=1) - n1 = np.reshape(neut, (neut.shape[0], nDims, numSubWins)) - soft = np.loadtxt(testingDir + "soft.fvec", skiprows=1) - s1 = np.reshape(soft, (soft.shape[0], nDims, numSubWins)) - lsoft = np.loadtxt(testingDir + "linkedSoft.fvec", skiprows=1) - ls1 = np.reshape(lsoft, (lsoft.shape[0], nDims, numSubWins)) - lhard = np.loadtxt(testingDir + "linkedHard.fvec", skiprows=1) - lh1 = np.reshape(lhard, (lhard.shape[0], nDims, numSubWins)) - - both2 = np.concatenate((h1, n1, s1, ls1, lh1)) - X_test = both2.reshape(both2.shape[0], nDims, numSubWins, 1) - y_test = np.concatenate( - ( - np.repeat(0, len(h1)), - np.repeat(1, len(n1)), - np.repeat(2, len(s1)), - np.repeat(3, len(ls1)), - np.repeat(4, len(lh1)), - ) - ) - - Y_train = tf.keras.utils.to_categorical(y_train, 5) - Y_test = tf.keras.utils.to_categorical(y_test, 5) - X_valid, X_test, Y_valid, Y_test = train_test_split( - X_test, Y_test, test_size=0.5 - ) - - datagen = ImageDataGenerator( - featurewise_center=True, - featurewise_std_normalization=True, - horizontal_flip=True, - ) - - validation_gen = ImageDataGenerator( - featurewise_center=True, - featurewise_std_normalization=True, - horizontal_flip=False, - ) - test_gen = ImageDataGenerator( - featurewise_center=True, - featurewise_std_normalization=True, - horizontal_flip=False, - ) - - # print(X_train.shape) - print("training set has %d examples" % X_train.shape[0]) - print("validation set has %d examples" % X_valid.shape[0]) - print("test set has %d examples" % X_test.shape[0]) - - model_in = Input(X_train.shape[1:]) - h = Conv2D(128, 3, activation="relu", padding="same", name="conv1_1")( - model_in - ) - h = Conv2D(64, 3, activation="relu", padding="same", name="conv1_2")(h) - h = MaxPooling2D(pool_size=3, name="pool1", padding="same")(h) - h = Dropout(0.15, name="drop1")(h) - h = Flatten(name="flaten1")(h) - - dh = Conv2D( - 128, - 2, - activation="relu", - dilation_rate=[1, 3], - padding="same", - name="dconv1_1", - )(model_in) - dh = Conv2D( - 64, - 2, - activation="relu", - dilation_rate=[1, 3], - padding="same", - name="dconv1_2", - )(dh) - dh = MaxPooling2D(pool_size=2, name="dpool1")(dh) - dh = Dropout(0.15, name="ddrop1")(dh) - dh = Flatten(name="dflaten1")(dh) - - dh1 = Conv2D( - 128, - 2, - activation="relu", - dilation_rate=[1, 4], - padding="same", - name="dconv4_1", - )(model_in) - dh1 = Conv2D( - 64, - 2, - activation="relu", - dilation_rate=[1, 4], - padding="same", - name="dconv4_2", - )(dh1) - dh1 = MaxPooling2D(pool_size=2, name="d1pool1")(dh1) - dh1 = Dropout(0.15, name="d1drop1")(dh1) - dh1 = Flatten(name="d1flaten1")(dh1) + X_test, y_test = load_fvecs_from_directory(testingDir, numSubWins) + if argsDict["domain_adaptation"]: + if empiricalDir is None: + # Load empirical data from train/test directory labeled empirical.fvec (NOT FOR SIM DATA -- FOR REAL DATA) + emp = load_empirical_fvecs_from_directory(trainingDir, numSubWins) + if trainingDir == testingDir: + X_train_emp, X_test_emp = train_test_split(emp, test_size=TEST_FRAC) + else: + X_train_emp = emp + emp = load_empirical_fvecs_from_directory(testingDir, numSubWins) + else: + # Load a directory of simulation data to use as "empirical data" and domain adaptation + X_emp, y_emp = load_fvecs_from_directory(empiricalDir, numSubWins) + X_train_emp, X_test_emp, y_train_emp, y_test_emp = train_test_split(X_emp, y_emp, test_size=TEST_FRAC) + Y_test_emp = to_categorical(y_test_emp, 5) + Y_train_emp = to_categorical(y_train_emp, 5) + + Y_train = to_categorical(y_train, 5) + Y_test = to_categorical(y_test, 5) + X_valid, X_train, Y_valid, Y_train = train_test_split(X_train, Y_train, test_size=VALID_FRAC) + if argsDict["domain_adaptation"]: + if Y_train_emp is not None: + X_valid_emp, X_train_emp, Y_valid_emp, Y_train_emp = train_test_split(X_train_emp, Y_train_emp, test_size=VALID_FRAC) + else: + X_valid_emp, X_train_emp = train_test_split(X_train_emp, test_size=VALID_FRAC) + datagen = DADiploSHICDataLoader(X_train, X_train_emp, Y_train, batch_size=BATCH_SIZE) + validation_gen = DADiploSHICDataLoader(X_valid, X_valid_emp, Y_valid, batch_size=BATCH_SIZE) + test_gen = DADiploSHICDataLoader(X_test, X_test_emp, Y_test, batch_size=BATCH_SIZE) + test_emp_gen = DADiploSHICDataLoader(X_test_emp, X_test_emp, Y_test_emp, batch_size=BATCH_SIZE) + else: + datagen = DiploSHICDataLoader(X_train, Y_train, batch_size=BATCH_SIZE) + validation_gen = DiploSHICDataLoader(X_valid, Y_valid, batch_size=BATCH_SIZE) + test_gen = DiploSHICDataLoader(X_test, Y_test, batch_size=BATCH_SIZE) - h = concatenate([h, dh, dh1]) - h = Dense(512, name="512dense", activation="relu")(h) - h = Dropout(0.2, name="drop7")(h) - h = Dense(128, name="last_dense", activation="relu")(h) - h = Dropout(0.1, name="drop8")(h) - output = Dense(5, name="out_dense", activation="softmax")(h) - model = Model(inputs=[model_in], outputs=[output]) + print("training set has %d examples (%d batches of %d)" % (X_train.shape[0], X_train.shape[0] / BATCH_SIZE, BATCH_SIZE)) + print("validation set has %d examples (%d batches of %d)" % (X_valid.shape[0], X_valid.shape[0] / BATCH_SIZE, BATCH_SIZE)) + print("test set has %d examples (%d batches of %d)" % (X_test.shape[0], X_test.shape[0] / BATCH_SIZE, BATCH_SIZE)) + if argsDict["domain_adaptation"]: + print("empirical training set has %d examples (%d batches of %d)" % (X_train_emp.shape[0], X_train_emp.shape[0] / BATCH_SIZE, BATCH_SIZE)) + print("empirical validation set has %d examples (%d batches of %d)" % (X_train_emp.shape[0], X_train_emp.shape[0] / BATCH_SIZE, BATCH_SIZE)) - model.compile( - loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"] - ) + model = construct_model(X_train.shape[1:], domain_adaptation=argsDict["domain_adaptation"], da_weight=argsDict["da_weight"], pred_weight=argsDict["pred_weight"]) - # define early stopping callback + model_json = model.to_json() + with open(outputModel + ".json", "w") as json_file: + json_file.write(model_json) + modWeightsFilepath = outputModel + ".weights.hdf5" + + monitor = "val_predictor_masked_categorical_accuracy" if argsDict["domain_adaptation"] else "val_masked_categorical_accuracy" earlystop = EarlyStopping( - monitor="val_accuracy", + monitor=monitor, min_delta=0.001, - patience=5, + patience=3, verbose=1, mode="auto", ) - - model_json = model.to_json() - with open(outputModel + ".json", "w") as json_file: - json_file.write(model_json) - modWeightsFilepath = outputModel + ".weights.hdf5" checkpoint = ModelCheckpoint( modWeightsFilepath, - monitor="val_accuracy", + monitor=monitor, verbose=1, save_best_only=True, save_weights_only=True, mode="auto", ) - + # callbacks_list = [checkpoint] callbacks_list = [earlystop, checkpoint] - # callbacks_list = [earlystop] #turning off checkpointing-- just want accuracy assessment - - datagen.fit(X_train) - validation_gen.fit(X_valid) - test_gen.fit(X_test) start = time.time() - model.fit( - datagen.flow(X_train, Y_train, batch_size=32), - steps_per_epoch=len(X_train) / 32, - epochs=epochOption, - verbose=1, - callbacks=callbacks_list, - validation_data=validation_gen.flow(X_valid, Y_valid, batch_size=32), - validation_steps=len(X_test) / 32, - ) - # model.fit(X_train, Y_train, batch_size=32, epochs=100,validation_data=(X_test,Y_test),callbacks=callbacks_list, verbose=1) - score = model.evaluate( - test_gen.flow(X_test, Y_test, batch_size=32), steps=len(Y_test) / 32 - ) + + if argsDict["domain_adaptation"]: + training_history = model.fit( + datagen, + steps_per_epoch=len(X_train) / BATCH_SIZE, + epochs=epochOption, + verbose=1, + callbacks=callbacks_list, + validation_data=validation_gen, + validation_steps=len(X_valid) / BATCH_SIZE, + ) + score = model.evaluate( + test_gen, + steps=len(Y_test) / BATCH_SIZE, + return_dict=True) + #predictions = model.predict(X_test) + #preds = np.argmax(predictions, axis=1) + score_emp = model.evaluate(test_emp_gen, steps = len(Y_test_emp) / BATCH_SIZE, return_dict=True) + #emppredictions = model.predict(X_test_emp) + #emppreds = np.argmax(emppredictions, axis=1) + else: + training_history = model.fit( + datagen, + steps_per_epoch=len(X_train) / BATCH_SIZE, + epochs=epochOption, + verbose=1, + callbacks=callbacks_list, + validation_data=validation_gen, + validation_steps=len(X_valid) / BATCH_SIZE, + ) + score = model.evaluate( + test_gen, + steps=len(Y_test) / BATCH_SIZE, + return_dict=True + ) + #predictions = model.predict(X_test) + #preds = np.argmax(predictions, axis=1) + if empiricalDir is not None: + X_emp, y_emp = load_fvecs_from_directory(empiricalDir, numSubWins) + X_train_emp, X_test_emp, y_train_emp, y_test_emp = train_test_split(X_emp, y_emp, test_size=TEST_FRAC) + Y_test_emp = to_categorical(y_test_emp,5) + emp_test_gen = DiploSHICDataLoader(X_test_emp, Y_test_emp, batch_size=BATCH_SIZE) + score_emp = model.evaluate(emp_test_gen, + steps = len(Y_test_emp) / BATCH_SIZE, + return_dict=True) + #emppredictions = model.predict(X_test_emp) + #emppreds = np.argmax(emppredictions, axis=1) + sys.stderr.write( "total time spent fitting and evaluating: %f secs\n" % (time.time() - start) ) - - print("evaluation on test set:") - print("diploSHIC loss: %f" % score[0]) - print("diploSHIC accuracy: %f" % score[1]) + #np.savez(outputModel + "_preds.npz", pred=preds, true=y_test) + #if empiricalDir is not None: + # np.savez(outputModel + "_emp_preds.npz", pred=emppreds, true=y_test_emp) + with open(outputModel + "_training_history.json", "w") as json_file: + metadata = {'history': training_history.history, 'score': score} + if empiricalDir is not None: + metadata['score_emp'] = score_emp + json.dump(metadata, json_file) + + if argsDict["domain_adaptation"]: + print("evaluation on source test set:") + print(f"diploSHIC loss: Predictor: {score['predictor_loss']:3f}, Discriminator: {score['discriminator_loss']:3f}") + print(f"diploSHIC accuracy: Predictor: {score['predictor_masked_categorical_accuracy']:3f}, Discriminator: {score['discriminator_masked_binary_accuracy']:3f}") + print("evaluation on target test set:") + print(f"diploSHIC loss: Predictor: {score_emp['predictor_loss']:3f}, Discriminator: {score_emp['discriminator_loss']:3f}") + print(f"diploSHIC accuracy: Predictor: {score_emp['predictor_masked_categorical_accuracy']:3f}, Discriminator: {score_emp['discriminator_masked_binary_accuracy']:3f}") + else: + print("evaluation on source test set:") + print(f"diploSHIC loss: {score['loss']:3f}") + print(f"diploSHIC accuracy: {score['masked_categorical_accuracy']:3f}") + if empiricalDir is not None: + print("evaluation on target test set:") + print(f"diploSHIC loss: {score_emp['loss']:3f}") + print(f"diploSHIC accuracy: {score_emp['masked_categorical_accuracy']:3f}") + + if lossAccFile: + from diploshic.misc import plot_training_metrics + import matplotlib.pyplot as plt + fig = plot_training_metrics(training_history.history) + plt.savefig(lossAccFile, bbox_inches="tight") + if confusionFile: - from misc import plot_confusion_matrix + from diploshic.misc import plot_confusion_matrix import matplotlib.pyplot as plt - plot_confusion_matrix( model, - test_gen.standardize(X_test), + X_test, Y_test, labels=[0, 4, 2, 3, 1], display_labels=[ @@ -551,13 +219,20 @@ if argsDict["mode"] == "train": ], cmap=plt.cm.Blues, normalize="true", + domain_adaptation=argsDict["domain_adaptation"], ) plt.savefig(confusionFile, bbox_inches="tight") + if empiricalDir is not None: + plot_confusion_matrix(model, X_test_emp, Y_test_emp, labels=[0, 4, 2, 3, 1], + display_labels=["Hard", "Hard-linked", "Soft", "Soft-linked", "Neutral"], cmap=plt.cm.Blues, normalize="true", + domain_adaptation=argsDict["domain_adaptation"]) + plt.savefig(confusionFile.replace(".png", "_empirical.png"), bbox_inches="tight") elif argsDict["mode"] == "predict": - import pandas as pd from keras.models import model_from_json - + from keras.preprocessing.image import ImageDataGenerator + import pandas as pd + # import data from predictFile x_df = pd.read_table(argsDict["predictFile"]) if argsDict["simData"]: @@ -565,10 +240,7 @@ elif argsDict["mode"] == "predict": else: testX = x_df[list(x_df)[4:]].to_numpy() nDims = int(testX.shape[1] / numSubWins) - np.reshape(testX, (testX.shape[0], nDims, numSubWins)) - # add channels testX = testX.reshape(testX.shape[0], nDims, numSubWins, 1) - # set up generator for normalization validation_gen = ImageDataGenerator( featurewise_center=True, featurewise_std_normalization=True, @@ -736,7 +408,6 @@ elif argsDict["mode"] == "fvecVcf": if argsDict["segmentStart"] != "None": additionalArgs += [argsDict["segmentStart"], argsDict["segmentEnd"]] cmd += " " + " ".join(additionalArgs) - # cmd += " > " + argsDict['fvecFileName'] print(cmd) subprocess.call(cmd.split()) elif argsDict["mode"] == "makeTrainingSets": diff --git a/diploshic/misc.py b/diploshic/misc.py index 79dc084..bb44d63 100644 --- a/diploshic/misc.py +++ b/diploshic/misc.py @@ -1,5 +1,6 @@ import numpy as np from scipy.sparse import coo_matrix +from keras.metrics import categorical_accuracy """This is all a bunch of stuff copied from sk-learn 0.24.2 but shoving it in here for compatibility purposes. Some slight modifications were made.""" @@ -68,7 +69,8 @@ def plot( xticks_rotation="horizontal", values_format=None, ax=None, - colorbar=True + colorbar=True, + title="", ): """Plot visualization. Parameters @@ -145,7 +147,7 @@ def plot( ax.set_ylim((n_classes - 0.5, -0.5)) plt.setp(ax.get_xticklabels(), rotation=xticks_rotation) - + ax.set_title(title) self.figure_ = fig self.ax_ = ax return self @@ -295,7 +297,8 @@ def plot_confusion_matrix( values_format=None, cmap="viridis", ax=None, - colorbar=True + colorbar=True, + domain_adaptation=False ): """Plot Confusion Matrix. Read more in the :ref:`User Guide `. @@ -361,7 +364,8 @@ def plot_confusion_matrix( >>> plt.show() # doctest: +SKIP """ - y_pred = estimator.predict(X) + y_pred = estimator.predict(X)[0] if domain_adaptation else estimator.predict(X) + acc = categorical_accuracy(y_true, y_pred) cm = confusion_matrix( y_true, y_pred, @@ -379,4 +383,28 @@ def plot_confusion_matrix( xticks_rotation=xticks_rotation, values_format=values_format, colorbar=colorbar, + title=f'Overall Accuracy: {acc*100:.2f}%' ) + + +def plot_training_metrics(hist, t='Training Metrics'): + import matplotlib.pyplot as plt + + fig, ax = plt.subplots(1, 2, figsize=(12, 4)) + ax[0].plot(hist['loss'], label=f'loss', color='tab:blue', lw=2, alpha=0.8) + ax[0].plot(hist['val_loss'], label=f'val loss', color='tab:blue', lw=2, alpha=0.8, ls='--') + ax[0].axvline(np.arange(len(hist['loss']))[np.argmax(hist['val_acc'])], color='tab:orange', ls=':', lw=2, label='Best Model') + + ax[1].plot(hist['acc'], label=f'acc', color='tab:blue', lw=2, alpha=0.8) + ax[1].plot(hist['val_acc'], label=f'val acc', color='tab:blue', lw=2, alpha=0.8, ls='--') + ax[1].axvline(np.arange(len(hist['loss']))[np.argmax(hist['val_acc'])], color='tab:orange', ls=':', lw=2, label='Best Model') + + ax[0].set_xlabel('Epoch') + ax[0].set_ylabel('Loss') + ax[0].legend() + ax[1].set_xlabel('Epoch') + ax[1].set_ylabel('Accuracy (%)') + ax[1].legend() + t += f"SRC Test Acc: {hist['score']:.1f}% TGT Test Acc: {hist['score_emp']:.1f}%" + plt.suptitle(t) + return fig \ No newline at end of file diff --git a/diploshic/network.py b/diploshic/network.py new file mode 100644 index 0000000..478d981 --- /dev/null +++ b/diploshic/network.py @@ -0,0 +1,119 @@ +from tensorflow import custom_gradient, identity, boolean_mask, not_equal, reduce_all +from keras.models import Model +from keras.layers import Dense, Dropout, Flatten, Input, BatchNormalization +from keras.layers import Conv2D, MaxPooling2D, concatenate, Layer, Activation +from keras.metrics import binary_crossentropy, categorical_crossentropy, categorical_accuracy, binary_accuracy + + +@custom_gradient +def grad_reverse(x): + y = identity(x) + def custom_grad(dy): + return -dy + return y, custom_grad + +class GradReverse(Layer): + def __init__(self): + super().__init__() + + def call(self, x): + return grad_reverse(x) + +def masked_bce(y_true, y_pred): + mask = not_equal(y_true, -1) + y_pred = boolean_mask(y_pred, mask) + y_true = boolean_mask(y_true, mask) + return binary_crossentropy(y_true, y_pred) + +def masked_cce(y_true, y_pred): + mask = reduce_all(not_equal(y_true, -1),1) + y_pred = boolean_mask(y_pred, mask) + y_true = boolean_mask(y_true, mask) + return categorical_crossentropy(y_true, y_pred) + +def masked_categorical_accuracy(y_true, y_pred): + mask = reduce_all(not_equal(y_true, -1),1) + y_true = boolean_mask(y_true, mask) + y_pred = boolean_mask(y_pred, mask) + return categorical_accuracy(y_true, y_pred) + +def masked_binary_accuracy(y_true, y_pred): + mask = not_equal(y_true, -1) + y_pred = boolean_mask(y_pred, mask) + y_true = boolean_mask(y_true, mask) + return binary_accuracy(y_true, y_pred) + +def construct_model(input_shape, domain_adaptation=False, da_weight=1, pred_weight=1): + model_in = Input(input_shape) + h = Conv2D(128, 3, activation="relu", padding="same", name="conv1_1")( + model_in + ) + h = Conv2D(64, 3, activation="relu", padding="same", name="conv1_2")(h) + h = MaxPooling2D(pool_size=3, name="pool1", padding="same")(h) + h = Dropout(0.15, name="drop1")(h) + h = Flatten(name="flaten1")(h) + + dh = Conv2D( + 128, + 2, + activation="relu", + dilation_rate=[1, 3], + padding="same", + name="dconv1_1", + )(model_in) + dh = Conv2D( + 64, + 2, + activation="relu", + dilation_rate=[1, 3], + padding="same", + name="dconv1_2", + )(dh) + dh = MaxPooling2D(pool_size=2, name="dpool1")(dh) + dh = Dropout(0.15, name="ddrop1")(dh) + dh = Flatten(name="dflaten1")(dh) + + dh1 = Conv2D( + 128, + 2, + activation="relu", + dilation_rate=[1, 4], + padding="same", + name="dconv4_1", + )(model_in) + dh1 = Conv2D( + 64, + 2, + activation="relu", + dilation_rate=[1, 4], + padding="same", + name="dconv4_2", + )(dh1) + dh1 = MaxPooling2D(pool_size=2, name="d1pool1")(dh1) + dh1 = Dropout(0.15, name="d1drop1")(dh1) + dh1 = Flatten(name="d1flaten1")(dh1) + + h_concated = concatenate([h, dh, dh1]) + h = Dense(512, name="512dense", use_bias=False)(h_concated) + h = BatchNormalization(name="512norm")(h) + h = Activation("relu", name="512relu")(h) + h = Dense(256, name="256dense", activation="relu")(h) + h = Dense(128, name="128dense", activation="relu")(h) + output = Dense(5, name="predictor", activation="softmax")(h) + if domain_adaptation: + da = GradReverse()(h_concated) + da = Dense(512, name="DA512dense", use_bias=False)(da) + da = BatchNormalization(name="DA512norm")(da) + da = Activation("relu", name="DA512relu")(da) + da = Dense(256, name="DA256dense", activation="relu")(da) + da = Dense(128, name="DA128dense", activation="relu")(da) + domain_output = Dense(1, name="discriminator", activation="sigmoid")(da) + model = Model(inputs=[model_in], outputs=[output, domain_output]) + model.compile(optimizer='adam', + loss={'predictor': masked_cce, 'discriminator': masked_bce}, + loss_weights = [pred_weight, da_weight], + metrics={'predictor': masked_categorical_accuracy, 'discriminator': masked_binary_accuracy}) + else: + model = Model(inputs=[model_in], outputs=[output]) + model.compile(loss=masked_cce, optimizer="adam", metrics=[masked_categorical_accuracy]) + return model diff --git a/diploshic/parser.py b/diploshic/parser.py new file mode 100644 index 0000000..992e28d --- /dev/null +++ b/diploshic/parser.py @@ -0,0 +1,326 @@ +import argparse, sys + +def load_arg_dict(): + ################# use argparse to get CL args and make sure we are kosher + parser = argparse.ArgumentParser( + description="calculate feature vectors, train, or predict with diploSHIC" + ) + parser._positionals.title = "possible modes (enter 'python diploSHIC.py modeName -h' for modeName's help message" + subparsers = parser.add_subparsers(help="sub-command help") + parser_c = subparsers.add_parser( + "fvecSim", help="Generate feature vectors from simulated data" + ) + parser_e = subparsers.add_parser( + "makeTrainingSets", + help="Combine feature vectors from muliple fvecSim runs into 5 balanced training sets", + ) + parser_a = subparsers.add_parser("train", help="train and test a shic CNN") + parser_d = subparsers.add_parser( + "fvecVcf", help="Generate feature vectors from data in a VCF file" + ) + parser_b = subparsers.add_parser( + "predict", help="perform prediction using an already-trained SHIC CNN" + ) + parser_a.add_argument("trainDir", help="path to training set files") + parser_a.add_argument( + "testDir", help="path to test set files, can be same as trainDir" + ) + parser_a.add_argument( + "outputModel", + help="file name for output model, will create two files one with structure one with weights", + ) + parser_a.add_argument( + "--epochs", + type=int, + help="max epochs for training CNN (default = 100)", + default=50, + ) + parser_a.add_argument( + "--domain-adaptation", + action='store_true', + help="Optional Flag to run model with Domain Adaptation", + default=False, + ) + parser_a.add_argument( + "--da-weight", + type=float, + help="Relative weight of DA Discriminator loss: Predictor loss", + default=1, + ) + parser_a.add_argument( + "--pred-weight", + type=float, + help="Relative weight of Predictor loss: DA Discriminator loss", + default=1, + ) + parser_a.add_argument( + "--empiricalDir", + type=str, + help="Path to directory containing empirical data for domain adaptation", + default = None, + ) + parser_a.add_argument( + "--numSubWins", + type=int, + help="number of subwindows that our chromosome is divided into (default = 11)", + default=11, + ) + parser_a.add_argument( + "--confusionFile", + help="optional file to which confusion matrix plot will be written (default = None)", + default=None, + ) + parser_a.add_argument( + "--lossAccFile", + help="optional file to which the model training metrics plot will be written (default = None)", + default=None, + ) + parser_a.set_defaults(mode="train") + parser_a._positionals.title = "required arguments" + parser_b.add_argument( + "modelStructure", help="path to CNN structure .json file" + ) + parser_b.add_argument("modelWeights", help="path to CNN weights .h5 file") + parser_b.add_argument("predictFile", help="input file to predict") + parser_b.add_argument("predictFileOutput", help="output file name") + parser_b.add_argument( + "--numSubWins", + type=int, + help="number of subwindows that our chromosome is divided into (default = 11)", + default=11, + ) + parser_b.add_argument( + "--simData", + help="Are we using simulated input data wihout coordinates?", + action="store_true", + ) + parser_b.set_defaults(mode="predict") + parser_b._positionals.title = "required arguments" + + parser_c.add_argument( + "shicMode", + help="specifies whether to use original haploid SHIC (use 'haploid') or diploSHIC ('diploid')", + default="diploid", + ) + parser_c.add_argument( + "msOutFile", + help="path to simulation output file (must be same format used by Hudson's ms)", + ) + parser_c.add_argument( + "fvecFileName", help="path to file where feature vectors will be written" + ) + parser_c.add_argument( + "--totalPhysLen", + type=int, + help="Length of simulated chromosome for converting infinite sites ms output to finite sites (default=1100000)", + default=1100000, + ) + parser_c.add_argument( + "--numSubWins", + type=int, + help="The number of subwindows that our chromosome will be divided into (default=11)", + default=11, + ) + parser_c.add_argument( + "--maskFileName", + help=( + "Path to a fasta-formatted file that contains masking information (marked by 'N'). " + "If specified, simulations will be masked in a manner mirroring windows drawn from this file." + ), + default="None", + ) + parser_c.add_argument( + "--vcfForMaskFileName", + help=( + "Path to a VCF file that contains genotype information. This will be used to mask genotypes " + "in a manner that mirrors how the true data are masked." + ), + default=None, + ) + parser_c.add_argument( + "--popForMask", + help="The label of the population for which we should draw genotype information from the VCF for masking purposes.", + default=None, + ) + parser_c.add_argument( + "--sampleToPopFileName", + help=( + "Path to tab delimited file with population assignments (used for genotype masking); format: " + "SampleID\tpopID" + ), + default="None", + ) + parser_c.add_argument( + "--unmaskedGenoFracCutoff", + type=float, + help="Fraction of unmasked genotypes required to retain a site (default=0.75)", + default=0.75, + ) + parser_c.add_argument( + "--chrArmsForMasking", + help=( + "A comma-separated list (no spaces) of chromosome arms from which we want to draw masking " + "information (or 'all' if we want to use all arms. Ignored if maskFileName is not specified." + ), + default="all", + ) + parser_c.add_argument( + "--unmaskedFracCutoff", + type=float, + help="Minimum fraction of unmasked sites, if masking simulated data (default=0.25)", + default=0.25, + ) + parser_c.add_argument( + "--outStatsDir", + help="Path to a directory where values of each statistic in each subwindow are recorded for each rep", + default="None", + ) + parser_c.add_argument( + "--ancFileName", + help=( + "Path to a fasta-formatted file that contains inferred ancestral states ('N' if unknown)." + " This is used for masking, as sites that cannot be polarized are masked, and we mimic this in the simulted data." + " Ignored in diploid mode which currently does not use ancestral state information" + ), + default="None", + ) + parser_c.add_argument( + "--pMisPol", + type=float, + help="The fraction of sites that will be intentionally polarized to better approximate real data (default=0.0)", + default=0.0, + ) + parser_c.set_defaults(mode="fvecSim") + parser_c._positionals.title = "required arguments" + + parser_d.add_argument( + "shicMode", + help="specifies whether to use original haploid SHIC (use 'haploid') or diploSHIC ('diploid')", + ) + parser_d.add_argument( + "chrArmVcfFile", + help="VCF format file containing data for our chromosome arm (other arms will be ignored)", + ) + parser_d.add_argument( + "chrArm", + help="Exact name of the chromosome arm for which feature vectors will be calculated", + ) + parser_d.add_argument("chrLen", type=int, help="Length of the chromosome arm") + parser_d.add_argument( + "fvecFileName", help="path to file where feature vectors will be written" + ) + parser_d.add_argument( + "--targetPop", + help="Population ID of samples we wish to include", + default="None", + ) + parser_d.add_argument( + "--sampleToPopFileName", + help=( + "Path to tab delimited file with population assignments; format: " + "SampleID\tpopID" + ), + default="None", + ) + parser_d.add_argument( + "--winSize", + type=int, + help="Length of the large window (default=1100000)", + default=1100000, + ) + parser_d.add_argument( + "--numSubWins", + type=int, + help="Number of sub-windows within each large window (default=11)", + default=11, + ) + parser_d.add_argument( + "--maskFileName", + help=( + "Path to a fasta-formatted file that contains masking information (marked by 'N'); " + "must have an entry with title matching chrArm" + ), + default="None", + ) + parser_d.add_argument( + "--unmaskedFracCutoff", + type=float, + help="Fraction of unmasked sites required to retain a subwindow (default=0.25)", + default=0.25, + ) + parser_d.add_argument( + "--unmaskedGenoFracCutoff", + type=float, + help="Fraction of unmasked genotypes required to retain a site (default=0.75)", + default=0.75, + ) + parser_d.add_argument( + "--ancFileName", + help=( + "Path to a fasta-formatted file that contains inferred ancestral states ('N' if unknown); " + "must have an entry with title matching chrArm. Ignored for diploid mode which currently does not use ancestral " + "state information." + ), + default="None", + ) + parser_d.add_argument( + "--statFileName", + help="Path to a file where statistics will be written for each subwindow that is not filtered out", + default="None", + ) + parser_d.add_argument( + "--segmentStart", + help="Left boundary of region in which feature vectors are calculated (whole arm if omitted)", + default="None", + ) + parser_d.add_argument( + "--segmentEnd", + help="Right boundary of region in which feature vectors are calculated (whole arm if omitted)", + default="None", + ) + parser_d.set_defaults(mode="fvecVcf") + parser_d._positionals.title = "required arguments" + + parser_e.add_argument( + "neutTrainingFileName", help="Path to our neutral feature vectors" + ) + parser_e.add_argument( + "softTrainingFilePrefix", + help=( + "Prefix (including higher-level path) of files containing soft training examples" + "; files must end with '_$i.$ext' where $i is the subwindow index of the sweep and $ext is any extension." + ), + ) + parser_e.add_argument( + "hardTrainingFilePrefix", + help=( + "Prefix (including higher-level path) of files containing hard training examples" + "; files must end with '_$i.$ext' where $i is the subwindow index of the sweep and $ext is any extension." + ), + ) + parser_e.add_argument( + "sweepTrainingWindows", + type=int, + help=( + "comma-separated list of windows to classify as sweeps (usually just '5'" + " but without the quotes)" + ), + ) + parser_e.add_argument( + "linkedTrainingWindows", + help=( + "list of windows to treat as linked to sweeps (usually '0,1,2,3,4,6,7,8,9,10' but" + " without the quotes)" + ), + ) + parser_e.add_argument( + "outDir", help="path to directory where the training sets will be written" + ) + parser_e.set_defaults(mode="makeTrainingSets") + parser_e._positionals.title = "required arguments" + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + args = parser.parse_args() + argsDict = vars(args) + return argsDict \ No newline at end of file diff --git a/meta.yaml b/meta.yaml index 4f2385f..cec6b88 100644 --- a/meta.yaml +++ b/meta.yaml @@ -1,6 +1,6 @@ package: name: diploshic - version: "0.4" + version: "1.1.0" source: git_url: https://github.com/kr-colab/diploshic diff --git a/setup.py b/setup.py index dae6c2d..173caf4 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ "diploshic/utils.c"], ) setup(name='diploSHIC', - version='1.0.4', + version='1.1.0', description='diploSHIC', long_description=long_description, long_description_content_type="text/markdown",