diff --git a/.travis.yml b/.travis.yml index 8853e9c..4b271a4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,10 +6,10 @@ notifications: language: python python: - - "2.7" + - "3.5" before_install: - - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh + - wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - export PATH="$HOME/miniconda/bin:$PATH" - conda update --yes conda diff --git a/README.rst b/README.rst index 4412cc9..43516d0 100644 --- a/README.rst +++ b/README.rst @@ -23,21 +23,28 @@ pyGeno: A Python package for precision medicine and proteogenomics pyGeno is (to our knowledge) the only tool available that will gladly build your specific genomes for you. -pyGeno is developed by `Tariq Daouda`_ at the *Institute for Research in Immunology and Cancer* (IRIC_), its logo is the work of the freelance designer `Sawssan Kaddoura`_. +pyGeno was developed by `Tariq Daouda`_ at the *Institute for Research in Immunology and Cancer* (IRIC_), its logo is the work of the freelance designer `Sawssan Kaddoura`_. For the latest news about pyGeno, you can follow me on twitter `@tariqdaouda`_. .. _Tariq Daouda: http://wwww.tariqdaouda.com .. _IRIC: http://www.iric.ca .. _Sawssan Kaddoura: http://sawssankaddoura.com -Click here for The `full documentation`_. +Click here for the `full documentation`_, or here for a `notebook`_ that shows how *simple* can equate with *efficient*. .. _full documentation: http://pygeno.iric.ca/ +.. _notebook: pyGeno/examples/genomic_graph.ipynb For the latest news about pyGeno, you can follow me on twitter `@tariqdaouda`_. .. _@tariqdaouda: https://www.twitter.com/tariqdaouda +Contributions: +-------------- + +pyGeno was started by Tariq Daouda with contributions from: Albert Feghaly, Éric Audemard, Jean-Philippe Laverdure. +If you would like to contribute, please get in touch. + Citing pyGeno: -------------- Please cite this paper_. @@ -51,21 +58,21 @@ It is recommended to install pyGeno within a `virtual environement`_, to setup o .. code:: shell - virtualenv ~/.pyGenoEnv + python venv ~/.pyGenoEnv source ~/.pyGenoEnv/bin/activate pyGeno can be installed through pip: .. code:: shell - - pip install pyGeno #for the latest stable version + + pip install pyGeno #for the latest stable version Or github, for the latest developments: .. code:: shell - git clone https://github.com/tariqdaouda/pyGeno.git - cd pyGeno + git clone https://github.com/tariqdaouda/pyGeno.git + cd pyGeno python setup.py develop .. _`virtual environement`: http://virtualenv.readthedocs.org/ @@ -83,43 +90,44 @@ direct access to the DNA and Protein sequences of your patients. .. code:: python - from pyGeno.Genome import * - - g = Genome(name = "GRCh37.75") - prot = g.get(Protein, id = 'ENSP00000438917')[0] - #print the protein sequence - print prot.sequence - #print the protein's gene biotype - print prot.gene.biotype - #print protein's transcript sequence - print prot.transcript.sequence - - #fancy queries - for exon in g.get(Exon, {"CDS_start >": x1, "CDS_end <=" : x2, "chromosome.number" : "22"}) : - #print the exon's coding sequence - print exon.CDS - #print the exon's transcript sequence - print exon.transcript.sequence - - #You can do the same for your subject specific genomes - #by combining a reference genome with polymorphisms - g = Genome(name = "GRCh37.75", SNPs = ["STY21_RNA"], SNPFilter = MyFilter()) + from pyGeno.Genome import * + + # In case the Genome cannot be found (i.e., KeyError), please consider the below section on bootstrapping + g = Genome(name = "GRCh37.75") + prot = g.get(Protein, id = 'ENSP00000438917')[0] + #print the protein sequence + print(prot.sequence) + #print the protein's gene biotype + print(prot.gene.biotype) + #print protein's transcript sequence + print(prot.transcript.sequence) + + #fancy queries + for exon in g.get(Exon, {"CDS_start >": x1, "CDS_end <=" : x2, "chromosome.number" : "22"}) : + #print the exon's coding sequence + print(exon.CDS) + #print the exon's transcript sequence + print(exon.transcript.sequence) + + #You can do the same for your subject specific genomes + #by combining a reference genome with polymorphisms + g = Genome(name = "GRCh37.75", SNPs = ["STY21_RNA"], SNPFilter = MyFilter()) And if you ever get lost, there's an online **help()** function for each object type: .. code:: python - from pyGeno.Genome import * - - print Exon.help() + from pyGeno.Genome import * + + print(Exon.help()) Should output: .. code:: - - Available fields for Exon: CDS_start, end, chromosome, CDS_length, frame, number, CDS_end, start, genome, length, protein, gene, transcript, id, strand + + Available fields for Exon: CDS_start, end, chromosome, CDS_length, frame, number, CDS_end, start, genome, length, protein, gene, transcript, id, strand - + Creating a Personalized Genome: ------------------------------- Personalized Genomes are a powerful feature that allow you to work on the specific genomes and proteomes of your patients. You can even mix several SNP sets together. @@ -139,70 +147,70 @@ Filtering SNPs: pyGeno allows you to select the Polymorphisms that end up into the final sequences. It supports SNPs, Inserts and Deletions. .. code:: python - - from pyGeno.SNPFiltering import SNPFilter, SequenceSNP - - class QMax_gt_filter(SNPFilter) : - - def __init__(self, threshold) : - self.threshold = threshold - - #Here SNPs is a dictionary: SNPSet Name => polymorphism - #This filter ignores deletions and insertions and - #but applis all SNPs - def filter(self, chromosome, **SNPs) : - sources = {} - alleles = [] - for snpSet, snp in SNPs.iteritems() : - pos = snp.start - if snp.alt[0] == '-' : - pass - elif snp.ref[0] == '-' : - pass - else : - sources[snpSet] = snp - alleles.append(snp.alt) #if not an indel append the polymorphism - - #appends the refence allele to the lot - refAllele = chromosome.refSequence[pos] - alleles.append(refAllele) - sources['ref'] = refAllele - - #optional we keep a record of the polymorphisms that were used during the process - return SequenceSNP(alleles, sources = sources) - + + from pyGeno.SNPFiltering import SNPFilter, SequenceSNP + + class QMax_gt_filter(SNPFilter) : + + def __init__(self, threshold) : + self.threshold = threshold + + #Here SNPs is a dictionary: SNPSet Name => polymorphism + #This filter ignores deletions and insertions and + #but applis all SNPs + def filter(self, chromosome, **SNPs) : + sources = {} + alleles = [] + for snpSet, snp in SNPs.iteritems() : + pos = snp.start + if snp.alt[0] == '-' : + pass + elif snp.ref[0] == '-' : + pass + else : + sources[snpSet] = snp + alleles.append(snp.alt) #if not an indel append the polymorphism + + #appends the refence allele to the lot + refAllele = chromosome.refSequence[pos] + alleles.append(refAllele) + sources['ref'] = refAllele + + #optional we keep a record of the polymorphisms that were used during the process + return SequenceSNP(alleles, sources = sources) + The filter function can also be made more specific by using arguments that have the same names as the SNPSets .. code:: python - def filter(self, chromosome, dummySRY = None) : - if dummySRY.Qmax_gt > self.threshold : - #other possibilities of return are SequenceInsert(), SequenceDelete() - return SequenceSNP(dummySRY.alt) - return None #None means keep the reference allele + def filter(self, chromosome, dummySRY = None) : + if dummySRY.Qmax_gt > self.threshold : + #other possibilities of return are SequenceInsert(), SequenceDelete() + return SequenceSNP(dummySRY.alt) + return None #None means keep the reference allele To apply the filter simply specify if while loading the genome. .. code:: python - persGenome = Genome(name = 'GRCh37.75_Y-Only', SNPs = 'dummySRY', SNPFilter = QMax_gt_filter(10)) + persGenome = Genome(name = 'GRCh37.75_Y-Only', SNPs = 'dummySRY', SNPFilter = QMax_gt_filter(10)) To include several SNPSets use a list. .. code:: python - persGenome = Genome(name = 'GRCh37.75_Y-Only', SNPs = ['ARN_P1', 'ARN_P2'], SNPFilter = myFilter()) + persGenome = Genome(name = 'GRCh37.75_Y-Only', SNPs = ['ARN_P1', 'ARN_P2'], SNPFilter = myFilter()) Getting an arbitrary sequence: ------------------------------ You can ask for any sequence of any chromosome: .. code:: python - - chr12 = myGenome.get(Chromosome, number = "12")[0] - print chr12.sequence[x1:x2] - # for the reference sequence - print chr12.refSequence[x1:x2] + + chr12 = myGenome.get(Chromosome, number = "12")[0] + print(chr12.sequence[x1:x2]) + # for the reference sequence + # print(chr12.refSequence[x1:x2]) Batteries included (bootstraping): --------------------------------- @@ -211,33 +219,33 @@ pyGeno's database is populated by importing datawraps. pyGeno comes with a few data wraps, to get the list you can use: .. code:: python - - import pyGeno.bootstrap as B - B.printDatawraps() + + import pyGeno.bootstrap as B + B.printDatawraps() .. code:: - Available datawraps for boostraping - - SNPs - ~~~~| - |~~~:> Human_agnostic.dummySRY.tar.gz - |~~~:> Human.dummySRY_casava.tar.gz - |~~~:> dbSNP142_human_common_all.tar.gz - - - Genomes - ~~~~~~~| - |~~~:> Human.GRCh37.75.tar.gz - |~~~:> Human.GRCh37.75_Y-Only.tar.gz - |~~~:> Human.GRCh38.78.tar.gz - |~~~:> Mouse.GRCm38.78.tar.gz + Available datawraps for boostraping + + SNPs + ~~~~| + |~~~:> Human_agnostic.dummySRY.tar.gz + |~~~:> Human.dummySRY_casava.tar.gz + |~~~:> dbSNP142_human_common_all.tar.gz + + + Genomes + ~~~~~~~| + |~~~:> Human.GRCh37.75.tar.gz + |~~~:> Human.GRCh37.75_Y-Only.tar.gz + |~~~:> Human.GRCh38.78.tar.gz + |~~~:> Mouse.GRCm38.78.tar.gz To get a list of remote datawraps that pyGeno can download for you, do: .. code:: python - B.printRemoteDatawraps() + B.printRemoteDatawraps() Importing whole genomes is a demanding process that take more than an hour and requires (according to tests) at least 3GB of memory. Depending on your configuration, more might be required. @@ -250,24 +258,24 @@ The bootstrap module also has some handy functions for importing built-in packag Some of them just for playing around with pyGeno (**Fast importation** and **Small memory requirements**): .. code:: python - - import pyGeno.bootstrap as B + + import pyGeno.bootstrap as B - #Imports only the Y chromosome from the human reference genome GRCh37.75 - #Very fast, requires even less memory. No download required. - B.importGenome("Human.GRCh37.75_Y-Only.tar.gz") - - #A dummy datawrap for humans SNPs and Indels in pyGeno's AgnosticSNP format. - # This one has one SNP at the begining of the gene SRY - B.importSNPs("Human.dummySRY_casava.tar.gz") + #Imports only the Y chromosome from the human reference genome GRCh37.75 + #Very fast, requires even less memory. No download required. + B.importGenome("Human.GRCh37.75_Y-Only.tar.gz") + + #A dummy datawrap for humans SNPs and Indels in pyGeno's AgnosticSNP format. + # This one has one SNP at the begining of the gene SRY + B.importSNPs("Human.dummySRY_casava.tar.gz") And for more **Serious Work**, the whole reference genome. .. code:: python - #Downloads the whole genome (205MB, sequences + annotations), may take an hour or more. - B.importGenome("Human.GRCh38.78.tar.gz") - + #Downloads the whole genome (205MB, sequences + annotations), may take an hour or more. + B.importGenome("Human.GRCh38.78.tar.gz") + Importing a custom datawrap: -------------------------- @@ -294,10 +302,10 @@ For more details on how datawraps are made you can check wiki_ or have a look in Instanciating a genome: ----------------------- .. code:: python - - from pyGeno.Genome import Genome - #the name of the genome is defined inside the package's manifest.ini file - ref = Genome(name = 'GRCh37.75') + + from pyGeno.Genome import Genome + #the name of the genome is defined inside the package's manifest.ini file + ref = Genome(name = 'GRCh37.75') Printing all the proteins of a gene: ----------------------------------- @@ -321,7 +329,7 @@ then: #get returns a list of elements gene = ref.get(Gene, name = 'TPST2')[0] for prot in gene.get(Protein) : - print prot.sequence + print(prot.sequence) Making queries, get() Vs iterGet(): ----------------------------------- @@ -358,31 +366,31 @@ For example,both "AGC" and "ATG" will match the following sequence "...AT/GCCG.. .. code:: python - #returns the position of the first occurence - transcript.find("AT/GCCG") - #returns the positions of all occurences - transcript.findAll("AT/GCCG") - - #similarly, you can also do - transcript.findIncDNA("AT/GCCG") - transcript.findAllIncDNA("AT/GCCG") - transcript.findInUTR3("AT/GCCG") - transcript.findAllInUTR3("AT/GCCG") - transcript.findInUTR5("AT/GCCG") - transcript.findAllInUTR5("AT/GCCG") - - #same for proteins - protein.find("DEV/RDEM") - protein.findAll("DEV/RDEM") - - #and for exons - exon.find("AT/GCCG") - exon.findAll("AT/GCCG") - exon.findInCDS("AT/GCCG") - exon.findAllInCDS("AT/GCCG") - #... - - + #returns the position of the first occurence + transcript.find("AT/GCCG") + #returns the positions of all occurences + transcript.findAll("AT/GCCG") + + #similarly, you can also do + transcript.findIncDNA("AT/GCCG") + transcript.findAllIncDNA("AT/GCCG") + transcript.findInUTR3("AT/GCCG") + transcript.findAllInUTR3("AT/GCCG") + transcript.findInUTR5("AT/GCCG") + transcript.findAllInUTR5("AT/GCCG") + + #same for proteins + protein.find("DEV/RDEM") + protein.findAll("DEV/RDEM") + + #and for exons + exon.find("AT/GCCG") + exon.findAll("AT/GCCG") + exon.findInCDS("AT/GCCG") + exon.findAllInCDS("AT/GCCG") + #... + + Progress Bar: ------------- .. code:: python @@ -390,6 +398,6 @@ Progress Bar: from pyGeno.tools.ProgressBar import ProgressBar pg = ProgressBar(nbEpochs = 155) for i in range(155) : - pg.update(label = '%d' %i) # or simply p.update() + pg.update(label = '%d' %i) # or simply p.update() pg.close() diff --git a/pyGeno/Chromosome.py b/pyGeno/Chromosome.py index 9482d94..369b14c 100644 --- a/pyGeno/Chromosome.py +++ b/pyGeno/Chromosome.py @@ -1,19 +1,19 @@ #import copy #import types #from tools import UsefulFunctions as uf - +from icecream import ic from types import * -import configuration as conf -from pyGenoObjectBases import * +from . import configuration as conf +from .pyGenoObjectBases import * -from SNP import * -import SNPFiltering as SF +from .SNP import * +from . import SNPFiltering as SF from rabaDB.filters import RabaQuery import rabaDB.fields as rf -from tools.SecureMmap import SecureMmap as SecureMmap -from tools import SingletonManager +from .tools.SecureMmap import SecureMmap as SecureMmap +from .tools import SingletonManager import pyGeno.configuration as conf @@ -21,7 +21,6 @@ class ChrosomeSequence(object) : """Represents a chromosome sequence. If 'refOnly' no ploymorphisms are applied and the ref sequence is always returned""" def __init__(self, data, chromosome, refOnly = False) : - self.data = data self.refOnly = refOnly self.chromosome = chromosome @@ -31,13 +30,13 @@ def setSNPFilter(self, SNPFilter) : self.SNPFilter = SNPFilter def getSequenceData(self, slic) : - data = self.data[slic] + data = self.data[slic].decode('utf-8') SNPTypes = self.chromosome.genome.SNPTypes if SNPTypes is None or self.refOnly : return data iterators = [] - for setName, SNPType in SNPTypes.iteritems() : + for setName, SNPType in SNPTypes.items() : f = RabaQuery(str(SNPType), namespace = self.chromosome._raba_namespace) chromosomeNumber = self.chromosome.number @@ -47,7 +46,7 @@ def getSequenceData(self, slic) : f.addFilter({'start >=' : slic.start, 'start <' : slic.stop, 'setName' : str(setName), 'chromosomeNumber' : chromosomeNumber}) # conf.db.enableDebug(True) - iterators.append(f.iterRun(sqlTail = 'ORDER BY start')) + iterators.append(f.run(sqlTail = 'ORDER BY start', gen=True)) if len(iterators) < 1 : return data @@ -65,7 +64,7 @@ def getSequenceData(self, slic) : polys[poly.start][poly.setName].append(poly) data = list(data) - for start, setPolys in polys.iteritems() : + for start, setPolys in polys.items() : seqPos = start - slic.start sequenceModifier = self.SNPFilter.filter(self.chromosome, **setPolys) @@ -121,7 +120,7 @@ class Chromosome(pyGenoRabaObjectWrapper) : def __init__(self, *args, **kwargs) : pyGenoRabaObjectWrapper.__init__(self, *args, **kwargs) - path = '%s/chromosome%s.dat'%(self.genome.getSequencePath(), self.number) + path = '%s/chromosome%s.dat' % (self.genome.getSequencePath(), self.number) if not SingletonManager.contains(path) : datMap = SingletonManager.add(SecureMmap(path), path) else : @@ -140,9 +139,9 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) : coolArgs['species'] = self.genome.species coolArgs['chromosomeNumber'] = self.number - if len(args) > 0 and type(args[0]) is types.ListType : + if len(args) > 0 and type(args[0]) is list : for a in args[0] : - if type(a) is types.DictType : + if type(a) is dict : f.addFilter(**a) else : f.addFilter(*args, **coolArgs) diff --git a/pyGeno/Exon.py b/pyGeno/Exon.py index a08bcdc..929d090 100644 --- a/pyGeno/Exon.py +++ b/pyGeno/Exon.py @@ -1,9 +1,9 @@ -from pyGenoObjectBases import * -from SNP import SNP_INDEL - +from .pyGenoObjectBases import * +from .SNP import SNP_INDEL +from icecream import ic import rabaDB.fields as rf -from tools import UsefulFunctions as uf -from tools.BinarySequence import NucBinarySequence +from .tools import UsefulFunctions as uf +from .tools.BinarySequence import NucBinarySequence class Exon_Raba(pyGenoRabaObject) : """The wrapped Raba object that really holds the data""" @@ -63,9 +63,9 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) : coolArgs['start >='] = self.start coolArgs['start <'] = self.end - if len(args) > 0 and type(args[0]) is types.ListType : + if len(args) > 0 and type(args[0]) is list : for a in args[0] : - if type(a) is types.DictType : + if type(a) is dict : f.addFilter(**a) else : f.addFilter(*args, **coolArgs) @@ -76,7 +76,6 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) : def _load_data(self) : data = self.chromosome.getSequenceData(slice(self.start,self.end)) - diffLen = (self.end-self.start) - len(data) if self.strand == '+' : diff --git a/pyGeno/Gene.py b/pyGeno/Gene.py index bb801df..02e0753 100644 --- a/pyGeno/Gene.py +++ b/pyGeno/Gene.py @@ -1,7 +1,7 @@ -import configuration as conf +from . import configuration as conf -from pyGenoObjectBases import * -from SNP import SNP_INDEL +from .pyGenoObjectBases import * +from .SNP import SNP_INDEL import rabaDB.fields as rf @@ -40,9 +40,9 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) : coolArgs['start >='] = self.start coolArgs['start <'] = self.end - if len(args) > 0 and type(args[0]) is types.ListType : + if len(args) > 0 and type(args[0]) is list : for a in args[0] : - if type(a) is types.DictType : + if type(a) is dict : f.addFilter(**a) else : f.addFilter(*args, **coolArgs) diff --git a/pyGeno/Genome.py b/pyGeno/Genome.py index df33b52..111d9c4 100644 --- a/pyGeno/Genome.py +++ b/pyGeno/Genome.py @@ -1,15 +1,14 @@ -import types -import configuration as conf +from . import configuration as conf import pyGeno.tools.UsefulFunctions as uf -from pyGenoObjectBases import * +from .pyGenoObjectBases import * -from Chromosome import Chromosome -from Gene import Gene -from Transcript import Transcript -from Protein import Protein -from Exon import Exon -import SNPFiltering as SF -from SNP import * +from .Chromosome import Chromosome +from .Gene import Gene +from .Transcript import Transcript +from .Protein import Protein +from .Exon import Exon +from . import SNPFiltering as SF +from .SNP import * import rabaDB.fields as rf @@ -18,7 +17,7 @@ def getGenomeList() : import rabaDB.filters as rfilt f = rfilt.RabaQuery(Genome_Raba) names = [] - for g in f.iterRun() : + for g in f.run(gen=True) : names.append(g.name) return names @@ -66,7 +65,7 @@ def __init__(self, SNPs = None, SNPFilter = None, *args, **kwargs) : pyGenoRabaObjectWrapper.__init__(self, *args, **kwargs) - if type(SNPs) is types.StringType : + if type(SNPs) is str : self.SNPsSets = [SNPs] else : self.SNPsSets = SNPs @@ -102,9 +101,9 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) : f = RabaQuery(objectType, namespace = self._wrapped_class._raba_namespace) coolArgs['species'] = self.species - if len(args) > 0 and type(args[0]) is types.ListType : + if len(args) > 0 and type(args[0]) is list : for a in args[0] : - if type(a) is types.DictType : + if type(a) is dict : f.addFilter(**a) else : f.addFilter(*args, **coolArgs) diff --git a/pyGeno/Protein.py b/pyGeno/Protein.py index 278a8d8..0103391 100644 --- a/pyGeno/Protein.py +++ b/pyGeno/Protein.py @@ -1,12 +1,12 @@ -import configuration as conf +from . import configuration as conf -from pyGenoObjectBases import * -from SNP import SNP_INDEL +from .pyGenoObjectBases import * +from .SNP import SNP_INDEL import rabaDB.fields as rf -from tools import UsefulFunctions as uf -from tools.BinarySequence import AABinarySequence +from .tools import UsefulFunctions as uf +from .tools.BinarySequence import AABinarySequence import copy class Protein_Raba(pyGenoRabaObject) : @@ -43,9 +43,9 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) : coolArgs['start >='] = self.transcript.start coolArgs['start <'] = self.transcript.end - if len(args) > 0 and type(args[0]) is types.ListType : + if len(args) > 0 and type(args[0]) is list : for a in args[0] : - if type(a) is types.DictType : + if type(a) is dict : f.addFilter(**a) else : f.addFilter(*args, **coolArgs) @@ -59,12 +59,14 @@ def _load_sequences(self) : self.sequence = uf.translateDNA(self.transcript.cDNA).rstrip('*') else: self.sequence = uf.translateDNA(self.transcript.cDNA, translTable_id='mt').rstrip('*') - def getSequence(self): return self.sequence + print("jizz") + print(self.sequence) def _load_bin_sequence(self) : + ll self.bin_sequence = AABinarySequence(self.sequence) def getDefaultSequence(self) : diff --git a/pyGeno/SNP.py b/pyGeno/SNP.py index b925114..5cad4d8 100644 --- a/pyGeno/SNP.py +++ b/pyGeno/SNP.py @@ -1,8 +1,6 @@ -import types +from . import configuration as conf -import configuration as conf - -from pyGenoObjectBases import * +from .pyGenoObjectBases import * import rabaDB.fields as rf # from tools import UsefulFunctions as uf @@ -20,7 +18,7 @@ def getSNPSetsList() : import rabaDB.filters as rfilt f = rfilt.RabaQuery(SNPMaster) names = [] - for g in f.iterRun() : + for g in f.run(gen=True) : names.append(g.setName) return names diff --git a/pyGeno/SNPFiltering.py b/pyGeno/SNPFiltering.py index 126f2a8..5a23b41 100644 --- a/pyGeno/SNPFiltering.py +++ b/pyGeno/SNPFiltering.py @@ -1,8 +1,6 @@ -import types +from . import configuration as conf -import configuration as conf - -from tools import UsefulFunctions as uf +from .tools import UsefulFunctions as uf class Sequence_modifiers(object) : """Abtract Class. All sequence must inherit from me""" @@ -17,7 +15,7 @@ class SequenceSNP(Sequence_modifiers) : """Represents a SNP to be applied to the sequence""" def __init__(self, alleles, sources = {}) : Sequence_modifiers.__init__(self, sources) - if type(alleles) is types.ListType : + if type(alleles) is list : self.alleles = uf.encodePolymorphicNucleotide(''.join(alleles)) else : self.alleles = uf.encodePolymorphicNucleotide(alleles) @@ -38,7 +36,7 @@ def __init__(self, bases, sources = {}, ref = '-') : #-1 because if the insertion are after the last nuc we go out of table self.offset -= 1 else: - raise NotImplemented("This format of Insetion is not accepted. Please change your format, or implement your format in pyGeno.") + raise NotImplementedError("This format of Insetion is not accepted. Please change your format, or implement your format in pyGeno.") class SequenceDel(Sequence_modifiers) : @@ -55,7 +53,7 @@ def __init__(self, length, sources = {}, ref = None, alt = '-') : self.offset = len(alt) self.length = self.length - len(alt) else: - raise NotImplemented("This format of Deletion is not accepted. Please change your format, or implement your format in pyGeno.") + raise NotImplementedError("This format of Deletion is not accepted. Please change your format, or implement your format in pyGeno.") else: raise Exception("You need to add a ref sequence in your call of SequenceDel. Or implement your format in pyGeno.") @@ -68,7 +66,7 @@ def __init__(self) : pass def filter(self, chromosome, **kwargs) : - raise NotImplemented("Must be implemented in child") + raise NotImplementedError("Must be implemented in child") class DefaultSNPFilter(SNPFilter) : """ @@ -124,7 +122,7 @@ def appendAllele(alleles, sources, snp) : warn = 'Warning: the default snp filter ignores indels. IGNORED %s of SNP set: %s at pos: %s of chromosome: %s' sources = {} alleles = [] - for snpSet, data in kwargs.iteritems() : + for snpSet, data in kwargs.items() : if type(data) is list : for snp in data : alleles, sources = appendAllele(alleles, sources, snp) diff --git a/pyGeno/Transcript.py b/pyGeno/Transcript.py index fafa26f..9637b81 100644 --- a/pyGeno/Transcript.py +++ b/pyGeno/Transcript.py @@ -1,14 +1,14 @@ -import configuration as conf +from . import configuration as conf -from pyGenoObjectBases import * +from .pyGenoObjectBases import * import rabaDB.fields as rf +from icecream import ic +from .tools import UsefulFunctions as uf +from .Exon import * +from .SNP import SNP_INDEL -from tools import UsefulFunctions as uf -from Exon import * -from SNP import SNP_INDEL - -from tools.BinarySequence import NucBinarySequence +from .tools.BinarySequence import NucBinarySequence class Transcript_Raba(pyGenoRabaObject) : @@ -69,9 +69,9 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) : coolArgs['start >='] = self.start coolArgs['start <'] = self.end - if len(args) > 0 and type(args[0]) is types.ListType : + if len(args) > 0 and type(args[0]) is list : for a in args[0] : - if type(a) is types.DictType : + if type(a) is dict : f.addFilter(**a) else : f.addFilter(*args, **coolArgs) @@ -114,7 +114,6 @@ def setV(k, v) : if e.CDS[ajusted_position] == 'T': e.CDS = list(e.CDS) e.CDS[ajusted_position] = '!' - if len(cDNA) == 0 and e.frame != 0 : e.CDS = e.CDS[e.frame:] @@ -122,6 +121,7 @@ def setV(k, v) : e.CDS_start += e.frame else: e.CDS_end -= e.frame + if len(e.CDS): cDNA.append(''.join(e.CDS)) @@ -164,7 +164,7 @@ def getCodon(self, i) : def iterCodons(self) : """iterates through the codons""" - for i in range(len(self.cDNA)/3) : + for i in range(len(self.cDNA)//3) : yield self.getCodon(i) def find(self, sequence) : diff --git a/pyGeno/__init__.py b/pyGeno/__init__.py index 1e82505..26b581f 100755 --- a/pyGeno/__init__.py +++ b/pyGeno/__init__.py @@ -1,4 +1,4 @@ __all__ = ['Genome', 'Chromosome', 'Gene', 'Transcript', 'Exon', 'Protein', 'SNP'] -from configuration import pyGeno_init +from .configuration import pyGeno_init pyGeno_init() diff --git a/pyGeno/bootstrap.py b/pyGeno/bootstrap.py index ee86228..97b394a 100644 --- a/pyGeno/bootstrap.py +++ b/pyGeno/bootstrap.py @@ -1,7 +1,8 @@ import pyGeno.importation.Genomes as PG import pyGeno.importation.SNPs as PS from pyGeno.tools.io import printf -import os, tempfile, urllib, urllib2, json +import os, tempfile, json +import urllib.request, urllib.parse, urllib.error, urllib.request, urllib.error, urllib.parse import pyGeno.configuration as conf this_dir, this_filename = os.path.split(__file__) @@ -10,48 +11,48 @@ def listRemoteDatawraps(location = conf.pyGeno_REMOTE_LOCATION) : """Lists all the datawraps availabe from a remote a remote location.""" + print(location) loc = location + "/datawraps.json" - response = urllib2.urlopen(loc) + response = urllib.request.urlopen(loc) js = json.loads(response.read()) return js def printRemoteDatawraps(location = conf.pyGeno_REMOTE_LOCATION) : """ - print all available datawraps from a remote location the location must have a datawraps.json in the following format:: - - { - "Ordered": { - "Reference genomes": { - "Human" : ["GRCh37.75", "GRCh38.78"], - "Mouse" : ["GRCm38.78"], - }, - "SNPs":{ - } + print all available datawraps from a remote location the location must have a datawraps.json in the following format:: + + { + "Ordered": { + "Reference genomes": { + "Human" : ["GRCh37.75", "GRCh38.78"], + "Mouse" : ["GRCm38.78"] }, - "Flat":{ - "Reference genomes": { - "GRCh37.75": "Human.GRCh37.75.tar.gz", - "GRCh38.78": "Human.GRCh37.75.tar.gz", - "GRCm38.78": "Mouse.GRCm38.78.tar.gz" - }, - "SNPs":{ + "SNPs":{ } + }, + "Flat":{ + "Reference genomes": { + "GRCh37.75": "Human.GRCh37.75.tar.gz", + "GRCh38.78": "Human.GRCh37.75.tar.gz", + "GRCm38.78": "Mouse.GRCm38.78.tar.gz" + }, + "SNPs":{ } } - + """ l = listRemoteDatawraps(location) printf("Available datawraps for bootstraping\n") - print json.dumps(l["Ordered"], sort_keys=True, indent=4, separators=(',', ': ')) + print(json.dumps(l["Ordered"], sort_keys=True, indent=4, separators=(',', ': '))) def _DW(name, url) : packageDir = tempfile.mkdtemp(prefix = "pyGeno_remote_") printf("~~~:>\n\tDownloading datawrap: %s..." % name) finalFile = os.path.normpath('%s/%s' %(packageDir, name)) - urllib.urlretrieve (url, finalFile) + urllib.request.urlretrieve (url, finalFile) printf('\tdone.\n~~~:>') return finalFile @@ -92,7 +93,7 @@ def printDatawraps() : """print all available datawraps for bootstraping""" l = listDatawraps() printf("Available datawraps for boostraping\n") - for k, v in l.iteritems() : + for k, v in l.items() : printf(k) printf("~"*len(k) + "|") for vv in v : @@ -107,4 +108,4 @@ def importGenome(name, batchSize = 100) : def importSNPs(name) : """Import a SNP set shipped with pyGeno. Most of the datawraps only contain URLs towards data provided by third parties.""" path = os.path.join(this_dir, "bootstrap_data", "SNPs/" + name) - PS.importSNPs(path) \ No newline at end of file + PS.importSNPs(path) diff --git a/pyGeno/bootstrap_data/SNPs/__init__.py b/pyGeno/bootstrap_data/SNPs/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/pyGeno/bootstrap_data/__init__.py b/pyGeno/bootstrap_data/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/pyGeno/bootstrap_data/genomes/Human.GRCh38.98.tar.gz b/pyGeno/bootstrap_data/genomes/Human.GRCh38.98.tar.gz new file mode 100644 index 0000000..9b6e6ef Binary files /dev/null and b/pyGeno/bootstrap_data/genomes/Human.GRCh38.98.tar.gz differ diff --git a/pyGeno/bootstrap_data/genomes/Mouse.GRCm38.98.tar.gz b/pyGeno/bootstrap_data/genomes/Mouse.GRCm38.98.tar.gz new file mode 100644 index 0000000..bb5e2d4 Binary files /dev/null and b/pyGeno/bootstrap_data/genomes/Mouse.GRCm38.98.tar.gz differ diff --git a/pyGeno/bootstrap_data/genomes/__init__.py b/pyGeno/bootstrap_data/genomes/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/pyGeno/bootstrap_data/genomes/datawraps.json b/pyGeno/bootstrap_data/genomes/datawraps.json new file mode 100644 index 0000000..7000c9b --- /dev/null +++ b/pyGeno/bootstrap_data/genomes/datawraps.json @@ -0,0 +1,22 @@ +{ + "Ordered": { + "Reference genomes": { + "Human" : ["GRCh37.75", "GRCh38.78", "GRCh38.98"], + "Mouse" : ["GRCm38.78", "GRCm38.98"] + }, + "SNPs":{ + } + }, + "Flat":{ + "Reference genomes": { + "GRCh37.75": "Human.GRCh37.75.tar.gz", + "GRCh38.78": "Human.GRCh38.78.tar.gz", + "GRCh38.98": "Human.GRCh38.98.tar.gz", + "GRCm38.78": "Mouse.GRCm38.78.tar.gz", + "GRCm38.98": "Mouse.GRCm38.98.tar.gz" + }, + "SNPs":{ + } + } +} + diff --git a/pyGeno/configuration.py b/pyGeno/configuration.py index e3ad507..be68b33 100644 --- a/pyGeno/configuration.py +++ b/pyGeno/configuration.py @@ -1,5 +1,5 @@ import sys, os, time -from ConfigParser import SafeConfigParser +from configparser import ConfigParser import rabaDB.rabaSetup import rabaDB.Raba @@ -20,7 +20,7 @@ class PythonVersionError(Exception) : pyGeno_SETTINGS_PATH = None pyGeno_RABA_DBFILE = None pyGeno_DATA_PATH = None -pyGeno_REMOTE_LOCATION = 'http://bioinfo.iric.ca/~daoudat/pyGeno_datawraps' +pyGeno_REMOTE_LOCATION = 'http://bioinfo.iric.ca/~feghalya/pyGeno_datawraps' db = None #proxy for the raba database dbConf = None #proxy for the raba database configuration @@ -34,9 +34,9 @@ def prettyVersion() : return "pyGeno %s Branch: %s, Name: %s, Release Level: %s, Version: %s, Build time: %s" % version() def checkPythonVersion() : - """pyGeno needs python 2.7+""" + """pyGeno needs python 3.5+""" - if sys.version_info[0] < 2 or (sys.version_info[0] > 2 and sys.version_info[1] < 7) : + if sys.version_info[0] < 3 or (sys.version_info[0] == 3 and sys.version_info[1] < 5) : return False return True @@ -52,7 +52,7 @@ def createDefaultConfigFile() : def getSettingsPath() : """Returns the path where the settings are stored""" - parser = SafeConfigParser() + parser = ConfigParser() try : parser.read(os.path.normpath(pyGeno_SETTINGS_DIR+'/config.ini')) return parser.get('pyGeno_config', 'settings_dir') @@ -83,7 +83,7 @@ def pyGeno_init() : global pyGeno_DATA_PATH if not checkPythonVersion() : - raise PythonVersionError("==> FATAL: pyGeno only works with python 2.7 and above, please upgrade your python version") + raise PythonVersionError("==> FATAL: pyGeno only works with python 3.5 and above, please upgrade your python version") if not os.path.exists(pyGeno_SETTINGS_DIR) : os.makedirs(pyGeno_SETTINGS_DIR) @@ -101,4 +101,4 @@ def pyGeno_init() : #launching the db rabaDB.rabaSetup.RabaConfiguration(pyGeno_RABA_NAMESPACE, pyGeno_RABA_DBFILE) db = rabaDB.rabaSetup.RabaConnection(pyGeno_RABA_NAMESPACE) - dbConf = rabaDB.rabaSetup.RabaConfiguration(pyGeno_RABA_NAMESPACE) \ No newline at end of file + dbConf = rabaDB.rabaSetup.RabaConfiguration(pyGeno_RABA_NAMESPACE) diff --git a/pyGeno/doc/source/_static/logo.png b/pyGeno/doc/source/_static/logo.png new file mode 100644 index 0000000..a0f2046 Binary files /dev/null and b/pyGeno/doc/source/_static/logo.png differ diff --git a/pyGeno/doc/source/conf.py b/pyGeno/doc/source/conf.py index dd961ff..3aa2fbe 100644 --- a/pyGeno/doc/source/conf.py +++ b/pyGeno/doc/source/conf.py @@ -54,17 +54,17 @@ master_doc = 'index' # General information about the project. -project = u'pyGeno' -copyright = u'2014, Tariq Daouda' +project = 'pyGeno' +copyright = '2020, Tariq Daouda' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '1.x' +version = '2.x' # The full version, including alpha/beta/rc tags. -release = '1.2.x' +release = '2.0.x' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -137,7 +137,7 @@ # The name of an image file (relative to this directory) to place at the top # of the sidebar. -html_logo = "logo.png" +html_logo = "_static/logo.png" # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 @@ -232,8 +232,8 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'pyGeno.tex', u'pyGeno Documentation', - u'Tariq Daouda', 'manual'), + ('index', 'pyGeno.tex', 'pyGeno Documentation', + 'Tariq Daouda', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -262,8 +262,8 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'pygeno', u'pyGeno Documentation', - [u'Tariq Daouda'], 1) + ('index', 'pygeno', 'pyGeno Documentation', + ['Tariq Daouda'], 1) ] # If true, show URL addresses after external links. @@ -276,8 +276,8 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'pyGeno', u'pyGeno Documentation', - u'Tariq Daouda', 'pyGeno', 'One line description of project.', + ('index', 'pyGeno', 'pyGeno Documentation', + 'Tariq Daouda', 'pyGeno', 'One line description of project.', 'Miscellaneous'), ] @@ -297,10 +297,10 @@ # -- Options for Epub output ---------------------------------------------- # Bibliographic Dublin Core info. -epub_title = u'pyGeno' -epub_author = u'Tariq Daouda' -epub_publisher = u'Tariq Daouda' -epub_copyright = u'2014, Tariq Daouda' +epub_title = 'pyGeno' +epub_author = 'Tariq Daouda' +epub_publisher = 'Tariq Daouda' +epub_copyright = '2014, Tariq Daouda' # The basename for the epub file. It defaults to the project name. #epub_basename = u'pyGeno' diff --git a/pyGeno/doc/source/index.rst b/pyGeno/doc/source/index.rst index 5dd9de8..976b21f 100644 --- a/pyGeno/doc/source/index.rst +++ b/pyGeno/doc/source/index.rst @@ -3,17 +3,14 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -.. image:: http://bioinfo.iric.ca/~daoudat/pyGeno/_static/logo.png +.. image:: _static/logo.png :alt: pyGeno's logo pyGeno: A Python package for precision medicine and proteogenomics =================================================================== -.. image:: http://depsy.org/api/package/pypi/pyGeno/badge.svg - :alt: depsy - :target: http://depsy.org/package/python/pyGeno .. image:: https://img.shields.io/badge/License-Apache%202.0-blue.svg :target: https://opensource.org/licenses/Apache-2.0 -.. image:: https://img.shields.io/badge/python-2.7-blue.svg +.. image:: https://img.shields.io/badge/python-3.5-blue.svg pyGeno's `lair is on Github`_. @@ -42,27 +39,28 @@ be able cope with huge queries. .. code:: - from pyGeno.Genome import * + from pyGeno.Genome import * - g = Genome(name = "GRCh37.75") - prot = g.get(Protein, id = 'ENSP00000438917')[0] - #print the protein sequence - print prot.sequence - #print the protein's gene biotype - print prot.gene.biotype - #print protein's transcript sequence - print prot.transcript.sequence + # In case the Genome cannot be found (i.e., KeyError), please consider the below section on bootstrapping + g = Genome(name = "GRCh37.75") + prot = g.get(Protein, id = 'ENSP00000438917')[0] + #print the protein sequence + print(prot.sequence) + #print the protein's gene biotype + print(prot.gene.biotype) + #print protein's transcript sequence + print(prot.transcript.sequence) - #fancy queries - for exons in g.get(Exons, {"CDS_start >": x1, "CDS_end <=" : x2, "chromosome.number" : "22"}) : - #print the exon's coding sequence - print exon.CDS - #print the exon's transcript sequence - print exon.transcript.sequence + #fancy queries + for exons in g.get(Exons, {"CDS_start >": x1, "CDS_end <=" : x2, "chromosome.number" : "22"}) : + #print the exon's coding sequence + print(exon.CDS) + #print the exon's transcript sequence + print(exon.transcript.sequence) - #You can do the same for your subject specific genomes - #by combining a reference genome with polymorphisms - g = Genome(name = "GRCh37.75", SNPs = ["STY21_RNA"], SNPFilter = MyFilter()) + #You can do the same for your subject specific genomes + #by combining a reference genome with polymorphisms + g = Genome(name = "GRCh37.75", SNPs = ["STY21_RNA"], SNPFilter = MyFilter()) Verbose Introduction @@ -81,7 +79,7 @@ pyGeno is a python package that was designed to be: * Fast to install. It has no dependencies but its own backend: `rabaDB`_. * Fast to run and memory efficient, so you can use it on your laptop. * Fast to use. No queries to foreign APIs all the data rests on your computer, so it is readily accessible when you need it. -* Fast to learn. One single function **get()** can do the job of several other tools at once. +* Fast to learn. One single function **get()** can do the job of several other tools at once. It also comes with: @@ -107,7 +105,7 @@ Contents: .. toctree:: :maxdepth: 2 - + publications quickstart installation @@ -119,6 +117,7 @@ Contents: snp_filter tools parsers + citing Indices and tables ================== diff --git a/pyGeno/doc/source/installation.rst b/pyGeno/doc/source/installation.rst index bafbf9e..b16a583 100644 --- a/pyGeno/doc/source/installation.rst +++ b/pyGeno/doc/source/installation.rst @@ -25,16 +25,16 @@ If you're more adventurous, the bleeding edge version is available from github ( Windows ------- -* Goto: https://www.python.org/downloads/ and download the installer for the lastest version of python 2.7 +* Goto: https://www.python.org/downloads/ and download the installer for the lastest version of python 3.5 * Double click on the installer to begin installation * Click on the windows start menu * Type *"cmd"* and click on it to launch the command line interface * In the command line interface type:: - cd C:\Python27\Scripts + cd C:\Python35\Scripts * Now type: pip install pyGeno -* Now click on the windows start menu. In the python 2.7 menu you can either launch *Python (Command line)* or *IDLE (Python GUI)* +* Now click on the windows start menu. In the python 3.5 menu you can either launch *Python (Command line)* or *IDLE (Python GUI)* * You can now go to: http://pygeno.iric.ca/quickstart.html and type the commands into either one of them **UPGRADE:** to upgrade pyGeno to the latest version, launch *cmd* and type:: diff --git a/pyGeno/doc/source/objects.rst b/pyGeno/doc/source/objects.rst index fd2ed98..7ece447 100644 --- a/pyGeno/doc/source/objects.rst +++ b/pyGeno/doc/source/objects.rst @@ -6,42 +6,42 @@ Base classes ------------- Base classes are abstract and are not meant to be instanciated, they nonetheless implement most of the functions that classes such as Genome possess. -.. automodule:: pyGenoObjectBases +.. automodule:: pyGeno.pyGenoObjectBases :members: Genome ------- -.. automodule:: Genome +.. automodule:: pyGeno.Genome :members: Chromosome ---------- -.. automodule:: Chromosome +.. automodule:: pyGeno.Chromosome :members: Gene ---- -.. automodule:: Gene +.. automodule:: pyGeno.Gene :members: Transcript ---------- -.. automodule:: Transcript +.. automodule:: pyGeno.Transcript :members: Exon ---- -.. automodule:: Exon +.. automodule:: pyGeno.Exon :members: Protein ------- -.. automodule:: Protein +.. automodule:: pyGeno.Protein :members: SNP --- -.. automodule:: SNP +.. automodule:: pyGeno.SNP :members: diff --git a/pyGeno/doc/source/quickstart.rst b/pyGeno/doc/source/quickstart.rst index 8a3e17e..d2bf20a 100644 --- a/pyGeno/doc/source/quickstart.rst +++ b/pyGeno/doc/source/quickstart.rst @@ -28,7 +28,9 @@ pyGeno comes with a few datawraps, to get the list you can use: |~~~:> Human.GRCh37.75.tar.gz |~~~:> Human.GRCh37.75_Y-Only.tar.gz |~~~:> Human.GRCh38.78.tar.gz + |~~~:> Human.GRCh38.98.tar.gz |~~~:> Mouse.GRCm38.78.tar.gz + |~~~:> Mouse.GRCm38.98.tar.gz To get a list of remote datawraps that pyGeno can download for you, do: @@ -77,7 +79,7 @@ That's it, you can now print the sequences of all the proteins that a gene can p #get returns a list of elements gene = ref.get(Gene, name = 'SRY')[0] for prot in gene.get(Protein) : - print prot.sequence + print(prot.sequence) You can see pyGeno achitecture as a graph where everything is connected to everything. For instance you can do things such as:: @@ -89,6 +91,9 @@ You can see pyGeno achitecture as a graph where everything is connected to every Queries ------- +Note that the way queries are handled is changing + Since pyGeno v1.4 the default method is to use generators + PyGeno allows for several kinds of queries, here are some snippets:: #in this case both queries will yield the same result @@ -109,10 +114,13 @@ To know the available fields for queries, there's a "help()" function:: Faster queries --------------- -To speed up loops use iterGet():: +Note that the way queries are handled is changing + Since pyGeno v1.4 the default method is to use generators + +To speed up loops use get(gen=True):: - for prot in gene.iterGet(Protein) : - print prot.sequence + for prot in gene.get(Protein, gen=True) : + print(prot.sequence) For more speed create indexes on the fields you need the most:: @@ -120,6 +128,7 @@ For more speed create indexes on the fields you need the most:: Getting sequences + ------------------- Anything that has a sequence can be indexed using the usual python list syntax:: diff --git a/pyGeno/doc/source/snp_filter.rst b/pyGeno/doc/source/snp_filter.rst index b9eaaa9..faa6de5 100644 --- a/pyGeno/doc/source/snp_filter.rst +++ b/pyGeno/doc/source/snp_filter.rst @@ -4,5 +4,5 @@ Filtering Polymorphisms (SNPs and Indels) Polymorphism filtering is an important part of personalized genomes. By creating your own filters you can easily taylor personalized genomes to your needs. The importaant thing to understand about the filtering process, is that it gives you complete freedom about should be inserted. Once pyGeno finds a polymorphism, it automatically triggers the filter to know what should be inserted at this position, and that can be anything you choose. -.. automodule:: SNPFiltering +.. automodule:: pyGeno.SNPFiltering :members: diff --git a/pyGeno/examples/genomic_graph.ipynb b/pyGeno/examples/genomic_graph.ipynb new file mode 100644 index 0000000..b6257a9 --- /dev/null +++ b/pyGeno/examples/genomic_graph.ipynb @@ -0,0 +1,288 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pyGeno.Genome as pgg\n", + "import pyGeno.tools.UsefulFunctions as uf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize genome and select gene" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "genome_name = 'GRCh38.98'\n", + "gene_name = 'POMP'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.91 ms, sys: 0 ns, total: 1.91 ms\n", + "Wall time: 15.8 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "ref = pgg.Genome(name=genome_name)\n", + "gene = ref.get(pgg.Gene, name=gene_name, gen=False)[0] # gen=False returns list, not generator" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Strand: +\n" + ] + } + ], + "source": [ + "print('Strand:', gene.strand)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## From transcripts to exons" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ENST00000460403\n", + " > ENSE00003538396\n", + " > ENSE00001927741\n", + " > ENSE00003641576\n", + " > ENSE00003658469\n", + " > ENSE00003634423\n", + " > ENSE00003676638\n", + " > ENSE00003621088\n", + "ENST00000380842\n", + " > ENSE00003667813\n", + " > ENSE00003577317\n", + " > ENSE00003620057\n", + " > ENSE00003521643\n", + " > ENSE00003676638\n", + " > ENSE00003849438\n", + "CPU times: user 13.5 ms, sys: 1.94 ms, total: 15.4 ms\n", + "Wall time: 48.1 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "for transcript in gene.get(pgg.Transcript):\n", + " print(transcript.id)\n", + " for exon in transcript.get(pgg.Exon, gen=True):\n", + " print(\" >\", exon.id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## From exons to transcripts" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'CDS': defaultdict(list,\n", + " {'ENSE00003634423': ['ENST00000460403'],\n", + " 'ENSE00003676638': ['ENST00000460403', 'ENST00000380842'],\n", + " 'ENSE00003621088': ['ENST00000460403'],\n", + " 'ENSE00003667813': ['ENST00000380842'],\n", + " 'ENSE00003577317': ['ENST00000380842'],\n", + " 'ENSE00003620057': ['ENST00000380842'],\n", + " 'ENSE00003521643': ['ENST00000380842'],\n", + " 'ENSE00003849438': ['ENST00000380842']}),\n", + " 'NotCDS': defaultdict(list,\n", + " {'ENSE00003538396': ['ENST00000460403'],\n", + " 'ENSE00001927741': ['ENST00000460403'],\n", + " 'ENSE00003641576': ['ENST00000460403'],\n", + " 'ENSE00003658469': ['ENST00000460403']})}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from collections import defaultdict\n", + "exon_dict = {'CDS': defaultdict(list), 'NotCDS': defaultdict(list)}\n", + "for exon in gene.get(pgg.Exon, gen=True):\n", + " exon_dict['CDS' if exon.hasCDS() else 'NotCDS'][exon.id].append(exon.transcript.id)\n", + "exon_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Choose a coding exon" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# choose a coding exon\n", + "exon_id = list(exon_dict['CDS'].keys())[0]\n", + "exon = gene.get(pgg.Exon, id=exon_id, gen=False)[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Peep into the object structure" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "UTR5: TTCCAGCTCAACCAAGATAAA\n", + "CDS: ATGAATTTTTCCACACTGAGAAACATTCAGGGTCTATTTGCTCCGCTAAAATTACAGATGGAATTCAAGGCAGTGCAGCAG\n", + "UTR3: \n", + "\n", + "sequence: TTCCAGCTCAACCAAGATAAAATGAATTTTTCCACACTGAGAAACATTCAGGGTCTATTTGCTCCGCTAAAATTACAGATGGAATTCAAGGCAGTGCAGCAG\n" + ] + } + ], + "source": [ + "print('UTR5:', exon.UTR5)\n", + "print('CDS:', exon.CDS) \n", + "print('UTR3:', exon.UTR3)\n", + "print()\n", + "print('sequence:', exon.sequence)\n", + "\n", + "assert exon.sequence == ''.join(exon.UTR5 + exon.CDS + exon.UTR3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Translate in 6 frames" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('MNFSTLRNIQGLFAPLKLQMEFKAVQQ',\n", + " '*IFPH*ETFRVYLLR*NYRWNSRQCS',\n", + " 'EFFHTEKHSGSICSAKITDGIQGSAA',\n", + " 'LLHCLEFHL*F*RSK*TLNVSQCGKIH',\n", + " 'CCTALNSICNFSGANRP*MFLSVEKF',\n", + " 'AALP*IPSVILAEQIDPECFSVWKNS')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "uf.translateDNA_6Frames(exon.CDS)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Easily get the protein sequence corresponding to the transcript containing that exon" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'MNFSTLRNIQGLFAPLKLQMEFKAVQQVQRLPFLSSSNLSLDVLRGNDETIGFEDILNDPSQSEVMGEPHLMVEYKLGLL'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exon.transcript.protein.sequence" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pygeno (Python 3.7.1)", + "language": "python", + "name": "pygeno-python3.7.1" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/pyGeno/examples/snps_queries.py b/pyGeno/examples/snps_queries.py index f195891..adabe0e 100644 --- a/pyGeno/examples/snps_queries.py +++ b/pyGeno/examples/snps_queries.py @@ -8,33 +8,32 @@ from pyGeno.SNPFiltering import SequenceSNP def printing(gene) : - print "printing reference sequences\n-------" + print("printing reference sequences\n-------") for trans in gene.get(Transcript) : - print "\t-Transcript name:", trans.name - print "\t-Protein:", trans.protein.sequence - print "\t-Exons:" + print("\t-Transcript name:", trans.name) + print("\t-Protein:", trans.protein.sequence) + print("\t-Exons:") for e in trans.exons : - print "\t\t Number:", e.number - print "\t\t-CDS:", e.CDS - print "\t\t-Strand:", e.strand - print "\t\t-CDS_start:", e.CDS_start - print "\t\t-CDS_end:", e.CDS_end + print("\t\t Number:", e.number) + print("\t\t-CDS:", e.CDS) + print("\t\t-Strand:", e.strand) + print("\t\t-CDS_start:", e.CDS_start) + print("\t\t-CDS_end:", e.CDS_end) def printVs(refGene, presGene) : - print "Vs personalized sequences\n------" + print("Vs personalized sequences\n------") - #iterGet returns an iterator instead of a list (faster) - for trans in presGene.iterGet(Transcript) : + for trans in presGene.get(Transcript, gen=True): refProt = refGene.get(Protein, id = trans.protein.id)[0] persProt = trans.protein - print persProt.id - print "\tref:" + refProt.sequence[:20] + "..." - print "\tper:" + persProt.sequence[:20] + "..." - print + print(persProt.id) + print("\tref:" + refProt.sequence[:20] + "...") + print("\tper:" + persProt.sequence[:20] + "...") + print() def fancyExonQuery(gene) : e = gene.get(Exon, {'CDS_start >' : 2655029, 'CDS_end <' : 2655200})[0] - print "An exon with a CDS in ']2655029, 2655200[':", e.id + print("An exon with a CDS in ']2655029, 2655200[':", e.id) class QMax_gt_filter(SNPFilter) : @@ -55,7 +54,7 @@ def filter(self, chromosome, dummySRY) : persGene = persGenome.get(Gene, name = 'SRY')[0] printing(geneRef) - print "\n" + print("\n") printVs(geneRef, persGene) - print "\n" + print("\n") fancyExonQuery(geneRef) diff --git a/pyGeno/importation/Genomes.py b/pyGeno/importation/Genomes.py index d992832..b0b5b84 100644 --- a/pyGeno/importation/Genomes.py +++ b/pyGeno/importation/Genomes.py @@ -1,6 +1,7 @@ -import os, glob, gzip, tarfile, shutil, time, sys, gc, cPickle, tempfile, urllib2 +import os, glob, gzip, tarfile, shutil, time, sys, gc, pickle, tempfile +import urllib.request, urllib.error, urllib.parse from contextlib import closing -from ConfigParser import SafeConfigParser +from configparser import ConfigParser from pyGeno.tools.ProgressBar import ProgressBar import pyGeno.configuration as conf @@ -44,10 +45,10 @@ def _getFile(fil, directory) : if fil.find("http://") == 0 or fil.find("ftp://") == 0 : printf("Downloading file: %s..." % fil) finalFile = os.path.normpath('%s/%s' %(directory, fil.split('/')[-1])) - # urllib.urlretrieve (fil, finalFile) - with closing(urllib2.urlopen(fil)) as r: - with open(finalFile, 'wb') as f: - shutil.copyfileobj(r, f) + urllib.request.urlretrieve (fil, finalFile) + #with closing(urllib.request.urlopen(fil)) as r: + # with open(finalFile, 'wb') as f: + # shutil.copyfileobj(r, f) printf('done.') else : @@ -57,7 +58,6 @@ def _getFile(fil, directory) : def deleteGenome(species, name) : """Removes a genome from the database""" - printf('deleting genome (%s, %s)...' % (species, name)) conf.db.beginTransaction() @@ -71,7 +71,7 @@ def deleteGenome(species, name) : pBar.update() f = RabaQuery(typ, namespace = genome._raba_namespace) f.addFilter({'genome' : genome}) - for e in f.iterRun() : + for e in f.run(gen=True) : objs.append(e) pBar.close() @@ -146,7 +146,7 @@ def reformatItems(items) : isDir = True packageDir = packageFile - parser = SafeConfigParser() + parser = ConfigParser() parser.read(os.path.normpath(packageDir+'/manifest.ini')) packageInfos = parser.items('package_infos') @@ -215,16 +215,16 @@ def __init__(self, conf) : def batch_save(self) : self.conf.db.beginTransaction() - for c in self.genes.itervalues() : + for c in self.genes.values() : c.save() conf.removeFromDBRegistery(c) - for c in self.transcripts.itervalues() : + for c in self.transcripts.values() : c.save() conf.removeFromDBRegistery(c.exons) conf.removeFromDBRegistery(c) - for c in self.proteins.itervalues() : + for c in self.proteins.values() : c.save() conf.removeFromDBRegistery(c) @@ -244,7 +244,7 @@ def batch_save(self) : def save_chros(self) : pBar = ProgressBar(nbEpochs = len(self.chromosomes)) - for c in self.chromosomes.itervalues() : + for c in self.chromosomes.values() : pBar.update(label = 'Chr %s' % c.number) c.save() pBar.close() @@ -253,7 +253,7 @@ def save_chros(self) : printf('Backuping indexes...') indexes = conf.db.getIndexes() - printf("Droping all your indexes, (don't worry i'll restore them later)...") + printf("Dropping all your indexes (don't worry I'll restore them later)...") Genome_Raba.flushIndexes() Chromosome_Raba.flushIndexes() Gene_Raba.flushIndexes() @@ -306,9 +306,9 @@ def save_chros(self) : printf('\tGene %s, %s...' % (geneId, geneName)) store.genes[geneId] = Gene_Raba() store.genes[geneId].set(genome = genome, id = geneId, chromosome = store.chromosomes[chroNumber], name = geneName, strand = strand, biotype = gene_biotype) - if start < store.genes[geneId].start or store.genes[geneId].start is None : + if store.genes[geneId].start is None or start < store.genes[geneId].start: store.genes[geneId].start = start - if end > store.genes[geneId].end or store.genes[geneId].end is None : + if store.genes[geneId].end is None or end > store.genes[geneId].end: store.genes[geneId].end = end try : transId = line['transcript_id'] @@ -329,9 +329,9 @@ def save_chros(self) : printf('\t\tTranscript %s, %s...' % (transId, transName)) store.transcripts[transId] = Transcript_Raba() store.transcripts[transId].set(genome = genome, id = transId, chromosome = store.chromosomes[chroNumber], gene = store.genes.get(geneId, None), name = transName, biotype=transcript_biotype) - if start < store.transcripts[transId].start or store.transcripts[transId].start is None: + if store.transcripts[transId].start is None or start < store.transcripts[transId].start: store.transcripts[transId].start = start - if end > store.transcripts[transId].end or store.transcripts[transId].end is None: + if store.transcripts[transId].end is None or end > store.transcripts[transId].end: store.transcripts[transId].end = end try : @@ -376,9 +376,9 @@ def save_chros(self) : pass if regionType == 'exon' : - if start < store.exons[exonKey].start or store.exons[exonKey].start is None: + if store.exons[exonKey].start is None or start < store.exons[exonKey].start: store.exons[exonKey].start = start - if end > store.transcripts[transId].end or store.exons[exonKey].end is None: + if store.exons[exonKey].end is None or end > store.transcripts[transId].end: store.exons[exonKey].end = end elif regionType == 'CDS' : store.exons[exonKey].CDS_start = start @@ -439,13 +439,13 @@ def save_chros(self) : printf('commiting changes...') conf.db.endTransaction() - return store.chromosomes.values() + return list(store.chromosomes.values()) #~ @profile def _importSequence(chromosome, fastaFile, targetDir) : "Serializes fastas into .dat files" - f = gzip.open(fastaFile) + f = gzip.open(fastaFile, 'rt') header = f.readline() strRes = f.read().upper().replace('\n', '').replace('\r', '') f.close() diff --git a/pyGeno/importation/SNPs.py b/pyGeno/importation/SNPs.py index 606a5cf..4898dea 100644 --- a/pyGeno/importation/SNPs.py +++ b/pyGeno/importation/SNPs.py @@ -1,11 +1,11 @@ -import urllib, shutil +import urllib.request, urllib.parse, urllib.error, shutil -from ConfigParser import SafeConfigParser +from configparser import ConfigParser import pyGeno.configuration as conf from pyGeno.SNP import * from pyGeno.tools.ProgressBar import ProgressBar from pyGeno.tools.io import printf -from Genomes import _decompressPackage, _getFile +from .Genomes import _decompressPackage, _getFile from pyGeno.tools.parsers.CasavaTools import SNPsTxtFile from pyGeno.tools.parsers.VCFTools import VCFFile @@ -43,7 +43,7 @@ def importSNPs(packageFile) : if not os.path.isfile(fpMan) : raise ValueError("Not file named manifest.ini! Mais quel SCANDALE!!!!") - parser = SafeConfigParser() + parser = ConfigParser() parser.read(os.path.normpath(packageDir+'/manifest.ini')) packageInfos = parser.items('package_infos') diff --git a/pyGeno/pyGenoObjectBases.py b/pyGeno/pyGenoObjectBases.py index 504c2f2..b74d6da 100644 --- a/pyGeno/pyGenoObjectBases.py +++ b/pyGeno/pyGenoObjectBases.py @@ -1,5 +1,5 @@ -import time, types, string -import configuration as conf +import time, warnings +from . import configuration as conf from rabaDB.rabaSetup import * from rabaDB.Raba import * from rabaDB.filters import RabaQuery @@ -60,12 +60,11 @@ def __getattr__(self, name) : rl = object.__getattribute__(self, 'rl') return getattr(rl, name) -class pyGenoRabaObjectWrapper(object) : +class pyGenoRabaObjectWrapper(object, metaclass=pyGenoRabaObjectWrapper_metaclass) : """All the wrapper classes such as Genome and Chromosome inherit from this class. It has most that make pyGeno useful, such as get(), count(), ensureIndex(). This class is to be considered abstract, and is not meant to be instanciated""" - __metaclass__ = pyGenoRabaObjectWrapper_metaclass _wrapped_class = None _bags = {} @@ -100,9 +99,9 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) : f = RabaQuery(objectType._wrapped_class, namespace = self._wrapped_class._raba_namespace) coolArgs[self._wrapped_class.__name__[:-5]] = self.wrapped_object #[:-5] removes _Raba from class name - if len(args) > 0 and type(args[0]) is types.ListType : + if len(args) > 0 and type(args[0]) is list : for a in args[0] : - if type(a) is types.DictType : + if type(a) is dict : f.addFilter(**a) else : f.addFilter(*args, **coolArgs) @@ -113,7 +112,7 @@ def count(self, objectType, *args, **coolArgs) : """Returns the number of elements satisfying the query""" return self._makeLoadQuery(objectType, *args, **coolArgs).count() - def get(self, objectType, *args, **coolArgs) : + def get(self, objectType, *args, gen=False, **coolArgs) : """Raba Magic inside. This is th function that you use for querying pyGeno's DB. @@ -125,22 +124,33 @@ def get(self, objectType, *args, **coolArgs) : * myGenome.get(Transcript, {'start >' : x, 'end <' : y})""" - ret = [] - for e in self._makeLoadQuery(objectType, *args, **coolArgs).iterRun() : - if issubclass(objectType, pyGenoRabaObjectWrapper) : - ret.append(objectType(wrapped_object_and_bag = (e, self.bagKey))) - else : - ret.append(e) - - return ret - + warnings.warn("At some point in the future 'get' will be returning a generator by default (gen=True).\n" +\ + "If you want to use the output in a list format, you can just wrap your call with list().\n" +\ + "These changes are in conformity with python3 as opposed to the now-deprecated python2.", DeprecationWarning) + + if gen: + return self.iterGet(objectType, *args, **coolArgs) + else: + ret = [] + for e in self._makeLoadQuery(objectType, *args, **coolArgs).run(gen=True): + if issubclass(objectType, pyGenoRabaObjectWrapper) : + ret.append(objectType(wrapped_object_and_bag = (e, self.bagKey))) + else : + ret.append(e) + + return ret + def iterGet(self, objectType, *args, **coolArgs) : """Same as get. But retuns the elements one by one, much more efficient for large outputs""" - for e in self._makeLoadQuery(objectType, *args, **coolArgs).iterRun() : + warnings.warn("At some point in the future 'iterGet' will stop existing and will get replaced by 'get'.\n" +\ + "If you have functions still using the iter method, you should start changing them now.\n" +\ + "These changes are in conformity with python3 as opposed to the now-deprecated python2.", DeprecationWarning) + + for e in self._makeLoadQuery(objectType, *args, **coolArgs).run(gen=True): if issubclass(objectType, pyGenoRabaObjectWrapper) : yield objectType(wrapped_object_and_bag = (e, self.bagKey)) - else : + else: yield e #~ def ensureIndex(self, fields) : diff --git a/pyGeno/tests/test_binary_sequence.py b/pyGeno/tests/test_binary_sequence.py new file mode 100644 index 0000000..b90a770 --- /dev/null +++ b/pyGeno/tests/test_binary_sequence.py @@ -0,0 +1,183 @@ +import unittest +import sys +import os + +_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _root not in sys.path: + sys.path.insert(0, _root) + +from tools.BinarySequence import NucBinarySequence, AABinarySequence + + +class TestNucBinarySequenceEncode(unittest.TestCase): + + def test_simple_sequence(self): + bs = NucBinarySequence("ATCG") + self.assertEqual(len(bs), 4) + self.assertEqual(bs.getDefaultSequence(), "ATCG") + + def test_empty_polymorphisms(self): + bs = NucBinarySequence("ATCG") + self.assertEqual(bs.getPolymorphisms(), []) + + def test_polymorphic_sequence(self): + bs = NucBinarySequence("R") # R = A/G + polys = bs.getPolymorphisms() + self.assertEqual(len(polys), 1) + self.assertEqual(polys[0][0], 0) # position 0 + self.assertIn("A", polys[0][1]) + self.assertIn("G", polys[0][1]) + + def test_polymorphic_n(self): + bs = NucBinarySequence("N") # N = A/C/G/T + polys = bs.getPolymorphisms() + self.assertEqual(len(polys), 1) + self.assertEqual(len(polys[0][1]), 4) + + def test_mixed_sequence(self): + bs = NucBinarySequence("ATRGC") + self.assertEqual(len(bs), 5) + polys = bs.getPolymorphisms() + self.assertEqual(len(polys), 1) + self.assertEqual(polys[0][0], 2) # R is at position 2 + + def test_default_sequence_with_polymorphism(self): + bs = NucBinarySequence("ARC") + default = bs.getDefaultSequence() + self.assertEqual(len(default), 3) + # Default takes last allele of polymorphism + self.assertIn(default[1], "AG") + + +class TestNucBinarySequenceFind(unittest.TestCase): + + def test_find_exact(self): + bs = NucBinarySequence("ATGATGATG") + pos = bs.find("ATG") + self.assertEqual(pos, 0) + + def test_find_middle(self): + bs = NucBinarySequence("CCATGCC") + pos = bs.find("ATG") + self.assertEqual(pos, 2) + + def test_find_not_found(self): + bs = NucBinarySequence("AAAA") + pos = bs.find("GGG") + self.assertEqual(pos, -1) + + def test_find_all(self): + bs = NucBinarySequence("ATGATGATG") + positions = bs.findAll("ATG") + self.assertEqual(positions, [0, 3, 6]) + + def test_find_all_no_match(self): + bs = NucBinarySequence("AAAA") + positions = bs.findAll("GGG") + self.assertEqual(positions, []) + + +class TestNucBinarySequenceVariants(unittest.TestCase): + + def test_no_polymorphism_variants(self): + bs = NucBinarySequence("ATCG") + stopped, variants = bs.getSequenceVariants() + self.assertFalse(stopped) + self.assertEqual(variants, ["ATCG"]) + + def test_single_polymorphism_variants(self): + bs = NucBinarySequence("ARC") # R = A/G + stopped, variants = bs.getSequenceVariants() + self.assertFalse(stopped) + self.assertEqual(sorted(variants), sorted(["AAC", "AGC"])) + + def test_n_polymorphism_variants(self): + bs = NucBinarySequence("N") # 4 variants + stopped, variants = bs.getSequenceVariants() + self.assertFalse(stopped) + self.assertEqual(len(variants), 4) + + def test_max_variant_limit(self): + # NN = 16 variants, set limit to 4 + bs = NucBinarySequence("NN") + stopped, variants = bs.getSequenceVariants(maxVariantNumber=4) + self.assertTrue(stopped) + + def test_nb_variants(self): + bs = NucBinarySequence("ARN") # R=2 * N=4 = 8 + self.assertEqual(bs.getNbVariants(0), 8) + + def test_nb_variants_no_poly(self): + bs = NucBinarySequence("ATCG") + self.assertEqual(bs.getNbVariants(0), 1) + + +class TestNucBinarySequenceLen(unittest.TestCase): + + def test_length(self): + bs = NucBinarySequence("ATCG") + self.assertEqual(len(bs), 4) + + def test_length_with_polymorphism(self): + bs = NucBinarySequence("ARCG") # R expands but sequence is still 4 + self.assertEqual(len(bs), 4) + + +class TestNucBinarySequenceFindPolymorphisms(unittest.TestCase): + + def test_identical_no_polymorphisms(self): + bs = NucBinarySequence("ATCG") + result = bs.findPolymorphisms("ATCG") + self.assertEqual(result, []) + + def test_single_mismatch(self): + bs = NucBinarySequence("ATCG") + result = bs.findPolymorphisms("AACG") + self.assertEqual(result, [1]) + + +class TestAABinarySequence(unittest.TestCase): + + def test_simple_sequence(self): + bs = AABinarySequence("MLPK") + self.assertEqual(len(bs), 4) + self.assertEqual(bs.getDefaultSequence(), "MLPK") + + def test_find_in_protein(self): + bs = AABinarySequence("MLPKADEW") + pos = bs.find("ADE") + self.assertEqual(pos, 4) + + def test_polymorphic_protein(self): + bs = AABinarySequence("M/LPK") + polys = bs.getPolymorphisms() + self.assertEqual(len(polys), 1) + self.assertIn("M", polys[0][1]) + self.assertIn("L", polys[0][1]) + + +class TestBinarySequenceGetItem(unittest.TestCase): + + def test_getitem(self): + bs = NucBinarySequence("ATCG") + # Each element should be an integer + for i in range(4): + self.assertIsInstance(bs[i], int) + + def test_setitem(self): + bs = NucBinarySequence("ATCG") + original = bs[0] + bs[0] = 0 + self.assertEqual(bs[0], 0) + bs[0] = original + + def test_getchar(self): + bs = NucBinarySequence("ATCG") + self.assertEqual(bs.getChar(0), "A") + self.assertEqual(bs.getChar(1), "T") + self.assertEqual(bs.getChar(2), "C") + self.assertEqual(bs.getChar(3), "G") + + +if __name__ == "__main__": + unittest.main() diff --git a/pyGeno/tests/test_configuration.py b/pyGeno/tests/test_configuration.py new file mode 100644 index 0000000..fb77adb --- /dev/null +++ b/pyGeno/tests/test_configuration.py @@ -0,0 +1,56 @@ +import unittest +import sys +import os +from unittest.mock import MagicMock +from configparser import ConfigParser + +# Mock rabaDB before importing configuration +if 'rabaDB' not in sys.modules: + sys.modules['rabaDB'] = MagicMock() + sys.modules['rabaDB.rabaSetup'] = MagicMock() + sys.modules['rabaDB.Raba'] = MagicMock() + +from pyGeno.configuration import checkPythonVersion, version, prettyVersion + + +class TestConfigParserCompat(unittest.TestCase): + """Verify that ConfigParser (not SafeConfigParser) works correctly.""" + + def test_configparser_read_write(self): + parser = ConfigParser() + parser.add_section("test_section") + parser.set("test_section", "key", "value") + self.assertEqual(parser.get("test_section", "key"), "value") + + def test_configparser_items(self): + parser = ConfigParser() + parser.add_section("section") + parser.set("section", "a", "1") + parser.set("section", "b", "2") + items = dict(parser.items("section")) + self.assertEqual(items["a"], "1") + self.assertEqual(items["b"], "2") + + +class TestPythonVersionCheck(unittest.TestCase): + + def test_python_version_is_valid(self): + result = checkPythonVersion() + self.assertTrue(result) + + +class TestVersionFunctions(unittest.TestCase): + + def test_version_returns_tuple(self): + v = version() + self.assertIsInstance(v, tuple) + self.assertEqual(len(v), 6) + + def test_pretty_version_returns_string(self): + pv = prettyVersion() + self.assertIsInstance(pv, str) + self.assertIn("pyGeno", pv) + + +if __name__ == "__main__": + unittest.main() diff --git a/pyGeno/tests/test_csv_tools.py b/pyGeno/tests/test_csv_tools.py new file mode 100644 index 0000000..6f847ec --- /dev/null +++ b/pyGeno/tests/test_csv_tools.py @@ -0,0 +1,174 @@ +import unittest +import os +import sys +import tempfile + +_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _root not in sys.path: + sys.path.insert(0, _root) + +from tools.parsers.CSVTools import CSVFile, CSVEntry + + +class TestCSVFileCreation(unittest.TestCase): + + def test_create_with_legend(self): + c = CSVFile(legend=["col1", "col2"]) + self.assertIn("col1", c.legend) + self.assertIn("col2", c.legend) + + def test_create_empty(self): + c = CSVFile() + self.assertEqual(len(c.legend), 0) + + def test_duplicate_legend_raises(self): + with self.assertRaises(ValueError): + CSVFile(legend=["col1", "col1"]) + + def test_custom_separator(self): + c = CSVFile(separator="\t") + self.assertEqual(c.separator, "\t") + + +class TestCSVFileNewLine(unittest.TestCase): + + def test_new_line(self): + c = CSVFile(legend=["name", "value"]) + line = c.newLine() + self.assertIsInstance(line, CSVEntry) + + def test_set_and_get_fields(self): + c = CSVFile(legend=["name", "value"]) + line = c.newLine() + line["name"] = "test" + line["value"] = "42" + self.assertEqual(line["name"], "test") + self.assertEqual(line["value"], "42") + + def test_case_insensitive_access(self): + c = CSVFile(legend=["Name", "Value"]) + line = c.newLine() + line["name"] = "test" + self.assertEqual(line["Name"], "test") + self.assertEqual(line["name"], "test") + + def test_nonexistent_column_raises(self): + c = CSVFile(legend=["name"]) + line = c.newLine() + with self.assertRaises(KeyError): + _ = line["nonexistent"] + + +class TestCSVFileSaveAndParse(unittest.TestCase): + + def setUp(self): + self.tmpfile = tempfile.mktemp(suffix=".csv") + + def tearDown(self): + if os.path.exists(self.tmpfile): + os.remove(self.tmpfile) + + def test_save_and_parse_roundtrip(self): + c = CSVFile(legend=["col1", "col2"], separator="\t") + l1 = c.newLine() + l1["col1"] = "hello" + l1["col2"] = "world" + l2 = c.newLine() + l2["col1"] = "foo" + l2["col2"] = "bar" + c.save(self.tmpfile) + + c2 = CSVFile() + c2.parse(self.tmpfile, separator="\t") + lines = list(c2) + self.assertEqual(len(lines), 2) + self.assertEqual(lines[0]["col1"], "hello") + self.assertEqual(lines[0]["col2"], "world") + self.assertEqual(lines[1]["col1"], "foo") + self.assertEqual(lines[1]["col2"], "bar") + + def test_save_comma_separated(self): + c = CSVFile(legend=["a", "b"], separator=",") + line = c.newLine() + line["a"] = "x" + line["b"] = "y" + c.save(self.tmpfile) + + c2 = CSVFile() + c2.parse(self.tmpfile, separator=",") + self.assertEqual(list(c2)[0]["a"], "x") + + def test_len(self): + c = CSVFile(legend=["col1"]) + c.newLine() + c.newLine() + c.newLine() + self.assertEqual(len(c), 3) + + +class TestCSVFileIteration(unittest.TestCase): + + def test_iteration(self): + c = CSVFile(legend=["name"]) + for i in range(5): + line = c.newLine() + line["name"] = str(i) + + names = [line["name"] for line in c] + self.assertEqual(names, ["0", "1", "2", "3", "4"]) + + def test_getitem(self): + c = CSVFile(legend=["name"]) + line = c.newLine() + line["name"] = "test" + self.assertEqual(c[0]["name"], "test") + + +class TestCSVEntryIteration(unittest.TestCase): + + def test_entry_iteration(self): + c = CSVFile(legend=["name", "email"]) + line = c.newLine() + line["name"] = "alice" + line["email"] = "alice@test.com" + fields = dict(line) + self.assertEqual(fields["name"], "alice") + self.assertEqual(fields["email"], "alice@test.com") + + +class TestCSVFileAddField(unittest.TestCase): + + def test_add_field(self): + c = CSVFile(legend=["col1"]) + c.addField("col2") + self.assertIn("col2", c.legend) + + def test_add_duplicate_field_raises(self): + c = CSVFile(legend=["col1"]) + with self.assertRaises(ValueError): + c.addField("col1") + + +class TestCSVFileStreamToFile(unittest.TestCase): + + def setUp(self): + self.tmpfile = tempfile.mktemp(suffix=".csv") + + def tearDown(self): + if os.path.exists(self.tmpfile): + os.remove(self.tmpfile) + + def test_stream_requires_legend(self): + c = CSVFile() + with self.assertRaises(ValueError): + c.streamToFile(self.tmpfile) + + def test_commit_without_stream_raises(self): + c = CSVFile(legend=["col1"]) + line = c.newLine() + with self.assertRaises(ValueError): + line.commit() + + +if __name__ == "__main__": + unittest.main() diff --git a/pyGeno/tests/test_fasta_tools.py b/pyGeno/tests/test_fasta_tools.py new file mode 100644 index 0000000..6b88c8b --- /dev/null +++ b/pyGeno/tests/test_fasta_tools.py @@ -0,0 +1,145 @@ +import unittest +import os +import sys +import tempfile + +_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _root not in sys.path: + sys.path.insert(0, _root) + +from tools.parsers.FastaTools import FastaFile + + +class TestFastaFileCreation(unittest.TestCase): + + def test_create_empty(self): + f = FastaFile() + self.assertEqual(len(f), 0) + + def test_add_entry(self): + f = FastaFile() + f.add(">seq1", "ATCGATCG") + self.assertEqual(len(f), 1) + + def test_add_entry_without_gt(self): + f = FastaFile() + f.add("seq1", "ATCGATCG") + entry = f.get(0) + self.assertTrue(entry[0].startswith(">")) + + def test_add_multiple(self): + f = FastaFile() + f.add(">seq1", "AAAA") + f.add(">seq2", "CCCC") + f.add(">seq3", "GGGG") + self.assertEqual(len(f), 3) + + +class TestFastaFileGet(unittest.TestCase): + + def test_get_entry(self): + f = FastaFile() + f.add(">seq1", "ATCG") + entry = f.get(0) + self.assertEqual(entry[0], ">seq1") + self.assertEqual(entry[1], "ATCG") + + def test_getitem(self): + f = FastaFile() + f.add(">seq1", "ATCG") + entry = f[0] + self.assertEqual(entry[1], "ATCG") + + +class TestFastaFileParseStr(unittest.TestCase): + + def test_parse_simple(self): + f = FastaFile() + f.parseStr(">seq1\nATCG\n>seq2\nGGCC\n") + self.assertEqual(len(f), 2) + entry0 = f.get(0) + self.assertEqual(entry0[0], ">seq1") + self.assertEqual(entry0[1], "ATCG") + + def test_parse_multiline_sequence(self): + f = FastaFile() + f.parseStr(">seq1\nATCG\nGGCC\n") + entry = f.get(0) + self.assertEqual(entry[1], "ATCGGGCC") + + +class TestFastaFileSaveAndParse(unittest.TestCase): + + def setUp(self): + self.tmpfile = tempfile.mktemp(suffix=".fasta") + + def tearDown(self): + if os.path.exists(self.tmpfile): + os.remove(self.tmpfile) + + def test_save_and_parse_roundtrip(self): + f = FastaFile() + f.add(">gene1", "ATGATGATG") + f.add(">gene2", "CCCGGGAAA") + f.save(self.tmpfile) + + f2 = FastaFile(self.tmpfile) + self.assertEqual(len(f2), 2) + self.assertEqual(f2.get(0)[0], ">gene1") + self.assertEqual(f2.get(0)[1], "ATGATGATG") + self.assertEqual(f2.get(1)[0], ">gene2") + self.assertEqual(f2.get(1)[1], "CCCGGGAAA") + + +class TestFastaFileIteration(unittest.TestCase): + + def test_iteration(self): + f = FastaFile() + f.add(">s1", "AAA") + f.add(">s2", "CCC") + f.add(">s3", "GGG") + entries = list(f) + self.assertEqual(len(entries), 3) + self.assertEqual(entries[0][1], "AAA") + self.assertEqual(entries[2][1], "GGG") + + +class TestFastaFileToStr(unittest.TestCase): + + def test_to_str(self): + f = FastaFile() + f.add(">seq1", "ATCG") + s = f.toStr() + self.assertIn(">seq1", s) + self.assertIn("ATCG", s) + + +class TestFastaFileSetItem(unittest.TestCase): + + def test_setitem(self): + f = FastaFile() + f.add(">seq1", "ATCG") + f[0] = (">newseq", "GGGG") + self.assertEqual(f[0][0], ">newseq") + self.assertEqual(f[0][1], "GGGG") + + def test_setitem_wrong_length_raises(self): + f = FastaFile() + f.add(">seq1", "ATCG") + with self.assertRaises(TypeError): + f[0] = ("only_one_element",) + + +class TestFastaFileReset(unittest.TestCase): + + def test_reset(self): + f = FastaFile() + f.add(">s1", "AAA") + f.add(">s2", "CCC") + self.assertEqual(len(f), 2) + f.reset() + self.assertEqual(len(f), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/pyGeno/tests/test_fastq_tools.py b/pyGeno/tests/test_fastq_tools.py new file mode 100644 index 0000000..a9c2fb3 --- /dev/null +++ b/pyGeno/tests/test_fastq_tools.py @@ -0,0 +1,123 @@ +import unittest +import os +import sys +import tempfile + +_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _root not in sys.path: + sys.path.insert(0, _root) + +from tools.parsers.FastqTools import FastqFile, FastqEntry + + +class TestFastqEntry(unittest.TestCase): + + def test_create_entry(self): + e = FastqEntry("@id", "ATCG", "+", "IIII") + self.assertEqual(e["identifier"], "@id") + self.assertEqual(e["sequence"], "ATCG") + self.assertEqual(e["+"], "+") + self.assertEqual(e["qualities"], "IIII") + + def test_set_entry(self): + e = FastqEntry() + e["identifier"] = "@test" + e["sequence"] = "GGGG" + self.assertEqual(e["identifier"], "@test") + self.assertEqual(e["sequence"], "GGGG") + + def test_str_format(self): + e = FastqEntry("@id", "ATCG", "+", "IIII") + s = str(e) + self.assertIn("@id", s) + self.assertIn("ATCG", s) + self.assertIn("IIII", s) + + +class TestFastqFileParseStr(unittest.TestCase): + + def test_parse_single_entry(self): + data = "@SEQ_ID\nATCGATCG\n+\nIIIIIIII\n" + f = FastqFile() + f.parseStr(data) + self.assertEqual(len(f), 1) + + def test_parse_multiple_entries(self): + data = "@SEQ1\nATCG\n+\nIIII\n@SEQ2\nGGGG\n+\nJJJJ\n" + f = FastqFile() + f.parseStr(data) + self.assertEqual(len(f), 2) + + def test_len_uses_integer_division(self): + # Verifies the // fix: len should always return int + data = "@SEQ1\nATCG\n+\nIIII\n" + f = FastqFile() + f.parseStr(data) + length = len(f) + self.assertIsInstance(length, int) + self.assertEqual(length, 1) + + +class TestFastqFileGetEntry(unittest.TestCase): + + def test_get_entry(self): + data = "@SEQ1\nATCG\n+\nIIII\n@SEQ2\nGGGG\n+\nJJJJ\n" + f = FastqFile() + f.parseStr(data) + e = f.get(0) + self.assertEqual(e["identifier"], "@SEQ1") + self.assertEqual(e["sequence"], "ATCG") + + def test_getitem(self): + data = "@SEQ1\nATCG\n+\nIIII\n" + f = FastqFile() + f.parseStr(data) + self.assertEqual(f[0]["sequence"], "ATCG") + + +class TestFastqFileNewEntry(unittest.TestCase): + + def test_new_entry_returns_fastq_entry(self): + f = FastqFile() + e = f.newEntry() + self.assertIsInstance(e, FastqEntry) + + def test_add_entry(self): + f = FastqFile() + e = FastqEntry("@id", "AAAA", "+", "FFFF") + f.add(e) + self.assertGreater(len(f.data), 0) + + +class TestFastqFileSave(unittest.TestCase): + + def setUp(self): + self.tmpfile = tempfile.mktemp(suffix=".fastq") + + def tearDown(self): + if os.path.exists(self.tmpfile): + os.remove(self.tmpfile) + + def test_parse_then_write_manually(self): + """Write parsed data back to file manually since save() relies on missing make() method.""" + f = FastqFile() + f.parseStr("@SEQ1\nATCG\n+\nIIII\n@SEQ2\nGGGG\n+\nJJJJ\n") + with open(self.tmpfile, 'w') as fh: + for i in range(len(f)): + entry = f.get(i) + fh.write(str(entry) + '\n') + self.assertTrue(os.path.exists(self.tmpfile)) + + +class TestFastqFileReset(unittest.TestCase): + + def test_reset(self): + f = FastqFile() + f.parseStr("@SEQ1\nATCG\n+\nIIII\n") + self.assertEqual(len(f), 1) + f.reset() + self.assertEqual(len(f), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/pyGeno/tests/test_genome.py b/pyGeno/tests/test_genome.py index 2ee4051..4b2848c 100644 --- a/pyGeno/tests/test_genome.py +++ b/pyGeno/tests/test_genome.py @@ -171,21 +171,21 @@ def runTests() : except KeyError : deleteGenome("human", "GRCh37.75_Y-Only") B.importGenome("Human.GRCh37.75_Y-Only.tar.gz") - print "--> Seems to already exist in db" + print("--> Seems to already exist in db") try : B.importSNPs("Human_agnostic.dummySRY.tar.gz") except KeyError : deleteSNPs("dummySRY_AGN") B.importSNPs("Human_agnostic.dummySRY.tar.gz") - print "--> Seems to already exist in db" + print("--> Seems to already exist in db") try : B.importSNPs("Human_agnostic.dummySRY_indels") except KeyError : deleteSNPs("dummySRY_AGN_indels") B.importSNPs("Human_agnostic.dummySRY_indels") - print "--> Seems to already exist in db" + print("--> Seems to already exist in db") # import time # time.sleep(10) unittest.main() diff --git a/pyGeno/tests/test_segment_tree.py b/pyGeno/tests/test_segment_tree.py new file mode 100644 index 0000000..cf254bb --- /dev/null +++ b/pyGeno/tests/test_segment_tree.py @@ -0,0 +1,199 @@ +import unittest +import sys +import os + +_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _root not in sys.path: + sys.path.insert(0, _root) + +from tools.SegmentTree import SegmentTree + + +class TestSegmentTreeInit(unittest.TestCase): + + def test_create_with_coordinates(self): + st = SegmentTree(0, 10) + self.assertEqual(st.x1, 0) + self.assertEqual(st.x2, 10) + + def test_create_with_none(self): + st = SegmentTree() + self.assertIsNone(st.x1) + self.assertIsNone(st.x2) + + def test_create_swaps_if_x1_gt_x2(self): + st = SegmentTree(10, 0) + self.assertEqual(st.x1, 0) + self.assertEqual(st.x2, 10) + + def test_create_with_name(self): + st = SegmentTree(0, 10, name="root") + self.assertEqual(st.name, "root") + + def test_children_initially_empty(self): + st = SegmentTree(0, 10) + self.assertEqual(st.children, []) + + +class TestSegmentTreeInsert(unittest.TestCase): + + def test_insert_child(self): + root = SegmentTree(0, 100) + child = root.insert(10, 50, name="child1") + self.assertEqual(child.x1, 10) + self.assertEqual(child.x2, 50) + self.assertEqual(len(root.children), 1) + + def test_insert_multiple_children(self): + root = SegmentTree(0, 100) + root.insert(10, 30) + root.insert(40, 60) + root.insert(70, 90) + self.assertEqual(len(root.children), 3) + + def test_insert_nested_child(self): + root = SegmentTree(0, 100) + root.insert(10, 50) + root.insert(20, 30) + # 20-30 should be nested inside 10-50 + self.assertEqual(len(root.children), 1) + self.assertEqual(len(root.children[0].children), 1) + self.assertEqual(root.children[0].children[0].x1, 20) + + def test_insert_duplicate_merges(self): + root = SegmentTree(0, 100) + root.insert(10, 50, name="first") + root.insert(10, 50, name="second") + self.assertEqual(len(root.children), 1) + self.assertIn("first", root.children[0].name) + self.assertIn("second", root.children[0].name) + + def test_insert_swaps_x1_x2(self): + root = SegmentTree(0, 100) + child = root.insert(50, 10, name="swapped") + self.assertEqual(child.x1, 10) + self.assertEqual(child.x2, 50) + + def test_insert_with_refered_object(self): + root = SegmentTree(0, 100) + child = root.insert(10, 50, referedObject="gene1") + self.assertEqual(child.referedObject, ["gene1"]) + + +class TestSegmentTreeIntersect(unittest.TestCase): + + def test_intersect_point(self): + root = SegmentTree(0, 100) + root.insert(10, 50, name="seg1") + root.insert(60, 90, name="seg2") + result = root.intersect(25) + names = [r.name for r in result] + self.assertTrue(any("seg1" in n for n in names)) + + def test_intersect_range(self): + root = SegmentTree(0, 100) + root.insert(10, 50, name="seg1") + root.insert(60, 90, name="seg2") + result = root.intersect(45, 65) + names = [r.name for r in result] + self.assertTrue(any("seg1" in n for n in names)) + self.assertTrue(any("seg2" in n for n in names)) + + def test_intersect_no_match(self): + root = SegmentTree(0, 100) + root.insert(10, 20) + root.insert(30, 40) + # Point between segments + result = root.intersect(25) + self.assertEqual(result, []) + + def test_intersect_empty_tree(self): + root = SegmentTree(0, 100) + result = root.intersect(50) + self.assertEqual(result, []) + + +class TestSegmentTreeBounds(unittest.TestCase): + + def test_getX1_with_coordinates(self): + st = SegmentTree(5, 10) + self.assertEqual(st.getX1(), 5) + + def test_getX2_with_coordinates(self): + st = SegmentTree(5, 10) + self.assertEqual(st.getX2(), 10) + + def test_getX1_from_children(self): + root = SegmentTree() + root.insert(10, 50) + root.insert(60, 90) + self.assertEqual(root.getX1(), 10) + + def test_getX2_from_children(self): + root = SegmentTree() + root.insert(10, 50) + root.insert(60, 90) + self.assertEqual(root.getX2(), 90) + + +class TestSegmentTreeIndexedLength(unittest.TestCase): + + def test_length_with_coordinates(self): + st = SegmentTree(0, 100) + self.assertEqual(st.getIndexedLength(), 100) + + def test_length_no_children_no_coords(self): + st = SegmentTree() + self.assertEqual(st.getIndexedLength(), 0) + + def test_length_from_children(self): + root = SegmentTree() + root.insert(0, 10) + root.insert(20, 30) + self.assertEqual(root.getIndexedLength(), 20) + + +class TestSegmentTreeFirstLevel(unittest.TestCase): + + def test_first_level(self): + root = SegmentTree(0, 100) + root.insert(10, 30) + root.insert(50, 70) + result = root.getFirstLevel() + self.assertEqual(result, [(10, 30), (50, 70)]) + + def test_first_level_no_children(self): + st = SegmentTree(0, 100) + self.assertEqual(st.getFirstLevel(), [(0, 100)]) + + def test_first_level_none_root(self): + st = SegmentTree() + self.assertIsNone(st.getFirstLevel()) + + +class TestSegmentTreeEmptyChildren(unittest.TestCase): + + def test_empty_children(self): + root = SegmentTree(0, 100) + root.insert(10, 50) + root.insert(60, 90) + self.assertEqual(len(root.children), 2) + root.emptyChildren() + self.assertEqual(len(root.children), 0) + + +class TestSegmentTreeDichotomicSearch(unittest.TestCase): + """Test that integer division fix in __dichotomicSearch works correctly.""" + + def test_search_with_many_children(self): + root = SegmentTree(0, 1000) + for i in range(0, 100, 10): + root.insert(i, i + 5, name=f"seg_{i}") + # If dichotomicSearch used float division, intersect would fail with TypeError + result = root.intersect(22) + names = [r.name for r in result] + self.assertTrue(any("seg_20" in n for n in names)) + + +if __name__ == "__main__": + unittest.main() diff --git a/pyGeno/tests/test_snp_filtering.py b/pyGeno/tests/test_snp_filtering.py new file mode 100644 index 0000000..b3a8336 --- /dev/null +++ b/pyGeno/tests/test_snp_filtering.py @@ -0,0 +1,147 @@ +import unittest +import sys +import os +from unittest.mock import MagicMock + +_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _root not in sys.path: + sys.path.insert(0, _root) + +# Mock rabaDB before importing SNPFiltering (which imports configuration -> rabaDB) +if 'rabaDB' not in sys.modules: + sys.modules['rabaDB'] = MagicMock() + sys.modules['rabaDB.rabaSetup'] = MagicMock() + sys.modules['rabaDB.Raba'] = MagicMock() + +from pyGeno.SNPFiltering import ( + SequenceSNP, SequenceInsert, SequenceDel, + SNPFilter, DefaultSNPFilter, + Sequence_modifiers, +) + + +class TestSequenceModifiers(unittest.TestCase): + + def test_init_default_sources(self): + sm = Sequence_modifiers() + self.assertIsInstance(sm.sources, dict) + + def test_add_source(self): + sm = Sequence_modifiers(sources={}) + sm.addSource("snp1", "some_data") + self.assertEqual(sm.sources["snp1"], "some_data") + + +class TestSequenceSNP(unittest.TestCase): + + def test_single_allele(self): + snp = SequenceSNP("A") + self.assertEqual(snp.alleles, "A") + + def test_multiple_alleles_string(self): + snp = SequenceSNP("AG") + self.assertEqual(snp.alleles, "R") + + def test_multiple_alleles_list(self): + snp = SequenceSNP(["A", "G"]) + self.assertEqual(snp.alleles, "R") + + def test_ct_alleles(self): + snp = SequenceSNP("CT") + self.assertEqual(snp.alleles, "Y") + + def test_all_four_alleles(self): + snp = SequenceSNP("ACGT") + self.assertEqual(snp.alleles, "N") + + def test_sources_passed(self): + snp = SequenceSNP("A", sources={"src": "val"}) + self.assertEqual(snp.sources["src"], "val") + + +class TestSequenceInsert(unittest.TestCase): + + def test_basic_insert(self): + ins = SequenceInsert("ACTG") + self.assertEqual(ins.bases, "ACTG") + self.assertEqual(ins.offset, 0) + + def test_insert_with_ref_prefix(self): + # Format like C/CCTGGAA (dbSNP style) + ins = SequenceInsert("CCTGGAA", ref="C") + self.assertEqual(ins.bases, "CTGGAA") + self.assertEqual(ins.offset, 0) # offset = len(ref) - 1 = 0 + + def test_insert_with_longer_ref_prefix(self): + # Format like CCT/CCTGGAA (samtools style) + ins = SequenceInsert("CCTGGAA", ref="CCT") + self.assertEqual(ins.bases, "GGAA") + self.assertEqual(ins.offset, 2) # offset = len(ref) - 1 = 2 + + def test_insert_bad_ref_raises(self): + with self.assertRaises(NotImplementedError): + SequenceInsert("CCTGGAA", ref="GGG") + + def test_insert_default_ref(self): + ins = SequenceInsert("XXX") + self.assertEqual(ins.bases, "XXX") + self.assertEqual(ins.offset, 0) + + +class TestSequenceDel(unittest.TestCase): + + def test_basic_deletion(self): + d = SequenceDel(5) + self.assertEqual(d.length, 5) + self.assertEqual(d.offset, 0) + + def test_deletion_with_alt_prefix(self): + # Format like CCTGGAA/C (dbSNP style) + d = SequenceDel(7, ref="CCTGGAA", alt="C") + self.assertEqual(d.offset, 1) + self.assertEqual(d.length, 6) + + def test_deletion_bad_alt_raises(self): + with self.assertRaises(NotImplementedError): + SequenceDel(7, ref="CCTGGAA", alt="GGG") + + def test_deletion_alt_without_ref_raises(self): + with self.assertRaises(Exception): + SequenceDel(5, alt="C") + + def test_deletion_default_alt(self): + d = SequenceDel(3) + self.assertEqual(d.length, 3) + self.assertEqual(d.offset, 0) + + +class TestSNPFilter(unittest.TestCase): + + def test_abstract_filter_raises(self): + f = SNPFilter() + with self.assertRaises(NotImplementedError): + f.filter(None) + + def test_default_filter_instantiation(self): + f = DefaultSNPFilter() + self.assertIsInstance(f, SNPFilter) + + +class TestPython314Compatibility(unittest.TestCase): + """Tests specifically verifying Python 3.14 compatibility fixes.""" + + def test_not_implemented_error_is_raised_not_not_implemented(self): + """Verify NotImplementedError (exception) is raised, not NotImplemented (constant).""" + with self.assertRaises(NotImplementedError): + SequenceInsert("CCTGGAA", ref="GGG") + + with self.assertRaises(NotImplementedError): + SequenceDel(7, ref="CCTGGAA", alt="GGG") + + f = SNPFilter() + with self.assertRaises(NotImplementedError): + f.filter(None) + + +if __name__ == "__main__": + unittest.main() diff --git a/pyGeno/tests/test_stats.py b/pyGeno/tests/test_stats.py new file mode 100644 index 0000000..336dcd5 --- /dev/null +++ b/pyGeno/tests/test_stats.py @@ -0,0 +1,102 @@ +import unittest +import sys +import os + +_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _root not in sys.path: + sys.path.insert(0, _root) + +import numpy as np +from tools.Stats import kullback_leibler, squaredError_log10, fisherExactTest + + +class TestKullbackLeibler(unittest.TestCase): + + def test_identical_distributions(self): + p = [0.25, 0.25, 0.25, 0.25] + q = [0.25, 0.25, 0.25, 0.25] + result = kullback_leibler(p, q) + self.assertAlmostEqual(result, 0.0, places=5) + + def test_different_distributions(self): + p = [0.5, 0.5] + q = [0.25, 0.75] + result = kullback_leibler(p, q) + self.assertGreater(result, 0) + + def test_kl_nonnegative(self): + p = [0.1, 0.9] + q = [0.5, 0.5] + result = kullback_leibler(p, q) + self.assertGreaterEqual(result, 0) + + def test_mismatched_shapes_raises(self): + p = [0.5, 0.5] + q = [0.25, 0.25, 0.5] + with self.assertRaises(ValueError): + kullback_leibler(p, q) + + def test_zero_in_p_handled(self): + p = [0.0, 1.0] + q = [0.5, 0.5] + result = kullback_leibler(p, q) + # log(1.0/0.5) * 1.0 = log(2) + self.assertAlmostEqual(result, np.log(2), places=5) + + def test_returns_float(self): + p = [0.5, 0.5] + q = [0.5, 0.5] + result = kullback_leibler(p, q) + self.assertIsInstance(float(result), float) + + +class TestSquaredErrorLog10(unittest.TestCase): + + def test_identical_arrays(self): + p = [1.0, 2.0, 3.0] + q = [1.0, 2.0, 3.0] + result = squaredError_log10(p, q) + # sum of (p-q)^2 = 0, log10(0) = -inf + self.assertEqual(result, float('-inf') - np.log(3)) + + def test_different_arrays(self): + p = [1.0, 2.0] + q = [2.0, 3.0] + result = squaredError_log10(p, q) + # (1-2)^2 + (2-3)^2 = 2, log10(2) - ln(2) + expected = np.log10(2) - np.log(2) + self.assertAlmostEqual(float(result), float(expected), places=5) + + def test_mismatched_shapes_raises(self): + p = [1.0, 2.0] + q = [1.0, 2.0, 3.0] + with self.assertRaises(ValueError): + squaredError_log10(p, q) + + +class TestFisherExact(unittest.TestCase): + + def test_not_implemented(self): + with self.assertRaises(NotImplementedError): + fisherExactTest([[1, 2], [3, 4]]) + + +class TestNumpyDtypeCompatibility(unittest.TestCase): + """Verify the np.float -> float fix works.""" + + def test_kl_uses_float_dtype(self): + p = [0.5, 0.5] + q = [0.5, 0.5] + # This would fail with np.float on NumPy >= 1.24 + result = kullback_leibler(p, q) + self.assertIsNotNone(result) + + def test_squared_error_uses_float_dtype(self): + p = [1.0, 2.0] + q = [1.0, 2.0] + result = squaredError_log10(p, q) + self.assertIsNotNone(result) + + +if __name__ == "__main__": + unittest.main() diff --git a/pyGeno/tests/test_useful_functions.py b/pyGeno/tests/test_useful_functions.py new file mode 100644 index 0000000..0dc28b2 --- /dev/null +++ b/pyGeno/tests/test_useful_functions.py @@ -0,0 +1,314 @@ +import unittest +import sys +import os + +# Add parent of pyGeno to path so we can import tools directly +_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _root not in sys.path: + sys.path.insert(0, _root) + +from tools.UsefulFunctions import ( + complement, reverseComplement, complementTab, reverseComplementTab, + translateDNA, translateDNA_6Frames, + findAll, + encodePolymorphicNucleotide, decodePolymorphicNucleotide, + decodePolymorphicNucleotide_str, + getSequenceCombinaisons, polymorphicCodonCombinaisons, + getNucleotideCodon, showDifferences, + nucleotides, polymorphicNucleotides, codonTable, translTable, + UnknownNucleotide, +) + + +class TestComplement(unittest.TestCase): + + def test_basic_complement(self): + self.assertEqual(complement("ATCG"), "TAGC") + + def test_complement_is_not_reversed(self): + self.assertEqual(complement("AAAT"), "TTTA") + + def test_complement_lowercase(self): + self.assertEqual(complement("atcg"), "tagc") + + def test_complement_mixed_case(self): + self.assertEqual(complement("AaTt"), "TtAa") + + def test_complement_iupac_codes(self): + self.assertEqual(complement("R"), "Y") + self.assertEqual(complement("Y"), "R") + self.assertEqual(complement("M"), "K") + self.assertEqual(complement("K"), "M") + self.assertEqual(complement("W"), "W") + self.assertEqual(complement("S"), "S") + self.assertEqual(complement("N"), "N") + + def test_complement_empty(self): + self.assertEqual(complement(""), "") + + +class TestReverseComplement(unittest.TestCase): + + def test_basic_rc(self): + self.assertEqual(reverseComplement("ATCG"), "CGAT") + + def test_rc_palindrome(self): + self.assertEqual(reverseComplement("AATT"), "AATT") + + def test_rc_single_base(self): + self.assertEqual(reverseComplement("A"), "T") + + def test_rc_empty(self): + self.assertEqual(reverseComplement(""), "") + + +class TestComplementTab(unittest.TestCase): + + def test_basic_list(self): + result = complementTab(["A", "T", "C", "G"]) + self.assertEqual(result, ["T", "A", "G", "C"]) + + def test_empty_string_in_list(self): + result = complementTab(["A", "", "G"]) + self.assertEqual(result, ["T", "", "C"]) + + def test_insertion_in_list(self): + result = complementTab(["ACT"]) + self.assertEqual(result, [reverseComplement("ACT")]) + + def test_empty_list(self): + self.assertEqual(complementTab([]), []) + + +class TestReverseComplementTab(unittest.TestCase): + + def test_basic(self): + result = reverseComplementTab(["A", "T", "C", "G"]) + self.assertEqual(result, ["C", "G", "A", "T"]) + + +class TestTranslateDNA(unittest.TestCase): + + def test_start_codon(self): + self.assertEqual(translateDNA("ATG"), "M") + + def test_stop_codon_TAA(self): + self.assertEqual(translateDNA("TAA"), "*") + + def test_stop_codon_TAG(self): + self.assertEqual(translateDNA("TAG"), "*") + + def test_stop_codon_TGA(self): + self.assertEqual(translateDNA("TGA"), "*") + + def test_multiple_codons(self): + self.assertEqual(translateDNA("ATGTTT"), "MF") + + def test_frame_f2(self): + # f2 skips first nucleotide + self.assertEqual(translateDNA("AATG", frame="f2"), "M") + + def test_frame_f3(self): + # f3 skips first two nucleotides + self.assertEqual(translateDNA("AAATG", frame="f3"), "M") + + def test_frame_r1(self): + # reverse complement then translate + seq = "CAT" # RC = ATG -> M + self.assertEqual(translateDNA(seq, frame="r1"), "M") + + def test_incomplete_codon_ignored(self): + # trailing 1-2 nucleotides are ignored + self.assertEqual(translateDNA("ATGA"), "M") + self.assertEqual(translateDNA("ATGAT"), "M") + + def test_unknown_frame_raises(self): + with self.assertRaises(ValueError): + translateDNA("ATG", frame="x9") + + def test_all_64_codons(self): + for codon, aa in codonTable.items(): + if '!' not in codon: + self.assertEqual(translateDNA(codon), aa) + + def test_mitochondrial_table(self): + # In mt table, AGA is a stop codon + self.assertEqual(translateDNA("AGA", translTable_id="mt"), "*") + # In mt table, TGA is W instead of stop + self.assertEqual(translateDNA("TGA", translTable_id="mt"), "W") + + +class TestTranslateDNA6Frames(unittest.TestCase): + + def test_returns_6_frames(self): + result = translateDNA_6Frames("ATGATGATG") + self.assertEqual(len(result), 6) + + def test_f1_matches_single_call(self): + seq = "ATGATGATG" + result = translateDNA_6Frames(seq) + self.assertEqual(result[0], translateDNA(seq, "f1")) + + +class TestFindAll(unittest.TestCase): + + def test_basic_find(self): + self.assertEqual(findAll("ATGATGATG", "ATG"), [0, 3, 6]) + + def test_no_match(self): + self.assertEqual(findAll("AAAA", "CC"), []) + + def test_single_match(self): + self.assertEqual(findAll("AATGA", "ATG"), [1]) + + def test_overlapping_not_found(self): + # findAll doesn't find overlapping matches + self.assertEqual(findAll("AAA", "AA"), [0]) + + def test_empty_haystack(self): + self.assertEqual(findAll("", "ATG"), []) + + +class TestEncodePolymorphicNucleotide(unittest.TestCase): + + def test_single_nucleotide(self): + self.assertEqual(encodePolymorphicNucleotide("A"), "A") + + def test_two_nucleotides_AG(self): + self.assertEqual(encodePolymorphicNucleotide("AG"), "R") + + def test_two_nucleotides_CT(self): + self.assertEqual(encodePolymorphicNucleotide("CT"), "Y") + + def test_two_nucleotides_AC(self): + self.assertEqual(encodePolymorphicNucleotide("AC"), "M") + + def test_two_nucleotides_TG(self): + self.assertEqual(encodePolymorphicNucleotide("TG"), "K") + + def test_two_nucleotides_AT(self): + self.assertEqual(encodePolymorphicNucleotide("AT"), "W") + + def test_two_nucleotides_CG(self): + self.assertEqual(encodePolymorphicNucleotide("CG"), "S") + + def test_three_nucleotides_CGT(self): + self.assertEqual(encodePolymorphicNucleotide("CGT"), "B") + + def test_three_nucleotides_AGT(self): + self.assertEqual(encodePolymorphicNucleotide("AGT"), "D") + + def test_three_nucleotides_ACT(self): + self.assertEqual(encodePolymorphicNucleotide("ACT"), "H") + + def test_three_nucleotides_ACG(self): + self.assertEqual(encodePolymorphicNucleotide("ACG"), "V") + + def test_four_nucleotides(self): + self.assertEqual(encodePolymorphicNucleotide("ACGT"), "N") + + def test_slash_separated(self): + self.assertEqual(encodePolymorphicNucleotide("A/G"), "R") + + def test_list_input(self): + self.assertEqual(encodePolymorphicNucleotide(["A", "G"]), "R") + + def test_iupac_input_expands(self): + # R = A/G, if passed R it should return R + self.assertEqual(encodePolymorphicNucleotide("R"), "R") + + +class TestDecodePolymorphicNucleotide(unittest.TestCase): + + def test_decode_R(self): + self.assertEqual(sorted(decodePolymorphicNucleotide("R")), ["A", "G"]) + + def test_decode_Y(self): + self.assertEqual(sorted(decodePolymorphicNucleotide("Y")), ["C", "T"]) + + def test_decode_N(self): + self.assertEqual(sorted(decodePolymorphicNucleotide("N")), ["A", "C", "G", "T"]) + + def test_decode_regular_nucleotide(self): + self.assertEqual(decodePolymorphicNucleotide("A"), "A") + + def test_decode_invalid_raises(self): + with self.assertRaises(ValueError): + decodePolymorphicNucleotide("X") + + def test_str_version(self): + self.assertEqual(decodePolymorphicNucleotide_str("R"), "A/G") + + +class TestGetSequenceCombinaisons(unittest.TestCase): + + def test_no_polymorphisms(self): + self.assertEqual(getSequenceCombinaisons("ATG"), ["ATG"]) + + def test_single_polymorphism(self): + result = sorted(getSequenceCombinaisons("ARG")) + self.assertEqual(result, sorted(["AAG", "AGG"])) + + def test_multiple_polymorphisms(self): + result = sorted(getSequenceCombinaisons("RY")) + expected = sorted(["AC", "AT", "GC", "GT"]) + self.assertEqual(result, expected) + + +class TestPolymorphicCodonCombinaisons(unittest.TestCase): + + def test_no_ambiguity(self): + self.assertEqual(polymorphicCodonCombinaisons(["A", "T", "G"]), ["ATG"]) + + def test_with_ambiguity(self): + result = sorted(polymorphicCodonCombinaisons(["R", "T", "G"])) + self.assertEqual(result, sorted(["ATG", "GTG"])) + + +class TestGetNucleotideCodon(unittest.TestCase): + + def test_first_codon_pos0(self): + codon, pos = getNucleotideCodon("ATGCCC", 0) + self.assertEqual(codon, "ATG") + self.assertEqual(pos, 0) + + def test_first_codon_pos1(self): + codon, pos = getNucleotideCodon("ATGCCC", 1) + self.assertEqual(codon, "ATG") + self.assertEqual(pos, 1) + + def test_first_codon_pos2(self): + codon, pos = getNucleotideCodon("ATGCCC", 2) + self.assertEqual(codon, "ATG") + self.assertEqual(pos, 2) + + def test_second_codon(self): + codon, pos = getNucleotideCodon("ATGCCC", 3) + self.assertEqual(codon, "CCC") + self.assertEqual(pos, 0) + + def test_out_of_range(self): + self.assertIsNone(getNucleotideCodon("ATG", 5)) + + def test_negative_index(self): + self.assertIsNone(getNucleotideCodon("ATG", -1)) + + +class TestShowDifferences(unittest.TestCase): + + def test_identical_sequences(self): + result = showDifferences("ATG", "ATG") + self.assertIn("-", result) + self.assertNotIn("|", result) + + def test_different_sequences(self): + result = showDifferences("ATG", "ATC") + self.assertIn("|", result) + + def test_different_lengths(self): + result = showDifferences("ATGC", "AT") + self.assertIn("#", result) + + +if __name__ == "__main__": + unittest.main() diff --git a/pyGeno/tools/BinarySequence.py b/pyGeno/tools/BinarySequence.py index 3aff9df..b7606be 100644 --- a/pyGeno/tools/BinarySequence.py +++ b/pyGeno/tools/BinarySequence.py @@ -1,12 +1,12 @@ import array, copy -import UsefulFunctions as uf +import icecream as ic class BinarySequence : """A class for representing sequences in a binary format""" - ALPHABETA_SIZE = 32 - ALPHABETA_KMP = range(ALPHABETA_SIZE) - + ALPHABETA_SIZE = 32 + ALPHABETA_KMP = range(ALPHABETA_SIZE) + def __init__(self, sequence, arrayForma, charToBinDict) : self.forma = arrayForma @@ -16,8 +16,7 @@ def __init__(self, sequence, arrayForma, charToBinDict) : self.binSequence, self.defaultSequence, self.polymorphisms = self.encode(sequence) self.itemsize = self.binSequence.itemsize self.typecode = self.binSequence.typecode - #print 'bin', len(self.sequence), len(self.binSequence) - + #print 'bin', len(self.sequence), len(self.binSequence) def encode(self, sequence): """Returns a tuple (binary reprensentation, default sequence, polymorphisms list)""" @@ -28,6 +27,11 @@ def encode(self, sequence): i = 0 trueI = 0 #not inc in case if poly poly = set() + + # Handle stop variant at the end + if sequence.endswith('/'): + sequence += '*' + while i < len(sequence)-1: b = b | self.forma[self.charToBin[sequence[i]]] if sequence[i+1] == '/' : @@ -40,9 +44,8 @@ def encode(self, sequence): polymorphisms.append((trueI, poly)) poly = set() - bb = 0 while b % 2 != 0 : - b = b/2 + b = b//2 defaultSequence += sequence[i] b = 0 @@ -110,7 +113,7 @@ def __getSequenceVariants(self, x1, polyStart, polyStop, listSequence) : pk = self.polymorphisms[polyStart] posInSequence = pk[0]-x1 if posInSequence < len(listSequence) : - for allele in pk[1] : + for allele in sorted(pk[1]): sequence[posInSequence] = allele ret.extend(self.__getSequenceVariants(x1, polyStart+1, polyStop, sequence)) @@ -169,7 +172,7 @@ def _dichFind(self, needle, currHaystack, offset, lst = None) : if len(currHaystack) == 1 : if (offset <= (len(self) - len(needle))) and (currHaystack[0] & needle[0]) > 0 and (self[offset+len(needle)-1] & needle[-1]) > 0 : found = True - for i in xrange(1, len(needle)-1) : + for i in range(1, len(needle)-1) : if self[offset + i] & needle[i] == 0 : found = False break @@ -184,59 +187,59 @@ def _dichFind(self, needle, currHaystack, offset, lst = None) : else : if (offset <= (len(self) - len(needle))) : if lst is not None : - self._dichFind(needle, currHaystack[:len(currHaystack)/2], offset, lst) - self._dichFind(needle, currHaystack[len(currHaystack)/2:], offset + len(currHaystack)/2, lst) + self._dichFind(needle, currHaystack[:len(currHaystack)//2], offset, lst) + self._dichFind(needle, currHaystack[len(currHaystack)//2:], offset + len(currHaystack)//2, lst) else : - v1 = self._dichFind(needle, currHaystack[:len(currHaystack)/2], offset, lst) + v1 = self._dichFind(needle, currHaystack[:len(currHaystack)//2], offset, lst) if v1 > -1 : return v1 - return self._dichFind(needle, currHaystack[len(currHaystack)/2:], offset + len(currHaystack)/2, lst) + return self._dichFind(needle, currHaystack[len(currHaystack)//2:], offset + len(currHaystack)//2, lst) return -1 - def _kmp_construct_next(self, pattern): - """the helper function for KMP-string-searching is to construct the DFA. pattern should be an integer array. return a 2D array representing the DFA for moving the pattern.""" - next = [[0 for state in pattern] for input_token in self.ALPHABETA_KMP] - next[pattern[0]][0] = 1 - restart_state = 0 - for state in range(1, len(pattern)): - for input_token in self.ALPHABETA_KMP: - next[input_token][state] = next[input_token][restart_state] - next[pattern[state]][state] = state + 1 - restart_state = next[pattern[state]][restart_state] - return next + def _kmp_construct_next(self, pattern): + """the helper function for KMP-string-searching is to construct the DFA. pattern should be an integer array. return a 2D array representing the DFA for moving the pattern.""" + next = [[0 for state in pattern] for input_token in self.ALPHABETA_KMP] + next[pattern[0]][0] = 1 + restart_state = 0 + for state in range(1, len(pattern)): + for input_token in self.ALPHABETA_KMP: + next[input_token][state] = next[input_token][restart_state] + next[pattern[state]][state] = state + 1 + restart_state = next[pattern[state]][restart_state] + return next - def _kmp_search_first(self, pInput_sequence, pPattern): - """use KMP algorithm to search the first occurrence in the input_sequence of the pattern. both arguments are integer arrays. return the position of the occurence if found; otherwise, -1.""" - input_sequence, pattern = pInput_sequence, [len(bin(e)) for e in pPattern] - n, m = len(input_sequence), len(pattern) - d = p = 0 - next = self._kmp_construct_next(pattern) - while d < n and p < m: - p = next[len(bin(input_sequence[d]))][p] - d += 1 - if p == m: return d - p - else: return -1 + def _kmp_search_first(self, pInput_sequence, pPattern): + """use KMP algorithm to search the first occurrence in the input_sequence of the pattern. both arguments are integer arrays. return the position of the occurence if found; otherwise, -1.""" + input_sequence, pattern = pInput_sequence, [len(bin(e)) for e in pPattern] + n, m = len(input_sequence), len(pattern) + d = p = 0 + next = self._kmp_construct_next(pattern) + while d < n and p < m: + p = next[len(bin(input_sequence[d]))][p] + d += 1 + if p == m: return d - p + else: return -1 - def _kmp_search_all(self, pInput_sequence, pPattern): - """use KMP algorithm to search all occurrence in the input_sequence of the pattern. both arguments are integer arrays. return a list of the positions of the occurences if found; otherwise, [].""" - r = [] - input_sequence, pattern = [len(bin(e)) for e in pInput_sequence], [len(bin(e)) for e in pPattern] - n, m = len(input_sequence), len(pattern) - d = p = 0 - next = self._kmp_construct_next(pattern) - while d < n: - p = next[input_sequence[d]][p] - d += 1 - if p == m: - r.append(d - m) - p = 0 - return r + def _kmp_search_all(self, pInput_sequence, pPattern): + """use KMP algorithm to search all occurrence in the input_sequence of the pattern. both arguments are integer arrays. return a list of the positions of the occurences if found; otherwise, [].""" + r = [] + input_sequence, pattern = [len(bin(e)) for e in pInput_sequence], [len(bin(e)) for e in pPattern] + n, m = len(input_sequence), len(pattern) + d = p = 0 + next = self._kmp_construct_next(pattern) + while d < n: + p = next[input_sequence[d]][p] + d += 1 + if p == m: + r.append(d - m) + p = 0 + return r - def _kmp_find(self, needle, haystack, lst = None): + def _kmp_find(self, needle, haystack, lst = None): """find with KMP-searching. needle is an integer array, reprensenting a pattern. haystack is an integer array, reprensenting the input sequence. if lst is None, return the first position found or -1 if no match found. If it's a list, will return a list of all positions in lst. returns -1 or [] if no match found.""" - if lst != None: return self._kmp_search_all(haystack, needle) - else: return self._kmp_search_first(haystack, needle) + if lst != None: return self._kmp_search_all(haystack, needle) + else: return self._kmp_search_first(haystack, needle) def findByBiSearch(self, strSeq) : """returns the first occurence of strSeq in self. Takes polymorphisms into account""" @@ -253,13 +256,13 @@ def findAllByBiSearch(self, strSeq) : def find(self, strSeq) : """returns the first occurence of strSeq in self. Takes polymorphisms into account""" arr = self.encode(strSeq) - return self._kmp_find(arr[0], self) + return self._kmp_find(arr[0], self) def findAll(self, strSeq) : """Same as find but returns a list of all occurences""" arr = self.encode(strSeq) lst = [] - lst = self._kmp_find(arr[0], self, lst) + lst = self._kmp_find(arr[0], self, lst) return lst def __and__(self, arr) : @@ -299,7 +302,7 @@ def decode(self, binSequence): """decodes a binary sequence to return a string""" try: binSeq = iter(binSequence[0]) - except TypeError, te: + except TypeError: binSeq = binSequence ret = '' @@ -330,7 +333,7 @@ class AABinarySequence(BinarySequence) : """A binary sequence of amino acids""" def __init__(self, sequence): - f = array.array('I', [1L, 2L, 4L, 8L, 16L, 32L, 64L, 128L, 256L, 512L, 1024L, 2048L, 4096L, 8192L, 16384L, 32768L, 65536L, 131072L, 262144L, 524288L, 1048576L, 2097152L]) + f = array.array('I', [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152]) c = {'A': 17, 'C': 14, 'E': 19, 'D': 15, 'G': 13, 'F': 16, 'I': 3, 'H': 9, 'K': 8, '*': 1, 'M': 20, 'L': 0, 'N': 4, 'Q': 11, 'P': 6, 'S': 7, 'R': 5, 'T': 2, 'W': 10, 'V': 18, 'Y': 12, 'U': 21} BinarySequence.__init__(self, sequence, f, c) @@ -346,7 +349,7 @@ def __init__(self, sequence): 'D' : 'A/G/T', 'H' : 'A/C/T', 'V' : 'A/C/G', 'N': 'A/C/G/T' } lstSeq = list(sequence) - for i in xrange(len(lstSeq)) : + for i in range(len(lstSeq)) : if lstSeq[i] in ce : lstSeq[i] = ce[lstSeq[i]] lstSeq = ''.join(lstSeq) @@ -364,56 +367,56 @@ def test0() : r = bSeq.getSequenceVariants(start, stop) #print start, stop, 'nb_comb_r', len(r[1]), set(rB[1])==set(r[1]) - print start, stop#, 'nb_comb_r', len(r[1]), set(rB[1])==set(r[1]) + print(start, stop)#, 'nb_comb_r', len(r[1]), set(rB[1])==set(r[1]) #if set(rB[1])!=set(r[1]) : - print '-AV-' - print start, stop, 'nb_comb_r', len(rB[1]) - print '\n'.join(rB[1]) - print '=AP========' - print start, stop, 'nb_comb_r', len(r[1]) - print '\n'.join(r[1]) + print('-AV-') + print(start, stop, 'nb_comb_r', len(rB[1])) + print('\n'.join(rB[1])) + print('=AP========') + print(start, stop, 'nb_comb_r', len(r[1])) + print('\n'.join(r[1])) def testVariants() : seq = 'ATGAGTTTGCCGCGCN' bSeq = NucBinarySequence(seq) - print bSeq.getSequenceVariants() + print(bSeq.getSequenceVariants()) testVariants() - from random import randint - alphabeta = ['A', 'C', 'G', 'T'] - seq = '' - for _ in range(8192): - seq += alphabeta[randint(0, 3)] - seq += 'ATGAGTTTGCCGCGCN' - bSeq = NucBinarySequence(seq) + from random import randint + alphabeta = ['A', 'C', 'G', 'T'] + seq = '' + for _ in range(8192): + seq += alphabeta[randint(0, 3)] + seq += 'ATGAGTTTGCCGCGCN' + bSeq = NucBinarySequence(seq) - ROUND = 512 - PATTERN = 'GCGC' + ROUND = 512 + PATTERN = 'GCGC' - def testFind(): - for i in range(ROUND): - bSeq.find(PATTERN) + def testFind(): + for i in range(ROUND): + bSeq.find(PATTERN) - def testFindByBiSearch(): - for i in range(ROUND): - bSeq.findByBiSearch(PATTERN) - - def testFindAll(): - for i in range(ROUND): - bSeq.findAll(PATTERN) + def testFindByBiSearch(): + for i in range(ROUND): + bSeq.findByBiSearch(PATTERN) + + def testFindAll(): + for i in range(ROUND): + bSeq.findAll(PATTERN) - def testFindAllByBiSearch(): - for i in range(ROUND): - bSeq.findAllByBiSearch(PATTERN) + def testFindAllByBiSearch(): + for i in range(ROUND): + bSeq.findAllByBiSearch(PATTERN) - import cProfile - print('find:\n') + import cProfile + print('find:\n') cProfile.run('testFind()') - print('findAll:\n') - cProfile.run('testFindAll()') - print('findByBiSearch:\n') - cProfile.run('testFindByBiSearch()') - print('findAllByBiSearch:\n') - cProfile.run('testFindAllByBiSearch()') + print('findAll:\n') + cProfile.run('testFindAll()') + print('findByBiSearch:\n') + cProfile.run('testFindByBiSearch()') + print('findAllByBiSearch:\n') + cProfile.run('testFindAllByBiSearch()') diff --git a/pyGeno/tools/ProgressBar.py b/pyGeno/tools/ProgressBar.py index 16667ad..e592b80 100644 --- a/pyGeno/tools/ProgressBar.py +++ b/pyGeno/tools/ProgressBar.py @@ -1,4 +1,4 @@ -import sys, time, cPickle +import sys, time, pickle class ProgressBar : """A very simple unthreaded progress bar. This progress bar also logs stats in .logs. @@ -68,7 +68,7 @@ def log(self) : def saveLogs(self, filename) : """dumps logs into a nice pickle""" f = open(filename, 'wb') - cPickle.dump(self.logs, f) + pickle.dump(self.logs, f) f.close() def update(self, label = '', forceRefresh = False, log = False) : @@ -116,11 +116,11 @@ def update(self, label = '', forceRefresh = False, log = False) : def close(self) : """Closes the bar so your next print will be on another line""" self.update(forceRefresh = True) - print '\n' + print('\n') if __name__ == "__main__" : p = ProgressBar(nbEpochs = 100000000000) - for i in xrange(100000000000) : + for i in range(100000000000) : p.update() #time.sleep(3) p.close() diff --git a/pyGeno/tools/SegmentTree.py b/pyGeno/tools/SegmentTree.py index 0296705..121b06f 100644 --- a/pyGeno/tools/SegmentTree.py +++ b/pyGeno/tools/SegmentTree.py @@ -1,4 +1,4 @@ -import random, copy, types +import random, copy def aux_insertTree(childTree, parentTree): """This a private (You shouldn't have to call it) recursive function that inserts a child tree into a parent tree.""" @@ -54,7 +54,7 @@ class SegmentTree : """ def __init__(self, x1 = None, x2 = None, name = '', referedObject = [], father = None, level = 0) : - if x1 > x2 : + if x1 is not None and x2 is not None and x1 > x2 : self.x1, self.x2 = x2, x1 else : self.x1, self.x2 = x1, x2 @@ -98,7 +98,7 @@ def insert(self, x1, x2, name = '', referedObject = []) : elif xx1 <= self.children[i].x1 and self.children[i].x2 <= xx2 : if rt == None : - if type(referedObject) is types.ListType : + if type(referedObject) is list : rt = SegmentTree(xx1, xx2, name, referedObject, self, self.level+1) else : rt = SegmentTree(xx1, xx2, name, [referedObject], self, self.level+1) @@ -118,7 +118,7 @@ def insert(self, x1, x2, name = '', referedObject = []) : for c in childrenToRemove : self.children.remove(c) else : - if type(referedObject) is types.ListType : + if type(referedObject) is list : rt = SegmentTree(xx1, xx2, name, referedObject, self, self.level+1) else : rt = SegmentTree(xx1, xx2, name, [referedObject], self, self.level+1) @@ -163,15 +163,15 @@ def condition(x1, x2, tree) : if xx1 < self.children[c1].x1 : c1 -= 1 - inter = self.__radiateDown(x1, x2, c1, condition) + inter = self.__radiateDown(xx1, xx2, c1, condition) if self.children[c1].id == self.children[c2].id : - inter.extend(self.__radiateUp(x1, x2, c2+1, condition)) + inter.extend(self.__radiateUp(xx1, xx2, c2+1, condition)) else : - inter.extend(self.__radiateUp(x1, x2, c2, condition)) + inter.extend(self.__radiateUp(xx1, xx2, c2, condition)) ret = [] for c in inter : - ret.extend(c.intersect(x1, x2)) + ret.extend(c.intersect(xx1, xx2)) inter.extend(ret) return inter @@ -181,7 +181,7 @@ def __dichotomicSearch(self, x1) : r2 = len(self.children)-1 pos = -1 while (r1 <= r2) : - pos = (r1+r2)/2 + pos = (r1+r2)//2 val = self.children[pos].x1 if val == x1 : @@ -350,19 +350,19 @@ def __repr__(self): s.insert(35, 38, 'region 5') s.insert(36, 37, 'region 6', 'aaa') s.insert(36, 37, 'region 6', 'aaa2') - print "Tree:" - print s - print "indexed length", s.getIndexedLength() - print "removing gaps and adding region 7 : [13-37[" + print("Tree:") + print(s) + print("indexed length", s.getIndexedLength()) + print("removing gaps and adding region 7 : [13-37[") s.removeGaps() #s.insert(13, 37, 'region 7') - print s - print "indexed length", s.getIndexedLength() + print(s) + print("indexed length", s.getIndexedLength()) #print "intersections" #for c in [6, 10, 14, 1000] : # print c, s.intersect(c) - print "Move" + print("Move") s.move(0) - print s - print "indexed length", s.getIndexedLength() + print(s) + print("indexed length", s.getIndexedLength()) diff --git a/pyGeno/tools/Stats.py b/pyGeno/tools/Stats.py index b1612f8..fbe46d5 100644 --- a/pyGeno/tools/Stats.py +++ b/pyGeno/tools/Stats.py @@ -2,8 +2,8 @@ def kullback_leibler(p, q) : """Discrete Kullback-Leibler divergence D(P||Q)""" - p = np.asarray(p, dtype=np.float) - q = np.asarray(q, dtype=np.float) + p = np.asarray(p, dtype=np.float32) + q = np.asarray(q, dtype=np.float32) if p.shape != q.shape : raise ValueError("p and q must be of the same dimensions") @@ -11,8 +11,8 @@ def kullback_leibler(p, q) : return np.sum(np.where(p > 0, np.log(p / q) * p, 0)) def squaredError_log10(p, q) : - p = np.asarray(p, dtype=np.float) - q = np.asarray(q, dtype=np.float) + p = np.asarray(p, dtype=np.float32) + q = np.asarray(q, dtype=np.float32) if p.shape != q.shape : raise ValueError("p and q must be of the same dimensions") diff --git a/pyGeno/tools/UsefulFunctions.py b/pyGeno/tools/UsefulFunctions.py index 19c0618..a614d5e 100644 --- a/pyGeno/tools/UsefulFunctions.py +++ b/pyGeno/tools/UsefulFunctions.py @@ -1,4 +1,4 @@ -import string, os, copy, types +import os, copy class UnknownNucleotide(Exception) : def __init__(self, nuc) : @@ -15,7 +15,7 @@ def saveResults(directoryName, fileName, strResults, log = '', args = ''): resPath = "%s/%s"%(directoryName, fileName) resFile = open(resPath, 'w') - print "Saving results :\n\t%s..."%resPath + print("Saving results :\n\t%s..."%resPath) resFile.write(strResults) resFile.close() @@ -23,7 +23,7 @@ def saveResults(directoryName, fileName, strResults, log = '', args = ''): errPath = "%s.err.txt"%(resPath) errFile = open(errPath, 'w') - print "Saving log :\n\t%s..." %errPath + print("Saving log :\n\t%s..." %errPath) errFile.write(log) errFile.close() @@ -31,7 +31,7 @@ def saveResults(directoryName, fileName, strResults, log = '', args = ''): paramPath = "%s.args.txt"%(resPath) paramFile = open(paramPath, 'w') - print "Saving arguments :\n\t%s..." %paramPath + print("Saving arguments :\n\t%s..." %paramPath) paramFile.write(args) paramFile.close() @@ -187,8 +187,8 @@ def reverseComplement(seq): def complement(seq) : """returns the complementary sequence without inversing it""" - tb = string.maketrans("ACGTRYMKWSBDHVNacgtrymkwsbdhvn", - "TGCAYRKMWSVHDBNtgcayrkmwsvhdbn") + tb = str.maketrans("ACGTRYMKWSBDHVNacgtrymkwsbdhvn", + "TGCAYRKMWSVHDBNtgcayrkmwsvhdbn") #just to be sure that seq isn't unicode return str(seq).translate(tb) @@ -196,14 +196,14 @@ def complement(seq) : def translateDNA_6Frames(sequence) : """returns 6 translation of sequence. One for each reading frame""" trans = ( - translateDNA(sequence, 'f1'), - translateDNA(sequence, 'f2'), - translateDNA(sequence, 'f3'), + translateDNA(sequence, 'f1'), + translateDNA(sequence, 'f2'), + translateDNA(sequence, 'f3'), - translateDNA(sequence, 'r1'), - translateDNA(sequence, 'r2'), - translateDNA(sequence, 'r3'), - ) + translateDNA(sequence, 'r1'), + translateDNA(sequence, 'r2'), + translateDNA(sequence, 'r3'), + ) return trans @@ -252,7 +252,7 @@ def translateDNA(sequence, frame = 'f1', translTable_id='default') : def getSequenceCombinaisons(polymorphipolymorphicDnaSeqSeq, pos = 0) : """Takes a dna sequence with polymorphismes and returns all the possible sequences that it can yield""" - if type(polymorphipolymorphicDnaSeqSeq) is not types.ListType : + if type(polymorphipolymorphicDnaSeqSeq) is not list : seq = list(polymorphipolymorphicDnaSeqSeq) else : seq = polymorphipolymorphicDnaSeqSeq @@ -282,7 +282,7 @@ def encodePolymorphicNucleotide(polySeq) : in a single character. PolySeq must have one of the following forms: ['A', 'T', 'G'], 'ATG', 'A/T/G'""" - if type(polySeq) is types.StringType : + if type(polySeq) is str : if polySeq.find("/") < 0 : sseq = list(polySeq) else : @@ -348,18 +348,12 @@ def decodePolymorphicNucleotide_str(nuc) : def getNucleotideCodon(sequence, x1) : """Returns the entire codon of the nucleotide at pos x1 in sequence, and the position of that nocleotide in the codon in a tuple""" - - if x1 < 0 or x1 >= len(sequence) : + if 0 <= x1 < len(sequence): + p = x1 % 3 + return (sequence[x1 - p : x1 + 3 - p], p) + else: return None - p = x1%3 - if p == 0 : - return (sequence[x1: x1+3], 0) - elif p ==1 : - return (sequence[x1-1: x1+2], 1) - elif p == 2 : - return (sequence[x1-2: x1+1], 2) - def showDifferences(seq1, seq2) : """Returns a string highligthing differences between seq1 and seq2: @@ -394,7 +388,7 @@ def highlightSubsequence(sequence, x1, x2, start=' [', stop = '] ') : in bewteen 'start' and 'stop'""" seq = list(sequence) - print x1, x2-1, len(seq) + #print(x1, x2-1, len(seq)) seq[x1] = start + seq[x1] seq[x2-1] = seq[x2-1] + stop return ''.join(seq) diff --git a/pyGeno/tools/io.py b/pyGeno/tools/io.py index b3d940e..08b3ba2 100644 --- a/pyGeno/tools/io.py +++ b/pyGeno/tools/io.py @@ -3,21 +3,21 @@ def printf(*s) : 'print + sys.stdout.flush()' for e in s[:-1] : - print e, - print s[-1] + print(e, end=' ') + print(s[-1]) sys.stdout.flush() def enterConfirm_prompt(enterMsg) : stopi = False while not stopi : - print "====\n At any time you can quit by entering 'quit'\n====" - vali = raw_input(enterMsg) + print("====\n At any time you can quit by entering 'quit'\n====") + vali = input(enterMsg) if vali.lower() == 'quit' : vali = None stopi = True else : - print "You've entered:\n\t%s" % vali + print("You've entered:\n\t%s" % vali) valj = confirm_prompt("") if valj == 'yes' : stopi = True @@ -29,7 +29,7 @@ def enterConfirm_prompt(enterMsg) : def confirm_prompt(preMsg) : while True : - val = raw_input('%splease confirm ("yes", "no", "quit"): ' % preMsg) + val = input('%splease confirm ("yes", "no", "quit"): ' % preMsg) if val.lower() == 'yes' or val.lower() == 'no' or val.lower() == 'quit': return val.lower() diff --git a/pyGeno/tools/parsers/CSVTools.py b/pyGeno/tools/parsers/CSVTools.py index 17d9978..c085156 100644 --- a/pyGeno/tools/parsers/CSVTools.py +++ b/pyGeno/tools/parsers/CSVTools.py @@ -22,7 +22,7 @@ def removeDuplicates(inFileName, outFileName) : lines = f.readlines() for l in lines : - if not h.has_key(l) : + if l not in h : h[l] = 0 data += l @@ -52,7 +52,7 @@ def joinCSVs(csvFilePaths, column, ouputFileName, separator = ',') : c = CSVFile() c.parse(f) csvs.append(c) - legend.append(separator.join(c.legend.keys())) + legend.append(separator.join(list(c.legend.keys()))) legend = separator.join(legend) lines = [] @@ -97,10 +97,9 @@ def __init__(self, csvFile, lineNumber = None) : # for d in tmpData : d = tmpData[i] sd = d.strip() - # print sd, tmpData, i if len(sd) > 0 and sd[0] == csvFile.stringSeparator : more = [] - for i in xrange(i, len(tmpData)) : + for i in range(i, len(tmpData)) : more.append(tmpData[i]) i+=1 if more[-1][-1] == csvFile.stringSeparator : @@ -129,12 +128,12 @@ def __iter__(self) : self.currentField = -1 return self - def next(self) : + def __next__(self) : self.currentField += 1 if self.currentField >= len(self.csvFile.legend) : raise StopIteration - k = self.csvFile.legend.keys()[self.currentField] + k = list(self.csvFile.legend.keys())[self.currentField] v = self.data[self.currentField] return k, v @@ -159,7 +158,7 @@ def __setitem__(self, key, value) : try: self.data[field] = str(value) except Exception as e: - for i in xrange(field-len(self.data)+1) : + for i in range(field-len(self.data)+1) : self.data.append("") self.data[field] = str(value) @@ -168,7 +167,7 @@ def toStr(self) : def __repr__(self) : r = {} - for k, v in self.csvFile.legend.iteritems() : + for k, v in self.csvFile.legend.items() : r[k] = self.data[v] return "" %(self.lineNumber, str(r)) @@ -184,7 +183,7 @@ class CSVFile(object) : f = CSVFile() f.parse('hop.csv') for line in f : - print line['ref'] + print(line['ref']) #writing, legend can either be a list of a dict {field : column number} f = CSVFile(legend = ['name', 'email']) @@ -193,7 +192,7 @@ class CSVFile(object) : l['email'] = "hop@gmail.com" for field, value in l : - print field, value + print(field, value) f.save('myCSV.csv') """ @@ -244,7 +243,6 @@ def parse(self, filePath, skipLines=0, separator = ',', stringSeparator = '"', l self.lines = [] self.comments = [] for l in lines : - # print l if len(l) != 0 and l[0] != "#" : self.lines.append(l) elif l[0] == "#" : @@ -303,7 +301,7 @@ def commitLine(self, line) : self.streamBuffer.append(line) if len(self.streamBuffer) % self.writeRate == 0 : - for i in xrange(len(self.streamBuffer)) : + for i in range(len(self.streamBuffer)) : self.streamBuffer[i] = str(self.streamBuffer[i]) self.streamFile.write("%s\n" % ('\n'.join(self.streamBuffer))) self.streamFile.flush() @@ -314,7 +312,7 @@ def closeStreamToFile(self) : if self.streamBuffer is None : raise ValueError("Commit lines is only for when you are streaming to a file") - for i in xrange(len(self.streamBuffer)) : + for i in range(len(self.streamBuffer)) : self.streamBuffer[i] = str(self.streamBuffer[i]) self.streamFile.write('\n'.join(self.streamBuffer)) self.streamFile.close() @@ -378,7 +376,7 @@ def __iter__(self) : self.currentPos = -1 return self - def next(self) : + def __next__(self) : self.currentPos += 1 if self.currentPos >= len(self) : raise StopIteration @@ -395,7 +393,7 @@ def __getitem__(self, line) : if start is None : start = 0 - for l in xrange(len(self.lines[line])) : + for l in range(len(self.lines[line])) : self._developLine(l + start) # start, stop = line.start, line.stop diff --git a/pyGeno/tools/parsers/CasavaTools.py b/pyGeno/tools/parsers/CasavaTools.py index 4111aef..4c7599b 100644 --- a/pyGeno/tools/parsers/CasavaTools.py +++ b/pyGeno/tools/parsers/CasavaTools.py @@ -44,7 +44,7 @@ class SNPsTxtFile(object) : f = SNPsTxtFile('snps.txt') for line in f : - print line['ref'] + print(line['ref']) """ def __init__(self, fil, gziped = False) : @@ -52,7 +52,7 @@ def __init__(self, fil, gziped = False) : if not gziped : f = open(fil) else : - f = gzip.open(fil) + f = gzip.open(fil, 'rt') for l in f : if l[0] != '#' : @@ -69,7 +69,7 @@ def __iter__(self) : self.currentPos = 0 return self - def next(self) : + def __next__(self) : if self.currentPos >= len(self) : raise StopIteration() v = self[self.currentPos] diff --git a/pyGeno/tools/parsers/FastaTools.py b/pyGeno/tools/parsers/FastaTools.py index e09213a..19ec39f 100644 --- a/pyGeno/tools/parsers/FastaTools.py +++ b/pyGeno/tools/parsers/FastaTools.py @@ -8,7 +8,7 @@ class FastaFile(object) : f = FastaFile() f.parseFile('hop.fasta') for line in f : - print line + print(line) #writing f = FastaFile() @@ -74,10 +74,9 @@ def __iter__(self) : self.currentPos = 0 return self - def next(self) : + def __next__(self) : #self to call getitem, and split he line if necessary i = self.currentPos +1 - #print i-1, self.currentPos if i > len(self) : raise StopIteration() diff --git a/pyGeno/tools/parsers/FastqTools.py b/pyGeno/tools/parsers/FastqTools.py index 2e722fd..8918281 100644 --- a/pyGeno/tools/parsers/FastqTools.py +++ b/pyGeno/tools/parsers/FastqTools.py @@ -27,7 +27,7 @@ class FastqFile(object) : f = FastqFile() f.parse('hop.fastq') for line in f : - print line['sequence'] + print(line['sequence']) #writing, legend can either be a list of a dict {field : column number} f = CSVFile(legend = ['name', 'email']) @@ -99,10 +99,9 @@ def __iter__(self) : self.currentPos = 0 return self - def next(self) : + def __next__(self) : #self to call getitem, and split he line if necessary i = self.currentPos +1 - #print i-1, self.currentPos if i > len(self) : raise StopIteration() @@ -121,4 +120,4 @@ def __setitem__(self, i, v) : self.data[i] = v def __len__(self) : - return len(self.data)/4 + return len(self.data)//4 diff --git a/pyGeno/tools/parsers/GTFTools.py b/pyGeno/tools/parsers/GTFTools.py index b555140..9585ea4 100644 --- a/pyGeno/tools/parsers/GTFTools.py +++ b/pyGeno/tools/parsers/GTFTools.py @@ -1,81 +1,404 @@ import gzip +import os class GTFEntry(object) : - def __init__(self, gtfFile, lineNumber) : - """A single entry in a GTF file""" - - self.lineNumber = lineNumber - self.gtfFile = gtfFile - self.data = gtfFile.lines[lineNumber][:-2].split('\t') #-2 remove ';\n' - proto_atts = self.data[gtfFile.legend['attributes']].strip().split('; ') - atts = {} - for a in proto_atts : - sa = a.split(' ') - atts[sa[0]] = sa[1].replace('"', '') - self.data[gtfFile.legend['attributes']] = atts - - def __getitem__(self, k) : - try : - return self.data[self.gtfFile.legend[k]] - except KeyError : - try : - return self.data[self.gtfFile.legend['attributes']][k] - except KeyError : - #return None - raise KeyError("Line %d does not have an element %s.\nline:%s" %(self.lineNumber, k, self.gtfFile.lines[self.lineNumber])) - - def __repr__(self) : - return "" % self.lineNumber - - def __str__(self) : - return "" % (self.lineNumber, str(self.data)) + def __init__(self, gtfFile, lineNumber) : + """A single entry in a GTF file""" + + self.lineNumber = lineNumber + self.gtfFile = gtfFile + self.data = gtfFile.lines[lineNumber][:-2].split('\t') #-2 remove ';\n' + proto_atts = self.data[gtfFile.legend['attributes']].strip().split('; ') + atts = {} + for a in proto_atts : + sa = a.split(' ') + k = sa[0] + v = sa[1].replace('"', '') + if k not in atts: + atts[k] = v + else: + if type(atts[k]) != list: + atts[k] = [atts[k], v] + else: + atts[k].append(v) + self.data[gtfFile.legend['attributes']] = atts + + def __getitem__(self, k) : + try : + return self.data[self.gtfFile.legend[k]] + except KeyError : + try : + return self.data[self.gtfFile.legend['attributes']][k] + except KeyError : + #return None + raise KeyError( + "Line %d does not have an element %s.\nline:%s" % ( + self.lineNumber, k, self.gtfFile.lines[self.lineNumber]) + ) + + def __repr__(self) : + return "" % self.lineNumber + + def __str__(self) : + return "" % (self.lineNumber, str(self.data)) class GTFFile(object) : - """This is a simple GTF2.2 (Revised Ensembl GTF) parser, see http://mblab.wustl.edu/GTF22.html for more infos""" - def __init__(self, filename, gziped = False) : - - self.filename = filename - self.legend = {'seqname' : 0, 'source' : 1, 'feature' : 2, 'start' : 3, 'end' : 4, 'score' : 5, 'strand' : 6, 'frame' : 7, 'attributes' : 8} + """This is a simple GTF2.2 (Revised Ensembl GTF) parser, + see http://mblab.wustl.edu/GTF22.html for more infos + """ + def __init__(self, filename, gziped = False) : + + self.filename = filename + self.legend = { + 'seqname' : 0, 'source' : 1, 'feature' : 2, 'start' : 3, + 'end' : 4, 'score' : 5, 'strand' : 6, 'frame' : 7, 'attributes' : 8} + self.gziped = gziped - if gziped : - f = gzip.open(filename) - else : - f = open(filename) - - self.lines = [] - for l in f : - if l[0] != '#' and l != '' : - self.lines.append(l) - f.close() - - self.currentIt = -1 + if gziped : + f = gzip.open(filename, 'rt') + else : + f = open(filename) + + self.lines = [] + for l in f : + if l[0] != '#' and l != '' : + self.lines.append(l) + f.close() + + self.currentPos = -1 + + def get(self, line, elmt) : + """returns the value of the field 'elmt' of line 'line'""" + return self[line][elmt] - def get(self, line, elmt) : - """returns the value of the field 'elmt' of line 'line'""" - return self[line][elmt] + def get_transcripts(self, transcript_ids=None): + """returns genes with its transcripts and associated exons and CDSs from a GTF + if transcript_ids is used, only these transcripts will be returned + """ + + if transcript_ids is not None: + #transcript_ids = [y for x in transcript_ids for y in [x, x.split('.')[0]]] + transcript_ids = set(transcript_ids) if transcript_ids is not None else transcript_ids + + gtf = iter(self) + + end_reached = False + + # Iterate through all genes in file + + # first line + try: + line = next(gtf) + except: + raise ('Empty GTF file') + + # pop-out lines that come before first gene + while line['feature'] != 'gene': + try: + line = next(gtf) + except: + raise ('Never encountered a gene in the GTF') + + # Get all genes + while line['feature'] == 'gene': + gene = line + transcripts = [] + exons = [] + cdss = [] + start_codons = [] + stop_codons = [] + + # pop-out all lines that come before first transcript in gene + while line['feature'] != 'transcript': + try: + line = next(gtf) + except: + raise StopIteration('Last gene %s contains no transcripts' % gene['gene_id']) + + # Get all transcripts in gene + while line['feature'] == 'transcript': + tr = line + ex = [] + cds = [] + start_codon = [] + stop_codon = [] + + try: + line = next(gtf) + except: + raise StopIteration('Last transcript %s contains no features' % tr['transcript_id']) + + # Get all features in transcript + while line['feature'] != 'transcript' and line['feature'] != 'gene': + if line['feature'] == 'exon': + ex.append(line) + if line['feature'] == 'CDS': + cds.append(line) + if line['feature'] == 'start_codon': + start_codon.append(line) + if line['feature'] == 'stop_codon': + stop_codon.append(line) + + try: + line = next(gtf) + except StopIteration: + end_reached = True + break + + if transcript_ids is None or tr[0]['transcript_id'] in transcript_ids: + transcripts.append(tr) + exons.append(ex) + cdss.append(cds) + start_codons.append(start_codon) + stop_codons.append(stop_codon) + + assert len(transcripts) + yield (gene, transcripts, exons, cdss, start_codons, stop_codons) + + assert end_reached # in a normal GTF file, this value should be true + assert self.currentPos == len(self) + + def gtf2bed(self, bed_filename, feature='transcripts'): + """Transform gtf to bed6/bed12 and saves the output to file""" + + if feature == 'transcripts': + return self.gtf2bed_transcripts(bed_filename) + elif feature == 'exons': + return self.gtf2bed_exons(bed_filename) + elif feature == 'cds': + return self.gtf2bed_cds(bed_filename) + else: + raise ValueError('Accepted feature types are "transcripts", "exons" and "cds".') + + def gtf2bed_transcripts(self, bed_filename): + """Retrieves transcript information from gtf in bed12 format""" + + f_out = open(bed_filename, 'w') + for gene, transcripts, exons, cdss, start_codons, stop_codons in self.get_transcripts(): + for tr, ex, cds, stt, stp in zip(transcripts, exons, cdss, start_codons, stop_codons): + has_exon = bool(len(ex)) + assert has_exon + + has_stop_codon = bool(len(stp)) + if has_stop_codon: + stop_codon_length = 0 + for stp_cod in stp: + stop_codon_length += int(stp_cod['end']) - int(stp_cod['start']) + 1 + assert stop_codon_length == 3 + + chromosome = tr['seqname'] + tid = tr['transcript_id'] + strand = tr['strand'] + start = int(tr['start']) - 1 # 0-base + end = int(tr['end']) + + exon_lengths = [] + exon_start_pos = [] + + for ex_line in ex: + assert ex_line['transcript_id'] == tid + s = int(ex_line['start']) - 1 # 0-base + e = int(ex_line['end']) + exon_lengths.append(e-s) + exon_start_pos.append(s-start) + exon_number = ex_line['exon_number'] + + if len(cds): + cds_start_pos = float('inf') + cds_end_pos = -1 + for cds_line in cds: + assert cds_line['transcript_id'] == tid + s = int(cds_line['start']) - 1 # 0-base + e = int(cds_line['end']) + cds_start_pos = min(cds_start_pos, s) + cds_end_pos = max(cds_end_pos, e) + else: + cds_start_pos = start + cds_end_pos = end + + if strand == '+': + if has_stop_codon: + cds_end_pos += 3 + try: + assert cds_end_pos <= end + except AssertionError: + print("NOTE: Transcript %s has incorrect stop_codon annotation" % tid) + cds_end_pos -= 3 + elif strand == '-': + if has_stop_codon: + cds_start_pos -= 3 + try: + assert cds_start_pos >= start + except AssertionError: + print("NOTE: Transcript %s has incorrect stop_codon annotation" % tid) + cds_start_pos += 3 + exon_lengths = exon_lengths[::-1] + exon_start_pos = exon_start_pos[::-1] + else: + raise ValueError("Unrecognizable strand value: '%s'." % strand) + + entry = [chromosome, start, end, tid, '.', strand, cds_start_pos, + cds_end_pos, '.', exon_number, + ','.join([str(x) for x in exon_lengths]), + ','.join([str(x) for x in exon_start_pos])] + + f_out.write('\t'.join([str(x) for x in entry]) + '\n') + + f_out.close() + + return True + + def gtf2bed_exons(self, bed_filename, join_overlaps=True): + """Retrieves exon information from gtf in bed6 format""" + + chromosome_list = [] + exon_positions_dict = dict() + for gene, transcripts, exons, cdss, start_codons, stop_codons in self.get_transcripts(): + gene_exon_positions = [] + for tr, ex, cds, stt, stp in zip(transcripts, exons, cdss, start_codons, stop_codons): + if len(ex): + chromosome = tr['seqname'] + gid = tr['gene_id'] + strand = tr['strand'] + + if chromosome not in exon_positions_dict: + exon_positions_dict[chromosome] = [] + chromosome_list.append(chromosome) + + exon_positions = [(int(e['start'])-1, int(e['end'])) for e in ex] + + if strand == '+': + pass + elif strand == '-': + exon_positions = exon_positions[::-1] + else: + raise ValueError("Unrecognizable strand value: '%s'." % strand) + + gene_exon_positions.extend(exon_positions) + + # Join overlapping entries + if join_overlaps: + gene_exon_positions = sorted(list(set(gene_exon_positions))) + gene_exon_positions = self._join_ends(gene_exon_positions) + + for start, end in gene_exon_positions: + exon_positions_dict[chromosome].append((chromosome, start, end, gid, '.', strand)) + + f_out = open(bed_filename, 'w') + for chromosome in chromosome_list: + current_positions = list(set(exon_positions_dict[chromosome])) + current_positions = sorted(current_positions, key=lambda x: (x[1], x[2], x[3])) + for entry in current_positions: + f_out.write('\t'.join([str(x) for x in entry]) + '\n') + f_out.close() + + return True - def __iter__(self) : - self.currentPos = -1 - return self + def gtf2bed_cds(self, bed_filename, join_overlaps=True): + """Retrieves CDS information from gtf in bed6 format""" + + chromosome_list = [] + cds_positions_dict = dict() + for gene, transcripts, exons, cdss, start_codons, stop_codons in self.get_transcripts(): + gene_cds_positions = [] + for tr, ex, cds, stt, stp in zip(transcripts, exons, cdss, start_codons, stop_codons): + if len(cds): + chromosome = tr['seqname'] + gid = tr['gene_id'] + strand = tr['strand'] + + has_stop_codon = bool(len(stp)) + if has_stop_codon: + stop_codon_length = 0 + for stp_cod in stp: + stop_codon_length += int(stp_cod['end']) - int(stp_cod['start']) + 1 + assert stop_codon_length == 3 + + #try: + # has_stop_codon = 'cds_end_NF' not in set(tr['tag']) + #except KeyError: + # has_stop_codon = True + + if chromosome not in cds_positions_dict: + cds_positions_dict[chromosome] = [] + chromosome_list.append(chromosome) + + cds_positions = [(int(c['start'])-1, int(c['end'])) for c in cds] + + if strand == '+': + if has_stop_codon: + cds_positions[-1] = (cds_positions[-1][0], cds_positions[-1][1] + 3) + elif strand == '-': + if has_stop_codon: + cds_positions[-1] = (cds_positions[-1][0] - 3, cds_positions[-1][1]) + cds_positions = cds_positions[::-1] + else: + raise ValueError("Unrecognizable strand value: '%s'." % strand) + + gene_cds_positions.extend(cds_positions) + + # Join overlapping entries + if join_overlaps: + gene_cds_positions = sorted(list(set(gene_cds_positions))) + gene_cds_positions = self._join_ends(gene_cds_positions) + + for start, end in gene_cds_positions: + cds_positions_dict[chromosome].append((chromosome, start, end, gid, '.', strand)) + + f_out = open(bed_filename, 'w') + for chromosome in chromosome_list: + current_positions = list(set(cds_positions_dict[chromosome])) + current_positions = sorted(current_positions, key=lambda x: (x[1], x[2], x[3])) + for entry in current_positions: + f_out.write('\t'.join([str(x) for x in entry]) + '\n') + f_out.close() + + return True - def next(self) : - self.currentPos += 1 - try : - return GTFEntry(self, self.currentPos) - except IndexError: - raise StopIteration + def _join_ends(self, positions, kept=[]): + if not positions: + return kept + elif len(positions) == 1: + start, end = positions.pop(0) + return self._join_ends(positions, kept=kept+[(start, end)]) + else: + start, end = positions.pop(0) + start_next, end_next = positions[0] + while end >= start_next and len(positions): + start = min(start, start_next) + end = max(end, end_next) + _ = positions.pop(0) + if len(positions): + start_next, end_next = positions[0] + else: + break + return self._join_ends(positions, kept=kept+[(start, end)]) + + def __iter__(self) : + self.currentPos = -1 + return self - def __getitem__(self, i) : - """returns the ith entry""" - if self.lines[i].__class__ is not GTFEntry : - self.lines[i] = GTFEntry(self, i) - return self.lines[i] + def __next__(self) : + self.currentPos += 1 + try : + return GTFEntry(self, self.currentPos) + except IndexError: + raise StopIteration - def __repr__(self) : - return "" % (os.path.basename(self.filename)) + def __getitem__(self, i) : + """returns the ith entry""" + if self.lines[i].__class__ is not GTFEntry : + self.lines[i] = GTFEntry(self, i) + return self.lines[i] - def __str__(self) : - return "" % (os.path.basename(self.filename), self.gziped, len(self)) - - def __len__(self) : - return len(self.lines) + def __repr__(self) : + return "" % (os.path.basename(self.filename)) + + def __str__(self) : + return "" % ( + os.path.basename(self.filename), self.gziped, len(self), self.currentPos) + return "" % ( + os.path.basename(self.filename), self.gziped, len(self)) + + def __len__(self) : + return len(self.lines) diff --git a/pyGeno/tools/parsers/VCFTools.py b/pyGeno/tools/parsers/VCFTools.py index 5a6922f..0196e89 100644 --- a/pyGeno/tools/parsers/VCFTools.py +++ b/pyGeno/tools/parsers/VCFTools.py @@ -77,7 +77,7 @@ class VCFFile(object) : f = VCFFile() f.parse('hop.vcf') for line in f : - print line['pos'] + print(line['pos']) """ def __init__(self, filename = None, gziped = False, stream = False) : @@ -93,7 +93,7 @@ def parse(self, filename, gziped = False, stream = False) : self.stream = stream if gziped : - self.f = gzip.open(filename) + self.f = gzip.open(filename, 'rt') else : self.f = open(filename) @@ -153,7 +153,7 @@ def __iter__(self) : self.currentPos = -1 return self - def next(self) : + def __next__(self) : self.currentPos += 1 if not self.stream : try : @@ -201,7 +201,6 @@ def __str__(self) : i = 0 pBar = ProgressBar() for f in v : - #print f pBar.update('%s' % i) if i > 1000000 : break @@ -218,7 +217,4 @@ def __str__(self) : if i > 1000000 : break i += 1 - #print f pBar.close() - #print v.lines - diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f3d811c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,57 @@ +[build-system] +requires = ["setuptools>=64", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "pyGeno" +version = "2.0.1" +description = "A python package for Personalized Genomics and Proteomics" +readme = "DESCRIPTION.rst" +license = "Apache-2.0" +requires-python = ">=3.8" +authors = [ + { name = "Tariq Daouda", email = "tariq.daouda@umontreal.ca" }, +] +keywords = [ + "proteogenomics", + "genomics", + "proteomics", + "annotations", + "medicine", + "research", + "personalized", + "gene", + "sequence", + "protein", +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Science/Research", + "Intended Audience :: Healthcare Industry", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Application Frameworks", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", +] +dependencies = [ + "rabaDB>=2.0.0", +] + +[project.urls] +Homepage = "http://pyGeno.iric.ca" + +[project.scripts] +sample = "sample:main" + +[tool.setuptools.packages.find] +include = ["pyGeno*"] + +[tool.setuptools.package-data] +pyGeno = ["bootstrap_data/*/*"] diff --git a/setup.py b/setup.py index dd1778f..cada0dc 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ from setuptools import setup, find_packages -from codecs import open from os import path here = path.abspath(path.dirname(__file__)) @@ -11,7 +10,7 @@ setup( name='pyGeno', - version='1.3.2', + version='2.0.1', description='A python package for Personalized Genomics and Proteomics', long_description=long_description, @@ -21,7 +20,7 @@ author='Tariq Daouda', author_email='tariq.daouda@umontreal.ca', - test_suite="pyGeno.tests", + test_suite="pyGeno.tests", license='ApacheV2.0', @@ -41,9 +40,19 @@ 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', + 'Programming Language :: Python :: 3.14', ], + python_requires='>=3.8', + + keywords='proteogenomics genomics proteomics annotations medicine research personalized gene sequence protein', packages=find_packages(exclude=[]), @@ -52,13 +61,13 @@ # project is installed. For an analysis of "install_requires" vs pip's # requirements files see: # https://packaging.python.org/en/latest/technical.html#install-requires-vs-requirements-files - install_requires=['rabaDB >= 1.0.5'], + install_requires=['rabaDB >= 2.0.0'], # If there are data files included in your packages that need to be - # installed, specify them here. If using Python 2.6 or less, then these - # have to be included in MANIFEST.in as well. + # installed, specify them here. OBSOLETE [If using Python 2.6 or less, then these + # have to be included in MANIFEST.in as well.] package_data={ - '': ['*.txt', '*.rst', '*.tar.gz'], + 'pyGeno': ['bootstrap_data/*/*'], }, # Although 'package_data' is the preferred approach, in some case you may