|
| 1 | +'''Write TSV file of high-quality genome assemblies by organism |
| 2 | +
|
| 3 | +Output file has one row per assembly, with columns for: |
| 4 | +
|
| 5 | +* Organism scientific name (e.g. Homo sapiens, Macaca fascicularis) |
| 6 | +* Organism common name (e.g. human, crab-eating macaque) |
| 7 | +* Assembly name (e.g. GRCh38, Macaca_fascicularis_5.0) |
| 8 | +* Assembly accession (e.g. GCA_000001405.28, GCA_000364345.1) |
| 9 | +
|
| 10 | +EXAMPLES |
| 11 | +
|
| 12 | +python3 list_assemblies.py |
| 13 | +''' |
| 14 | + |
| 15 | +import argparse |
| 16 | +import json |
| 17 | +import math |
| 18 | +import os |
| 19 | +import urllib.request as request |
| 20 | + |
| 21 | +parser = argparse.ArgumentParser( |
| 22 | + description=__doc__, |
| 23 | + formatter_class=argparse.RawDescriptionHelpFormatter) |
| 24 | +parser.add_argument('--output-dir', |
| 25 | + help='Directory to send output data to', |
| 26 | + default='') |
| 27 | + |
| 28 | +args = parser.parse_args() |
| 29 | +output_dir = args.output_dir |
| 30 | + |
| 31 | +if len(output_dir) > 0 and output_dir[0] != '': |
| 32 | + output_dir = '/' + output_dir |
| 33 | + |
| 34 | +asms_path = 'assembly_summary.txt' |
| 35 | +asms_path_historical = 'assembly_summary_historical.txt' |
| 36 | + |
| 37 | +eutils_base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils' |
| 38 | +esummary_base = eutils_base + '/esummary.fcgi' |
| 39 | + |
| 40 | +def fetch_asm_summary(group, group_asms_path, group_asms_historical_path): |
| 41 | + '''Retrieve NCBI assembly summary TSV file by organism group |
| 42 | +
|
| 43 | + Fetched files are written locally as a cache |
| 44 | + ''' |
| 45 | + |
| 46 | + path_versions = { |
| 47 | + 'current': group_asms_path, |
| 48 | + 'historical': group_asms_historical_path, |
| 49 | + } |
| 50 | + for version in path_versions: |
| 51 | + path = path_versions[version] |
| 52 | + if version is 'current': |
| 53 | + leaf = f'{group}/{asms_path}' |
| 54 | + else: |
| 55 | + leaf = f'{group}/{asms_path_historical}' |
| 56 | + |
| 57 | + if os.path.exists(path) == False: |
| 58 | + # Example URL: |
| 59 | + # https://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/assembly_summary.txt |
| 60 | + url = f'https://ftp.ncbi.nlm.nih.gov/genomes/genbank/{leaf}' |
| 61 | + with request.urlopen(url) as response: |
| 62 | + data = response.read().decode('utf-8') |
| 63 | + with open(path, 'w') as f: |
| 64 | + f.write(data) |
| 65 | + |
| 66 | +def get_taxid_chunks(taxids): |
| 67 | + '''Return taxids in comma-delimited lists of 500 |
| 68 | + Needed because NCBI EUtils limits parameters to 500 values each. |
| 69 | + ''' |
| 70 | + taxid_chunks = [] |
| 71 | + |
| 72 | + eutils_limit = 500 |
| 73 | + |
| 74 | + num_requests = math.ceil(len(taxids) / eutils_limit) |
| 75 | + for num in range(0, num_requests): |
| 76 | + pos = num * eutils_limit |
| 77 | + upper = (num + 1) * eutils_limit |
| 78 | + if upper < len(taxids): |
| 79 | + size = upper |
| 80 | + else: |
| 81 | + size = pos + upper - len(taxids) |
| 82 | + taxids_segment = taxids[pos:size] |
| 83 | + taxids_chunk = ','.join(taxids_segment) |
| 84 | + taxid_chunks.append(taxids_chunk) |
| 85 | + |
| 86 | + return taxid_chunks |
| 87 | + |
| 88 | +def add_common_names(asms): |
| 89 | + '''Add organism common names (e.g. human) to genome assembly objects |
| 90 | + ''' |
| 91 | + # NCBI Taxonomy identifiers, an organism ID. Human: 9606, etc. |
| 92 | + taxids = [asm['taxid'] for asm in asms] |
| 93 | + taxids = list(set(taxids)) # deduplicate |
| 94 | + |
| 95 | + names_by_taxid = {} |
| 96 | + |
| 97 | + taxid_chunks = get_taxid_chunks(taxids) |
| 98 | + |
| 99 | + for taxid_chunk in taxid_chunks: |
| 100 | + eutils_url = f'{esummary_base}?db=taxonomy&format=json&id={taxid_chunk}' |
| 101 | + with request.urlopen(eutils_url) as response: |
| 102 | + data = json.loads(response.read().decode('utf-8')) |
| 103 | + taxid_json = data['result'] |
| 104 | + del taxid_json['uids'] # Remove placeholder from relevant data |
| 105 | + |
| 106 | + for taxid in taxid_json: |
| 107 | + names_by_taxid[taxid] = taxid_json[taxid]['commonname'] |
| 108 | + |
| 109 | + new_asms = [] |
| 110 | + for asm in asms: |
| 111 | + new_asm = asm |
| 112 | + new_asm['organism_common_name'] = names_by_taxid[asm['taxid']] |
| 113 | + new_asms.append(new_asm) |
| 114 | + |
| 115 | + return new_asms |
| 116 | + |
| 117 | +def parse_current_assemblies(group, group_asms_path): |
| 118 | + ''' Parse current genome assemblies |
| 119 | + ''' |
| 120 | + |
| 121 | + asms = [] |
| 122 | + |
| 123 | + with open(group_asms_path) as f: |
| 124 | + asms_file = f.readlines() |
| 125 | + |
| 126 | + # For reference, expected headers are: |
| 127 | + # assembly_accession, bioproject, biosample, wgs_master, refseq_category, |
| 128 | + # taxid, species_taxid, organism_name, infraspecific_name, isolate, |
| 129 | + # version_status, assembly_level, release_type, genome_rep, seq_rel_date, |
| 130 | + # asm_name, submitter, gbrs_paired_asm, paired_asm_comp, ftp_path, |
| 131 | + # excluded_from_refseq, relation_to_type_material |
| 132 | + headers = asms_file[1].replace('# ', '').strip().split('\t') |
| 133 | + |
| 134 | + for line in asms_file[2:]: |
| 135 | + columns = line.strip().split('\t') |
| 136 | + asm = dict(zip(headers, columns)) |
| 137 | + |
| 138 | + categories = ['representative genome', 'reference genome'] |
| 139 | + |
| 140 | + if ( |
| 141 | + asm['assembly_level'] != 'Chromosome' or |
| 142 | + asm['release_type'] != 'Major' or |
| 143 | + asm['refseq_category'] not in categories |
| 144 | + ): |
| 145 | + continue |
| 146 | + |
| 147 | + asm['organism_group'] = group |
| 148 | + |
| 149 | + asms.append(asm) |
| 150 | + |
| 151 | + return asms |
| 152 | + |
| 153 | +def parse_historical_assemblies(group, group_asms_historical_path): |
| 154 | + asms = [] |
| 155 | + |
| 156 | + with open(group_asms_historical_path) as f: |
| 157 | + asms_file = f.readlines() |
| 158 | + |
| 159 | + # See parse_current_assemblies for expected headers |
| 160 | + headers = asms_file[1].replace('# ', '').strip().split('\t') |
| 161 | + |
| 162 | + for line in asms_file[2:]: |
| 163 | + columns = line.strip().split('\t') |
| 164 | + asm = dict(zip(headers, columns)) |
| 165 | + |
| 166 | + if ( |
| 167 | + asm['assembly_level'] != 'Chromosome' or |
| 168 | + asm['release_type'] != 'Major' or |
| 169 | + asm['submitter'] != 'Genome Reference Consortium' |
| 170 | + ): |
| 171 | + continue |
| 172 | + |
| 173 | + asm['organism_group'] = group |
| 174 | + |
| 175 | + asms.append(asm) |
| 176 | + |
| 177 | + return asms |
| 178 | + |
| 179 | +def get_assemblies(): |
| 180 | + '''Fetch metadata on genome assemblies from NCBI, and write it locally |
| 181 | +
|
| 182 | + Background: Genome assemblies are sequenced chromosomes for an organism. |
| 183 | +
|
| 184 | + Here, we get metadata on each assembly, like its name (e.g. GRCh38), |
| 185 | + accession (an identifier like GCA_000001405.15), the organism's scientific |
| 186 | + and common names (Homo sapiens, human) and more. |
| 187 | + ''' |
| 188 | + |
| 189 | + groups = [ |
| 190 | + 'fungi', |
| 191 | + 'invertebrate', |
| 192 | + 'plant', |
| 193 | + 'protozoa', |
| 194 | + 'vertebrate_mammalian', |
| 195 | + 'vertebrate_other' |
| 196 | + ] |
| 197 | + |
| 198 | + asms = [] |
| 199 | + |
| 200 | + for group in groups: |
| 201 | + group_asms = [] |
| 202 | + group_asms_path = f'{group}_{asms_path}' |
| 203 | + group_asms_historical_path = f'{group}_{asms_path_historical}' |
| 204 | + fetch_asm_summary(group, group_asms_path, group_asms_historical_path) |
| 205 | + |
| 206 | + group_asms += parse_current_assemblies(group, group_asms_path) |
| 207 | + group_asms +=\ |
| 208 | + parse_historical_assemblies(group, group_asms_historical_path) |
| 209 | + |
| 210 | + group_asms = sorted(group_asms, key=lambda asm: asm['organism_name']) |
| 211 | + asms += group_asms |
| 212 | + |
| 213 | + asms = add_common_names(asms) |
| 214 | + |
| 215 | + return asms |
| 216 | + |
| 217 | +asms = get_assemblies() |
| 218 | + |
| 219 | +asm_list = [] |
| 220 | + |
| 221 | +output_headers = [ |
| 222 | + 'organism_name', |
| 223 | + 'organism_common_name', |
| 224 | + 'asm_name', |
| 225 | + 'assembly_accession' |
| 226 | +] |
| 227 | +for asm in asms: |
| 228 | + asm_entry = [asm[header] for header in output_headers] |
| 229 | + asm_list.append('\t'.join(asm_entry)) |
| 230 | + |
| 231 | +header = '# ' + '\t'.join(output_headers) |
| 232 | +asm_list.insert(0, header) |
| 233 | + |
| 234 | +content = '\n'.join(asm_list) |
| 235 | + |
| 236 | +output_file = output_dir + 'assemblies.tsv' |
| 237 | + |
| 238 | +open(output_file, 'w').write(content) |
| 239 | + |
| 240 | +print(f'Wrote list of {len(asm_list)} assemblies to {output_file}') |
0 commit comments