Skip to content

Commit 1fd0244

Browse files
authored
Merge pull request #206 from eweitz/assemblies-list
List assemblies
2 parents 77c4e32 + 1859852 commit 1fd0244

1 file changed

Lines changed: 240 additions & 0 deletions

File tree

scripts/python/list_assemblies.py

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
'''Write TSV file of high-quality genome assemblies by organism
2+
3+
Output file has one row per assembly, with columns for:
4+
5+
* Organism scientific name (e.g. Homo sapiens, Macaca fascicularis)
6+
* Organism common name (e.g. human, crab-eating macaque)
7+
* Assembly name (e.g. GRCh38, Macaca_fascicularis_5.0)
8+
* Assembly accession (e.g. GCA_000001405.28, GCA_000364345.1)
9+
10+
EXAMPLES
11+
12+
python3 list_assemblies.py
13+
'''
14+
15+
import argparse
16+
import json
17+
import math
18+
import os
19+
import urllib.request as request
20+
21+
parser = argparse.ArgumentParser(
22+
description=__doc__,
23+
formatter_class=argparse.RawDescriptionHelpFormatter)
24+
parser.add_argument('--output-dir',
25+
help='Directory to send output data to',
26+
default='')
27+
28+
args = parser.parse_args()
29+
output_dir = args.output_dir
30+
31+
if len(output_dir) > 0 and output_dir[0] != '':
32+
output_dir = '/' + output_dir
33+
34+
asms_path = 'assembly_summary.txt'
35+
asms_path_historical = 'assembly_summary_historical.txt'
36+
37+
eutils_base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
38+
esummary_base = eutils_base + '/esummary.fcgi'
39+
40+
def fetch_asm_summary(group, group_asms_path, group_asms_historical_path):
41+
'''Retrieve NCBI assembly summary TSV file by organism group
42+
43+
Fetched files are written locally as a cache
44+
'''
45+
46+
path_versions = {
47+
'current': group_asms_path,
48+
'historical': group_asms_historical_path,
49+
}
50+
for version in path_versions:
51+
path = path_versions[version]
52+
if version is 'current':
53+
leaf = f'{group}/{asms_path}'
54+
else:
55+
leaf = f'{group}/{asms_path_historical}'
56+
57+
if os.path.exists(path) == False:
58+
# Example URL:
59+
# https://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/assembly_summary.txt
60+
url = f'https://ftp.ncbi.nlm.nih.gov/genomes/genbank/{leaf}'
61+
with request.urlopen(url) as response:
62+
data = response.read().decode('utf-8')
63+
with open(path, 'w') as f:
64+
f.write(data)
65+
66+
def get_taxid_chunks(taxids):
67+
'''Return taxids in comma-delimited lists of 500
68+
Needed because NCBI EUtils limits parameters to 500 values each.
69+
'''
70+
taxid_chunks = []
71+
72+
eutils_limit = 500
73+
74+
num_requests = math.ceil(len(taxids) / eutils_limit)
75+
for num in range(0, num_requests):
76+
pos = num * eutils_limit
77+
upper = (num + 1) * eutils_limit
78+
if upper < len(taxids):
79+
size = upper
80+
else:
81+
size = pos + upper - len(taxids)
82+
taxids_segment = taxids[pos:size]
83+
taxids_chunk = ','.join(taxids_segment)
84+
taxid_chunks.append(taxids_chunk)
85+
86+
return taxid_chunks
87+
88+
def add_common_names(asms):
89+
'''Add organism common names (e.g. human) to genome assembly objects
90+
'''
91+
# NCBI Taxonomy identifiers, an organism ID. Human: 9606, etc.
92+
taxids = [asm['taxid'] for asm in asms]
93+
taxids = list(set(taxids)) # deduplicate
94+
95+
names_by_taxid = {}
96+
97+
taxid_chunks = get_taxid_chunks(taxids)
98+
99+
for taxid_chunk in taxid_chunks:
100+
eutils_url = f'{esummary_base}?db=taxonomy&format=json&id={taxid_chunk}'
101+
with request.urlopen(eutils_url) as response:
102+
data = json.loads(response.read().decode('utf-8'))
103+
taxid_json = data['result']
104+
del taxid_json['uids'] # Remove placeholder from relevant data
105+
106+
for taxid in taxid_json:
107+
names_by_taxid[taxid] = taxid_json[taxid]['commonname']
108+
109+
new_asms = []
110+
for asm in asms:
111+
new_asm = asm
112+
new_asm['organism_common_name'] = names_by_taxid[asm['taxid']]
113+
new_asms.append(new_asm)
114+
115+
return new_asms
116+
117+
def parse_current_assemblies(group, group_asms_path):
118+
''' Parse current genome assemblies
119+
'''
120+
121+
asms = []
122+
123+
with open(group_asms_path) as f:
124+
asms_file = f.readlines()
125+
126+
# For reference, expected headers are:
127+
# assembly_accession, bioproject, biosample, wgs_master, refseq_category,
128+
# taxid, species_taxid, organism_name, infraspecific_name, isolate,
129+
# version_status, assembly_level, release_type, genome_rep, seq_rel_date,
130+
# asm_name, submitter, gbrs_paired_asm, paired_asm_comp, ftp_path,
131+
# excluded_from_refseq, relation_to_type_material
132+
headers = asms_file[1].replace('# ', '').strip().split('\t')
133+
134+
for line in asms_file[2:]:
135+
columns = line.strip().split('\t')
136+
asm = dict(zip(headers, columns))
137+
138+
categories = ['representative genome', 'reference genome']
139+
140+
if (
141+
asm['assembly_level'] != 'Chromosome' or
142+
asm['release_type'] != 'Major' or
143+
asm['refseq_category'] not in categories
144+
):
145+
continue
146+
147+
asm['organism_group'] = group
148+
149+
asms.append(asm)
150+
151+
return asms
152+
153+
def parse_historical_assemblies(group, group_asms_historical_path):
154+
asms = []
155+
156+
with open(group_asms_historical_path) as f:
157+
asms_file = f.readlines()
158+
159+
# See parse_current_assemblies for expected headers
160+
headers = asms_file[1].replace('# ', '').strip().split('\t')
161+
162+
for line in asms_file[2:]:
163+
columns = line.strip().split('\t')
164+
asm = dict(zip(headers, columns))
165+
166+
if (
167+
asm['assembly_level'] != 'Chromosome' or
168+
asm['release_type'] != 'Major' or
169+
asm['submitter'] != 'Genome Reference Consortium'
170+
):
171+
continue
172+
173+
asm['organism_group'] = group
174+
175+
asms.append(asm)
176+
177+
return asms
178+
179+
def get_assemblies():
180+
'''Fetch metadata on genome assemblies from NCBI, and write it locally
181+
182+
Background: Genome assemblies are sequenced chromosomes for an organism.
183+
184+
Here, we get metadata on each assembly, like its name (e.g. GRCh38),
185+
accession (an identifier like GCA_000001405.15), the organism's scientific
186+
and common names (Homo sapiens, human) and more.
187+
'''
188+
189+
groups = [
190+
'fungi',
191+
'invertebrate',
192+
'plant',
193+
'protozoa',
194+
'vertebrate_mammalian',
195+
'vertebrate_other'
196+
]
197+
198+
asms = []
199+
200+
for group in groups:
201+
group_asms = []
202+
group_asms_path = f'{group}_{asms_path}'
203+
group_asms_historical_path = f'{group}_{asms_path_historical}'
204+
fetch_asm_summary(group, group_asms_path, group_asms_historical_path)
205+
206+
group_asms += parse_current_assemblies(group, group_asms_path)
207+
group_asms +=\
208+
parse_historical_assemblies(group, group_asms_historical_path)
209+
210+
group_asms = sorted(group_asms, key=lambda asm: asm['organism_name'])
211+
asms += group_asms
212+
213+
asms = add_common_names(asms)
214+
215+
return asms
216+
217+
asms = get_assemblies()
218+
219+
asm_list = []
220+
221+
output_headers = [
222+
'organism_name',
223+
'organism_common_name',
224+
'asm_name',
225+
'assembly_accession'
226+
]
227+
for asm in asms:
228+
asm_entry = [asm[header] for header in output_headers]
229+
asm_list.append('\t'.join(asm_entry))
230+
231+
header = '# ' + '\t'.join(output_headers)
232+
asm_list.insert(0, header)
233+
234+
content = '\n'.join(asm_list)
235+
236+
output_file = output_dir + 'assemblies.tsv'
237+
238+
open(output_file, 'w').write(content)
239+
240+
print(f'Wrote list of {len(asm_list)} assemblies to {output_file}')

0 commit comments

Comments
 (0)