-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_assemblies_from_acc_list.py
39 lines (32 loc) · 1.08 KB
/
get_assemblies_from_acc_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pandas as pd
# pandas console display settings: output width (in characters) and the
# maximum number of columns shown when a frame is printed.
desired_width = 320
pd.options.display.width = desired_width
pd.options.display.max_columns = 10
import os
import urllib
import urllib.request

from Bio import Entrez
# Contact e-mail that Bio.Entrez attaches to its requests to NCBI.
# NOTE(review): the value below looks like a scrape-obfuscated placeholder
# rather than a real address — confirm and replace it with a valid e-mail
# before running the script.
Entrez.email = "[email protected]"
# File-name parts used to build the download URL for each assembly:
#   <ftp path>/<assembly label><prefix>.<type>.gz
# NOTE: `type` shadows the builtin of the same name; the name is kept because
# the download loop further down refers to it.
type = 'gff'
prefix = '_genomic'

# Accessions whose assemblies are downloaded. Entries must not carry
# surrounding whitespace, since each one is passed verbatim as a search term.
acc_list = ['NZ_LS483319.1', 'NZ_CP043302.1', 'NZ_CP026961.1']
def get_ids(term):
    """Search the NCBI ``assembly`` database for *term*.

    Args:
        term: Search term passed to ``Entrez.esearch`` (the script passes a
            sequence accession).

    Returns:
        A one-element list whose single item is the ``IdList`` entry of the
        parsed search result, i.e. all hits of the search grouped together.
    """
    handle = Entrez.esearch(db="assembly", term=term)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the network handle even if parsing fails.
        handle.close()
    # The hits are deliberately kept as one nested group (not flattened) so
    # that callers iterating over the result receive the whole IdList at once.
    return [record["IdList"]]
# Fetch raw output
def get_raw_assembly_summary(id):
    """Fetch the document summary for an entry of the ``assembly`` database.

    Args:
        id: Identifier (or group of identifiers) forwarded unchanged to
            ``Entrez.esummary``.

    Returns:
        The first ``DocumentSummary`` item of the parsed esummary result.
    """
    handle = Entrez.esummary(db="assembly", id=id, report="full")
    try:
        record = Entrez.read(handle)
    finally:
        # Release the network handle even if parsing fails.
        handle.close()
    return record['DocumentSummarySet']['DocumentSummary'][0]
# Download the annotation file of every accession in acc_list into the
# current working directory.
for acc in acc_list:
    # Stray whitespace around an accession would otherwise end up in the
    # search term.
    accession = acc.strip()
    # get_ids returns the hits of one search as a single group, so this inner
    # loop normally runs once per accession.
    for assembly_ids in get_ids(accession):
        if not assembly_ids:
            print(f'Skip {accession}: no assembly found')
            continue
        summary = get_raw_assembly_summary(assembly_ids)
        # A trailing slash would make basename() return an empty label.
        url = summary['FtpPath_RefSeq'].rstrip('/')
        if not url:
            # Without a path the link below would be malformed and the
            # download would fail with an unhelpful URL error.
            print(f'Skip {accession}: assembly summary has no RefSeq FTP path')
            continue
        # The last path component is the assembly label and also the stem of
        # the files stored in that directory.
        label = os.path.basename(url)
        link = f'{url}/{label}{prefix}.{type}.gz'
        file_path = f'{label}.{type}.gz'
        print(f'Download {label}')
        urllib.request.urlretrieve(link, file_path)