blob: c897e9c3f624fbee742ca78c49abedf1fd7e0d3a (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
|
# Purpose: Download missing experiment info one by one into ../Data/information/EXPERIMENT_SET
#
# Usage: python3 download_ena_experiment_records.py
#
#
# Note:
# (1) The experiment accession IDs are stored in ../Data/information/experiment_ids_lacking_strategy_or_source.txt
# (2) Given an experiment's accession SRX6711770, we can get its full information (e.g., whether it is RNA-seq) from the following link https://www.ebi.ac.uk/ena/browser/api/xml/SRX6711770
# (3) Each run is associated with an experiment. Therefore, we can find whether the run is RNA-seq by first finding its associated experiment accession, then from the experiment
# accession finding its library strategy and library source.
# For example, for run SRR23878410, from https://www.ebi.ac.uk/ena/browser/api/xml/SRR23878410 we know that its associated experiment accession is SRX19690458, and
# from https://www.ebi.ac.uk/ena/browser/api/xml/SRX19690458 we know that its library strategy is RNA-seq and its library source is TRANSCRIPTOMIC.
#
# Last modified by Hui on:
# 2025-10-23
# 2025-10-24
import urllib.request
import os, time, glob
from parse_ena_xml import parse_experiment
from configure import EXPERIMENT_INFO_DIR
def get_xml_for_experiment_id(eid, timeout=60):
    """Fetch the ENA browser XML record for one accession and return it as text.

    Args:
        eid: Accession ID to look up (e.g. 'SRX6711770'). It is inserted
            verbatim into the ENA browser API URL.
        timeout: Seconds to wait on blocking network operations before giving
            up. Defaults to 60.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails (HTTPError, a subclass,
            is raised for HTTP error statuses). A timeout may also surface
            as a timeout exception from the socket layer.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    url = 'https://www.ebi.ac.uk/ena/browser/api/xml/%s' % (eid)
    # Without an explicit timeout a stalled connection would block forever,
    # which would hang a long batch download with no error.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode('utf-8')
if __name__ == '__main__':
    # Make sure the output directory exists; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(EXPERIMENT_INFO_DIR, exist_ok=True)

    # Each downloaded record is stored in a file named after its accession,
    # so the file names in the output directory are the IDs already fetched.
    # A set gives O(1) membership tests for the loop below.
    already_downloaded = {
        os.path.basename(path)
        for path in glob.glob(os.path.join(EXPERIMENT_INFO_DIR, '*'))
    }
    print('%d experiment info has already been downloaded' % (len(already_downloaded)))

    with open('../Data/information/experiment_ids_lacking_strategy_or_source.txt') as f:  # the file experiment_ids_lacking_strategy_or_source.txt is produced by parse_ena_xml.py
        for line in f:
            experiment_id = line.strip()
            # Skip blank lines (an empty ID would request a malformed URL)
            # and IDs that were fetched in an earlier run or earlier in this one.
            if not experiment_id or experiment_id in already_downloaded:
                continue

            print(f'Downloading EXPERIMENT info for {experiment_id}')
            xml_content = get_xml_for_experiment_id(experiment_id)
            fname = os.path.join(EXPERIMENT_INFO_DIR, experiment_id)
            with open(fname, 'w', encoding='utf-8') as f2:
                f2.write(xml_content)
            # Remember the ID so a duplicate entry in the input file is not
            # downloaded a second time in the same run.
            already_downloaded.add(experiment_id)

            # Parse the saved record and report the two fields of interest.
            # NOTE(review): assumes parse_experiment returns a dict keyed by
            # accession with 'library_strategy' and 'library_source' string
            # entries -- confirm against parse_ena_xml.py.
            d = parse_experiment(fname)
            print(' ' + d[experiment_id]['library_strategy'])
            print(' ' + d[experiment_id]['library_source'])

            # Pause between requests; presumably to avoid overloading the
            # ENA server. Only applies after an actual download.
            time.sleep(6)
|