# Code/download_ena_experiment_records.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47

# Purpose: Download missing experiment info one by one into ../Data/information/EXPERIMENT_SET
#
# Usage: python3 download_ena_experiment_records.py
#
#
# Note:
# (1) The experiment accession IDs are stored in ../Data/information/experiment_ids_lacking_strategy_or_source.txt
# (2) Given an experiment's accession SRX6711770, we can get its full information (e.g., whether it is RNA-seq) from the following link https://www.ebi.ac.uk/ena/browser/api/xml/SRX6711770
# (3) Each run is associated with an experiment. Therefore, to determine whether a run is RNA-seq, we first look up its associated experiment accession, and then
#     read the library strategy and library source from that experiment's record.
#     For example, for run SRR23878410, from https://www.ebi.ac.uk/ena/browser/api/xml/SRR23878410 we know that its associated experiment accession is SRX19690458, and
#     from https://www.ebi.ac.uk/ena/browser/api/xml/SRX19690458 we know that its library strategy is RNA-seq and its library source is TRANSCRIPTOMIC.
#
# Last modified by Hui on:
#   2025-10-23
#   2025-10-24

import urllib.request
import os, time, glob
from parse_ena_xml import parse_experiment
from configure import EXPERIMENT_INFO_DIR

def get_xml_for_experiment_id(eid, timeout=60):
    """Fetch the ENA browser XML record for one accession and return it as text.

    Args:
        eid: Accession ID to look up (e.g. 'SRX6711770').
        timeout: Seconds to wait on the network before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails (HTTPError, a subclass,
            is raised for non-success HTTP statuses).
        TimeoutError: If reading the response exceeds `timeout`.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    from urllib.parse import quote

    # Percent-encode the accession so stray characters (spaces, '/', '?')
    # cannot alter the request path; plain alphanumeric IDs are unchanged.
    url = 'https://www.ebi.ac.uk/ena/browser/api/xml/%s' % quote(str(eid), safe='')
    with urllib.request.urlopen(url, timeout=timeout) as response:
        content = response.read()
    return content.decode('utf-8')

if __name__ == '__main__':
    # Create the output directory (including missing parents); exist_ok avoids
    # the check-then-create race of os.path.exists() + os.mkdir().
    os.makedirs(EXPERIMENT_INFO_DIR, exist_ok=True)

    # Each downloaded record is stored in a file named after its experiment ID,
    # so the file names in the directory are the IDs already fetched. A set gives
    # O(1) membership tests inside the loop below.
    already_downloaded = {
        os.path.basename(path)
        for path in glob.glob(os.path.join(EXPERIMENT_INFO_DIR, '*'))
    }
    print('%d experiment info has already been downloaded' % (len(already_downloaded)))

    # The file experiment_ids_lacking_strategy_or_source.txt is produced by
    # parse_ena_xml.py and holds one experiment accession ID per line.
    with open('../Data/information/experiment_ids_lacking_strategy_or_source.txt') as f:
        for line in f:
            experiment_id = line.strip()
            # Skip blank lines (an empty ID would request the bare API URL) and
            # IDs that are already on disk or were fetched earlier in this run.
            if not experiment_id or experiment_id in already_downloaded:
                continue

            print(f'Downloading EXPERIMENT info for {experiment_id}')
            xml_content = get_xml_for_experiment_id(experiment_id)
            fname = os.path.join(EXPERIMENT_INFO_DIR, experiment_id)
            with open(fname, 'w', encoding='utf-8') as f2:
                f2.write(xml_content)
            # Record the ID so a duplicate line in the input is not re-downloaded.
            already_downloaded.add(experiment_id)

            # parse_experiment returns a mapping indexed by experiment ID; echo
            # the two fields this script exists to recover.
            d = parse_experiment(fname)
            print('  ' +  d[experiment_id]['library_strategy'])
            print('  ' + d[experiment_id]['library_source'])

            # Pause between requests (presumably to avoid hammering the ENA server).
            time.sleep(6)