# Purpose: Download missing experiment info one by one into ../Data/information/EXPERIMENT_SET
#
# Usage: python3 download_ena_experiment_records.py
#
#
# Note:
# (1) The experiment accession IDs are stored in ../Data/information/experiment_ids_lacking_strategy_or_source.txt
# (2) Given an experiment's accession SRX6711770, we can get its full information from the following link
#     https://www.ebi.ac.uk/ena/browser/api/xml/SRX6711770
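#     For a quick manual check, the same record can be fetched from the shell (assuming curl is available):
#         curl https://www.ebi.ac.uk/ena/browser/api/xml/SRX6711770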
#
# 2025-10-23
import glob
import os
import time
import urllib.request


def get_xml_for_experiment_id(eid):
    """Fetch the full ENA XML record for an experiment accession (e.g. SRX6711770)."""
    url = f'https://www.ebi.ac.uk/ena/browser/api/xml/{eid}'
    with urllib.request.urlopen(url) as response:
        content = response.read()
    return content.decode('utf-8')
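
# Minimal usage sketch, assuming network access to the ENA browser API
# (SRX6711770 is the example accession from the note at the top of this file):
#     xml_text = get_xml_for_experiment_id('SRX6711770')
#     print(xml_text[:200])
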
if __name__ == '__main__':
    experiment_info_dir = '../Data/information/EXPERIMENT_SET'
    os.makedirs(experiment_info_dir, exist_ok=True)
    # Names of records already on disk, kept in a set for fast membership checks
    already_downloaded = {os.path.split(path)[1] for path in glob.glob(os.path.join(experiment_info_dir, '*'))}
    print(f'{len(already_downloaded)} experiment records have already been downloaded')
    # experiment_ids_lacking_strategy_or_source.txt is produced by parse_ena_records.py
    with open('../Data/information/experiment_ids_lacking_strategy_or_source.txt') as f:
        for line in f:
            experiment_id = line.strip()
            if experiment_id and experiment_id not in already_downloaded:
                print(f'Downloading EXPERIMENT info for {experiment_id}')
                xml_content = get_xml_for_experiment_id(experiment_id)
                fname = os.path.join(experiment_info_dir, experiment_id)
                with open(fname, 'w', encoding='utf-8') as f2:
                    f2.write(xml_content)
                time.sleep(6)  # pause between requests to avoid hammering the ENA server