# Purpose: Download missing experiment info one by one into ../Data/information/EXPERIMENT_SET
#
# Usage: python3 download_ena_experiment_records.py
#
#
# Note:
# (1) The experiment accession IDs are stored in ../Data/information/experiment_ids_lacking_strategy_or_source.txt
# (2) Given an experiment's accession SRX6711770, we can get its full information (e.g., whether it is RNA-seq) from the following link:
#     https://www.ebi.ac.uk/ena/browser/api/xml/SRX6711770
# (3) Each run is associated with an experiment. Therefore, we can find whether the run is RNA-seq by first finding its associated
#     experiment accession, then from the experiment accession finding its library strategy and library source.
#     For example, for run SRR23878410, from https://www.ebi.ac.uk/ena/browser/api/xml/SRR23878410 we know that its associated
#     experiment accession is SRX19690458, and from https://www.ebi.ac.uk/ena/browser/api/xml/SRX19690458 we know that its
#     library strategy is RNA-seq and its library source is TRANSCRIPTOMIC.
#
# Last modified by Hui on:
#   2025-10-23
#   2025-10-24

import glob
import os
import sys
import time
import urllib.request

from parse_ena_xml import parse_experiment
from configure import EXPERIMENT_INFO_DIR

# ENA Browser API endpoint; '%s' is replaced by the accession ID.
ENA_XML_URL = 'https://www.ebi.ac.uk/ena/browser/api/xml/%s'

# One experiment accession per line; this file is produced by parse_ena_xml.py.
EXPERIMENT_IDS_FILE = '../Data/information/experiment_ids_lacking_strategy_or_source.txt'

# Give up on a single request after this many seconds instead of hanging forever.
REQUEST_TIMEOUT_SECONDS = 60

# Pause after every request so that the ENA server is not hammered.
DELAY_BETWEEN_REQUESTS_SECONDS = 6


def get_xml_for_experiment_id(eid, timeout=REQUEST_TIMEOUT_SECONDS):
    """Return the ENA XML record for accession *eid* as a string.

    Args:
        eid: Accession ID to look up, e.g. 'SRX6711770'.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails (urllib.error.HTTPError,
            a subclass, is raised for HTTP error statuses). Both are
            subclasses of OSError, as are socket timeouts.
    """
    url = ENA_XML_URL % (eid,)
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode('utf-8')


def main():
    """Download every experiment record listed in EXPERIMENT_IDS_FILE that is not yet on disk.

    Each record is saved in EXPERIMENT_INFO_DIR under a file named after its
    accession ID, so re-running the script resumes where it left off.
    """
    os.makedirs(EXPERIMENT_INFO_DIR, exist_ok=True)

    # A set gives constant-time membership tests for the loop below.
    already_downloaded = {
        os.path.basename(path)
        for path in glob.glob(os.path.join(EXPERIMENT_INFO_DIR, '*'))
    }
    print('%d experiment info has already been downloaded' % (len(already_downloaded)))

    with open(EXPERIMENT_IDS_FILE) as f:
        for line in f:
            experiment_id = line.strip()
            # Skip blank lines (an empty ID would request the bare API URL)
            # and anything that is already on disk.
            if not experiment_id or experiment_id in already_downloaded:
                continue

            print(f'Downloading EXPERIMENT info for {experiment_id}')
            try:
                xml_content = get_xml_for_experiment_id(experiment_id)
            except OSError as err:
                # One failed request should not abort the whole batch; the ID
                # is simply retried the next time the script is run.
                print(f'   Failed to download {experiment_id}: {err}', file=sys.stderr)
                time.sleep(DELAY_BETWEEN_REQUESTS_SECONDS)
                continue

            fname = os.path.join(EXPERIMENT_INFO_DIR, experiment_id)
            with open(fname, 'w', encoding='utf-8') as f2:
                f2.write(xml_content)
            # Remember it so that a duplicate ID in the input is not fetched twice.
            already_downloaded.add(experiment_id)

            d = parse_experiment(fname)
            print(' ' + d[experiment_id]['library_strategy'])
            print(' ' + d[experiment_id]['library_source'])
            time.sleep(DELAY_BETWEEN_REQUESTS_SECONDS)


if __name__ == '__main__':
    main()