From c1400ae9dbdd0c0000531f53c5414cd2c40a5f9f Mon Sep 17 00:00:00 2001 From: Lan Hui Date: Wed, 22 Oct 2025 17:14:35 +0800 Subject: Handle the case where given an experiment ID, no corresponding LIBRARY_STRATEGY or LIBRARY_SOURCE could be found. --- Code/download_ena_experiment_records.py | 17 +++++++++++++++++ Code/parse_ena_xml.py | 11 +++++++++++ 2 files changed, 28 insertions(+) create mode 100644 Code/download_ena_experiment_records.py diff --git a/Code/download_ena_experiment_records.py b/Code/download_ena_experiment_records.py new file mode 100644 index 0000000..f3c9126 --- /dev/null +++ b/Code/download_ena_experiment_records.py @@ -0,0 +1,17 @@ +# Given an experiment's accession SRX6711770, get its full information from the following link +# https://www.ebi.ac.uk/ena/browser/api/xml/SRX6711770 + +import urllib.request + +def get_xml_for_experiment_id(eid): + url = 'https://www.ebi.ac.uk/ena/browser/api/xml/%s' % (eid) + content = '' + with urllib.request.urlopen(url) as response: + content = response.read() + return content.decode('utf-8') + + + +if __name__ == '__main__': + print(get_xml_for_experiment_id('SRX6711770')) + diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py index 131670c..f737163 100644 --- a/Code/parse_ena_xml.py +++ b/Code/parse_ena_xml.py @@ -430,6 +430,8 @@ if __name__ == '__main__': d_run_keys = d_run.keys() d_run_keys = list(set(d_run_keys)) + count_no_library_strategy_or_source = 0 + experiment_ids_lacking_strategy_or_source = [] for k in sorted(d_run_keys): lst = [k] sample_id = d_run[k]['sample_id'] @@ -455,8 +457,17 @@ if __name__ == '__main__': lst.append(library_strategy) lst.append(library_source) f.write('%s\n' % ('\t'.join(lst))) + if library_strategy == '.' and library_source == '.': + print(f'WARNING: Run {k} Experiment {experiment_id} lacks LIBRARY STRATEGY and LIBRARY SOURCE information.') + count_no_library_strategy_or_source += 1 + experiment_ids_lacking_strategy_or_source.append(experiment_id) f.close() + print('%% %d out of %d runs lack LIBRARY STRATEGY and LIBRARY SOURCE information. That is %4.1f%% percent.' % (count_no_library_strategy_or_source, len(d_run_keys), 100*count_no_library_strategy_or_source/len(d_run_keys))) + + # Write experiment ids that do not have records in ena_read_experiment.xml + with open('../Data/information/experiment_ids_lacking_strategy_or_source.txt', 'w') as f: + f.write('\n'.join(sorted(list(set(experiment_ids_lacking_strategy_or_source))))) # Make a json file as well. this file is used to display rna-seq information in scatterplots. json_dict = {} -- cgit v1.2.1