diff options
| author | Lan Hui <lanhui@zjnu.edu.cn> | 2025-10-22 17:14:35 +0800 | 
|---|---|---|
| committer | Lan Hui <lanhui@zjnu.edu.cn> | 2025-10-22 17:14:35 +0800 | 
| commit | c1400ae9dbdd0c0000531f53c5414cd2c40a5f9f (patch) | |
| tree | cd635b100e5014489cb43fb2f298dbc25c2a56e2 /Code | |
| parent | 75fb4625afc73439c6e29dedcb8d7a49099f4023 (diff) | |
Handle the case where, given an experiment ID, no corresponding LIBRARY_STRATEGY or LIBRARY_SOURCE can be found.
Diffstat (limited to 'Code')
| -rw-r--r-- | Code/download_ena_experiment_records.py | 17 | ||||
| -rw-r--r-- | Code/parse_ena_xml.py | 11 | 
2 files changed, 28 insertions, 0 deletions
```diff
diff --git a/Code/download_ena_experiment_records.py b/Code/download_ena_experiment_records.py
new file mode 100644
index 0000000..f3c9126
--- /dev/null
+++ b/Code/download_ena_experiment_records.py
@@ -0,0 +1,17 @@
+# Given an experiment's accession SRX6711770, get its full information from the following link
+# https://www.ebi.ac.uk/ena/browser/api/xml/SRX6711770
+
+import urllib.request
+
+def get_xml_for_experiment_id(eid):
+    url = 'https://www.ebi.ac.uk/ena/browser/api/xml/%s' % (eid)
+    content = ''
+    with urllib.request.urlopen(url) as response:
+        content = response.read()
+    return content.decode('utf-8')
+
+
+
+if __name__ == '__main__':
+    print(get_xml_for_experiment_id('SRX6711770'))
+
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index 131670c..f737163 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -430,6 +430,8 @@ if __name__ == '__main__':
     d_run_keys = d_run.keys()
     d_run_keys = list(set(d_run_keys))
 
+    count_no_library_strategy_or_source = 0
+    experiment_ids_lacking_strategy_or_source = []
     for k in sorted(d_run_keys):
         lst = [k]
         sample_id = d_run[k]['sample_id']
@@ -455,8 +457,17 @@ if __name__ == '__main__':
         lst.append(library_strategy)
         lst.append(library_source)
         f.write('%s\n' % ('\t'.join(lst)))
+        if library_strategy  == '.' and library_source == '.':
+            print(f'WARNING: Run {k} Experiment {experiment_id} lacks LIBRARY STRATEGY and LIBRARY SOURCE information.')
+            count_no_library_strategy_or_source += 1
+            experiment_ids_lacking_strategy_or_source.append(experiment_id)
 
     f.close()
+    print('%% %d out of %d runs lack LIBRARY STRATEGY and LIBRARY SOURCE information. That is %4.1f%% percent.' % (count_no_library_strategy_or_source, len(d_run_keys), 100*count_no_library_strategy_or_source/len(d_run_keys)))
+
+    # Write experiment ids that do not have records in ena_read_experiment.xml
+    with open('../Data/information/experiment_ids_lacking_strategy_or_source.txt', 'w') as f:
+        f.write('\n'.join(sorted(list(set(experiment_ids_lacking_strategy_or_source)))))
 
     # Make a json file as well. this file is used to display rna-seq information in scatterplots.
     json_dict = {}
```
