Handle the case where no corresponding LIBRARY_STRATEGY or LIBRARY_SOURCE can be found for a given experiment ID.

author: Lan Hui <lanhui@zjnu.edu.cn> 2025-10-22 17:14:35 +0800
committer: Lan Hui <lanhui@zjnu.edu.cn> 2025-10-22 17:14:35 +0800
commit: c1400ae9dbdd0c0000531f53c5414cd2c40a5f9f (patch)
tree: cd635b100e5014489cb43fb2f298dbc25c2a56e2 /Code
parent: 75fb4625afc73439c6e29dedcb8d7a49099f4023 (diff)
2 files changed, 28 insertions, 0 deletions
diff --git a/Code/download_ena_experiment_records.py b/Code/download_ena_experiment_records.py
new file mode 100644
index 0000000..f3c9126
--- /dev/null
+++ b/Code/download_ena_experiment_records.py
@@ -0,0 +1,17 @@
+# Given an experiment accession (e.g., SRX6711770), get its full information from a link of the following form
+# https://www.ebi.ac.uk/ena/browser/api/xml/SRX6711770
+
+import urllib.request
+
+def get_xml_for_experiment_id(eid):
+    """Return the XML text served by the ENA browser API for experiment accession ``eid``."""
+    record_url = 'https://www.ebi.ac.uk/ena/browser/api/xml/%s' % (eid)
+    with urllib.request.urlopen(record_url) as reply:
+        raw_bytes = reply.read()
+    return raw_bytes.decode('utf-8')
+
+
+# Script entry point: print the XML record of the example accession SRX6711770.
+if __name__ == '__main__':
+    print(get_xml_for_experiment_id('SRX6711770'))
+
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index 131670c..f737163 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -430,6 +430,8 @@ if __name__ == '__main__':
     d_run_keys = d_run.keys()
     d_run_keys = list(set(d_run_keys))
     
+    count_no_library_strategy_or_source = 0
+    experiment_ids_lacking_strategy_or_source = []
     for k in sorted(d_run_keys):
         lst = [k]
         sample_id = d_run[k]['sample_id']
@@ -455,8 +457,17 @@ if __name__ == '__main__':
         lst.append(library_strategy)
         lst.append(library_source)
         f.write('%s\n' % ('\t'.join(lst)))
+        if library_strategy  == '.' and library_source == '.':
+            print(f'WARNING: Run {k} Experiment {experiment_id} lacks LIBRARY STRATEGY and LIBRARY SOURCE information.')
+            count_no_library_strategy_or_source += 1
+            experiment_ids_lacking_strategy_or_source.append(experiment_id)
 
     f.close()
+    print('%% %d out of %d runs lack LIBRARY STRATEGY and LIBRARY SOURCE information. That is %4.1f%% percent.' % (count_no_library_strategy_or_source, len(d_run_keys), 100*count_no_library_strategy_or_source/len(d_run_keys)))
+
+    # Write experiment ids that do not have records in ena_read_experiment.xml
+    with open('../Data/information/experiment_ids_lacking_strategy_or_source.txt', 'w') as f:
+        f.write('\n'.join(sorted(list(set(experiment_ids_lacking_strategy_or_source)))))
 
     # Make a json file as well. this file is used to display rna-seq information in scatterplots.
     json_dict = {}
author	Lan Hui <lanhui@zjnu.edu.cn>	2025-10-22 17:14:35 +0800
committer	Lan Hui <lanhui@zjnu.edu.cn>	2025-10-22 17:14:35 +0800
commit	c1400ae9dbdd0c0000531f53c5414cd2c40a5f9f (patch)
tree	cd635b100e5014489cb43fb2f298dbc25c2a56e2 /Code
parent	75fb4625afc73439c6e29dedcb8d7a49099f4023 (diff)