summaryrefslogtreecommitdiff
path: root/Code
diff options
context:
space:
mode:
author Lan Hui <lanhui@zjnu.edu.cn> 2025-10-22 17:14:35 +0800
committer Lan Hui <lanhui@zjnu.edu.cn> 2025-10-22 17:14:35 +0800
commit c1400ae9dbdd0c0000531f53c5414cd2c40a5f9f (patch)
tree cd635b100e5014489cb43fb2f298dbc25c2a56e2 /Code
parent 75fb4625afc73439c6e29dedcb8d7a49099f4023 (diff)
Handle the case where given an experiment ID, no corresponding LIBRARY_STRATEGY or LIBRARY_SOURCE could be found.
Diffstat (limited to 'Code')
-rw-r--r-- Code/download_ena_experiment_records.py 17
-rw-r--r-- Code/parse_ena_xml.py 11
2 files changed, 28 insertions, 0 deletions
diff --git a/Code/download_ena_experiment_records.py b/Code/download_ena_experiment_records.py
new file mode 100644
index 0000000..f3c9126
--- /dev/null
+++ b/Code/download_ena_experiment_records.py
@@ -0,0 +1,17 @@
+# Given an experiment accession (e.g., SRX6711770), fetch its full XML record from a URL such as
+# https://www.ebi.ac.uk/ena/browser/api/xml/SRX6711770
+
+import urllib.request
+
+def get_xml_for_experiment_id(eid, timeout=60):
+    """Fetch the XML record of one experiment from the ENA browser API.
+
+    Args:
+        eid: Experiment accession, e.g. 'SRX6711770'. It is converted to
+            text and percent-encoded before being placed in the URL.
+        timeout: Seconds to wait on a blocking network operation before
+            giving up. Pass None to wait indefinitely.
+
+    Returns:
+        The response body decoded as UTF-8 text.
+
+    Raises:
+        urllib.error.URLError: If the request fails (HTTPError, a subclass,
+            for a non-success HTTP status).
+        UnicodeDecodeError: If the response body is not valid UTF-8.
+    """
+    from urllib.parse import quote
+
+    # Percent-encode the accession so stray characters (spaces, '/', '?')
+    # cannot alter the request path.
+    url = 'https://www.ebi.ac.uk/ena/browser/api/xml/%s' % quote(str(eid), safe='')
+    # Without a timeout a stalled connection would block the caller forever.
+    with urllib.request.urlopen(url, timeout=timeout) as response:
+        return response.read().decode('utf-8')
+
+
+
+if __name__ == '__main__':
+    # Manual check: fetch the record for the example accession and print it.
+    example_xml = get_xml_for_experiment_id('SRX6711770')
+    print(example_xml)
+
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index 131670c..f737163 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -430,6 +430,8 @@ if __name__ == '__main__':
d_run_keys = d_run.keys()
d_run_keys = list(set(d_run_keys))
+ count_no_library_strategy_or_source = 0
+ experiment_ids_lacking_strategy_or_source = []
for k in sorted(d_run_keys):
lst = [k]
sample_id = d_run[k]['sample_id']
@@ -455,8 +457,17 @@ if __name__ == '__main__':
lst.append(library_strategy)
lst.append(library_source)
f.write('%s\n' % ('\t'.join(lst)))
+ if library_strategy == '.' and library_source == '.':
+ print(f'WARNING: Run {k} Experiment {experiment_id} lacks LIBRARY STRATEGY and LIBRARY SOURCE information.')
+ count_no_library_strategy_or_source += 1
+ experiment_ids_lacking_strategy_or_source.append(experiment_id)
f.close()
+ print('%% %d out of %d runs lack LIBRARY STRATEGY and LIBRARY SOURCE information. That is %4.1f%% percent.' % (count_no_library_strategy_or_source, len(d_run_keys), 100*count_no_library_strategy_or_source/len(d_run_keys)))
+
+ # Write experiment ids that do not have records in ena_read_experiment.xml
+ with open('../Data/information/experiment_ids_lacking_strategy_or_source.txt', 'w') as f:
+ f.write('\n'.join(sorted(list(set(experiment_ids_lacking_strategy_or_source)))))
# Make a json file as well. this file is used to display rna-seq information in scatterplots.
json_dict = {}