summaryrefslogtreecommitdiff
path: root/Code/parse_ena_xml.py
diff options
context:
space:
mode:
authorHui Lan <lanhui@zjnu.edu.cn>2025-04-14 18:51:17 +0800
committerHui Lan <lanhui@zjnu.edu.cn>2025-04-14 18:51:17 +0800
commitb0a477d6f8a888832ae339593cf4bd0ef05a23df (patch)
tree2b0c5954dfa00c4372b761435e5821b3df290d47 /Code/parse_ena_xml.py
parent7d161d428463ac865459c251a820d85085a2c5fb (diff)
Further improve XML processing logic
Diffstat (limited to 'Code/parse_ena_xml.py')
-rw-r--r--Code/parse_ena_xml.py84
1 files changed, 58 insertions, 26 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index 1614d7d..1bc2862 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -150,6 +150,11 @@ def parse_experiment(fname):
if desc != None and desc.text != None:
d2['description'] = desc.text
+ sample = c.find('./DESIGN/SAMPLE_DESCRIPTOR/IDENTIFIERS/EXTERNAL_ID/')
+ d2['sample_id'] = 'None'
+ if sample != None and sample.text != None:
+ d2['sample_id'] = sample.text
+
strategy = c.find('./DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_STRATEGY')
d2['library_strategy'] = 'None' # we look for RNA-Seq
if strategy != None and strategy.text != None:
@@ -198,6 +203,13 @@ def get_tissue(s):
return result.rstrip(';')
+def get_tissue2(sample_id, d):
+ tissue = '.'
+ if sample_id in d:
+ tissue = d[sample_id]['tissue']
+ return tissue
+
+
## main
if __name__ == '__main__':
@@ -221,6 +233,52 @@ if __name__ == '__main__':
d_run_keys = d_run.keys()
d_run_keys = list(set(d_run_keys))
+ for k in sorted(d_run_keys):
+ lst = [k]
+ sample_id = d_run[k]['sample_id']
+ experiment_id = d_run[k]['experiment_id']
+ study_id = d_run[k]['study_id']
+ study_id_PRJ = '.'
+ title = d_run[k]['title']
+ alias = d_run[k]['alias']
+ description = '.'
+ library_strategy = '.'
+ library_source = '.'
+ if experiment_id in d_experiment:
+ description = d_experiment[experiment_id]['description']
+ library_strategy = d_experiment[experiment_id]['library_strategy']
+ library_source = d_experiment[experiment_id]['library_source']
+ lst.append(sample_id)
+ lst.append(experiment_id)
+ lst.append(study_id)
+ lst.append(study_id_PRJ)
+ lst.append(title)
+ lst.append(alias)
+ lst.append(description)
+ lst.append(library_strategy)
+ lst.append(library_source)
+ print('%s' % ('\t'.join(lst)))
+
+ # Make a json file as well. this file is used to display rna-seq information in scatterplots.
+ json_dict = {}
+ for k in sorted(d_run_keys):
+ d = {}
+ k2 = d_run[k]['experiment_id']
+ d['tissue'] = d['library_strategy'] = d['library_source'] = '.'
+ if k2 in d_experiment:
+ d['tissue'] = get_tissue2(d_experiment[k2]['sample_id'], d_sample)
+ d['library_strategy'] = d_experiment[k2]['library_strategy']
+ d['library_source'] = d_experiment[k2]['library_source']
+ d['detail'] = 'TBA'
+ json_dict[k] = d
+ print(d)
+
+ fname = '../Data/information/rnaseq_info_database.json.temp'
+ with open(fname, 'w') as f:
+ json.dump(json_dict, f, indent=4)
+
+
+ sys.exit()
# Collect information for each run ID
for k in sorted(d_run_keys):
lst = []
@@ -265,29 +323,3 @@ if __name__ == '__main__':
lst.append(d_experiment[k]['library_source'])
print('%s' % ('\t'.join(lst)))
-
-
- # Make a json file as well. this file is used to display rna-seq information in scatterplots.
- json_dict = {}
- for k in sorted(d_run_keys):
- if k in d_run:
- s = 'Title: ' + d_run[k]['title'] + '. Alias: ' + d_run[k]['alias'] + '. More info:'
- if k in d_study:
- s += ' ' + d_study[k]['title'] + ' ' + d_study[k]['description']
- if k in d_sample:
- s += ' ' + d_sample[k]['title'] + ' ' + d_sample[k]['description']
- if k in d_experiment:
- s += ' ' + d_experiment[k]['title'] + ' ' + d_experiment[k]['description']
-
- s = s.strip()
- d = {}
- d['tissue'] = get_tissue(s)
- d['library_strategy'] = d_experiment[k]['library_strategy']
- d['library_source'] = d_experiment[k]['library_source']
- d['detail'] = s[0:min(MAX_DESCRIPTION_LENGTH, len(s))] + ' ...'
-
- json_dict[k] = d
-
- fname = '../Data/information/rnaseq_info_database.json.temp'
- with open(fname, 'w') as f:
- json.dump(json_dict, f, indent=4)