Further improve XML processing logic

author: Hui Lan <lanhui@zjnu.edu.cn> 2025-04-14 18:51:17 +0800
committer: Hui Lan <lanhui@zjnu.edu.cn> 2025-04-14 18:51:17 +0800
commit: b0a477d6f8a888832ae339593cf4bd0ef05a23df (patch)
tree: 2b0c5954dfa00c4372b761435e5821b3df290d47 /Code/parse_ena_xml.py
parent: 7d161d428463ac865459c251a820d85085a2c5fb (diff)
1 files changed, 58 insertions, 26 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index 1614d7d..1bc2862 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -150,6 +150,11 @@ def parse_experiment(fname):
         if desc != None and desc.text != None:
             d2['description'] = desc.text
 
+        sample = c.find('./DESIGN/SAMPLE_DESCRIPTOR/IDENTIFIERS/EXTERNAL_ID/')
+        d2['sample_id'] = 'None'
+        if sample != None and sample.text != None:
+            d2['sample_id'] = sample.text
+
         strategy = c.find('./DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_STRATEGY')
         d2['library_strategy'] = 'None'  # we look for RNA-Seq
         if strategy != None and strategy.text != None:
@@ -198,6 +203,13 @@ def get_tissue(s):
     return result.rstrip(';')
 
 
+def get_tissue2(sample_id, d):
+    tissue = '.'
+    if sample_id in d:
+        tissue = d[sample_id]['tissue']
+    return tissue
+
+
 ## main
 if __name__ == '__main__':
     
@@ -221,6 +233,52 @@ if __name__ == '__main__':
     d_run_keys = d_run.keys()
     d_run_keys = list(set(d_run_keys))
     
+    for k in sorted(d_run_keys):
+        lst = [k]
+        sample_id = d_run[k]['sample_id']
+        experiment_id = d_run[k]['experiment_id']
+        study_id = d_run[k]['study_id']
+        study_id_PRJ = '.'
+        title = d_run[k]['title']
+        alias = d_run[k]['alias']
+        description = '.'
+        library_strategy = '.'
+        library_source = '.'
+        if experiment_id in d_experiment:
+            description = d_experiment[experiment_id]['description']
+            library_strategy = d_experiment[experiment_id]['library_strategy']
+            library_source = d_experiment[experiment_id]['library_source']
+        lst.append(sample_id)
+        lst.append(experiment_id)
+        lst.append(study_id)
+        lst.append(study_id_PRJ)
+        lst.append(title)
+        lst.append(alias)
+        lst.append(description)
+        lst.append(library_strategy)
+        lst.append(library_source)
+        print('%s' % ('\t'.join(lst)))
+
+    # Make a json file as well. this file is used to display rna-seq information in scatterplots.
+    json_dict = {}
+    for k in sorted(d_run_keys):
+        d = {}
+        k2 = d_run[k]['experiment_id']
+        d['tissue'] = d['library_strategy'] = d['library_source'] = '.'
+        if k2 in d_experiment:
+            d['tissue'] = get_tissue2(d_experiment[k2]['sample_id'], d_sample)
+            d['library_strategy'] = d_experiment[k2]['library_strategy']
+            d['library_source'] = d_experiment[k2]['library_source']
+        d['detail'] = 'TBA'
+        json_dict[k] = d
+        print(d)
+
+    fname = '../Data/information/rnaseq_info_database.json.temp'
+    with open(fname, 'w') as f:
+        json.dump(json_dict, f, indent=4)
+
+
+    sys.exit()
     # Collect information for each run ID
     for k in sorted(d_run_keys):
         lst = []
@@ -265,29 +323,3 @@ if __name__ == '__main__':
             lst.append(d_experiment[k]['library_source'])            
             
         print('%s' % ('\t'.join(lst)))
-        
-
-    # Make a json file as well. this file is used to display rna-seq information in scatterplots.
-    json_dict = {}
-    for k in sorted(d_run_keys):
-        if k in d_run:
-            s = 'Title: ' + d_run[k]['title'] + '. Alias: ' + d_run[k]['alias'] + '.  More info:'
-            if k in d_study:
-                s += ' ' + d_study[k]['title'] + ' ' + d_study[k]['description']
-            if k in d_sample:
-                s += ' ' + d_sample[k]['title'] + ' ' + d_sample[k]['description']
-            if k in d_experiment:
-                s += ' ' + d_experiment[k]['title'] + ' ' + d_experiment[k]['description']
-    
-            s = s.strip()
-            d = {}
-            d['tissue'] = get_tissue(s)
-            d['library_strategy'] = d_experiment[k]['library_strategy']
-            d['library_source'] = d_experiment[k]['library_source']            
-            d['detail'] = s[0:min(MAX_DESCRIPTION_LENGTH, len(s))] + ' ...'
-            
-        json_dict[k] = d
-    
-    fname = '../Data/information/rnaseq_info_database.json.temp'
-    with open(fname, 'w') as f:
-        json.dump(json_dict, f, indent=4)
author	Hui Lan <lanhui@zjnu.edu.cn>	2025-04-14 18:51:17 +0800
committer	Hui Lan <lanhui@zjnu.edu.cn>	2025-04-14 18:51:17 +0800
commit	b0a477d6f8a888832ae339593cf4bd0ef05a23df (patch)
tree	2b0c5954dfa00c4372b761435e5821b3df290d47 /Code/parse_ena_xml.py
parent	7d161d428463ac865459c251a820d85085a2c5fb (diff)