Retrieve more tissue information from ena_study.xml

author: Lan Hui <lanhui@zjnu.edu.cn> 2025-04-16 15:26:22 +0800
committer: Lan Hui <lanhui@zjnu.edu.cn> 2025-04-16 15:26:22 +0800
commit: ada33aee714635628d9bdab3cba9f3ec3f2f92a4 (patch)
tree: bb42c6ca75050a77f36ef61d927b8881415406d2 /Code/parse_ena_xml.py
parent: fb2b2e547139739e183a797d4f092974ed82ae00 (diff)
1 files changed, 18 insertions, 3 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index 0055ec5..7122a73 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -144,6 +144,11 @@ def parse_experiment(fname):
         primary_id = c.get('accession')
 
         d2 = {}
+
+        study = c.find('./STUDY_REF/IDENTIFIERS/SECONDARY_ID')
+        d2['study_id'] = 'None'
+        if study != None and study.text != None:
+            d2['study_id'] = study.text
         
         title = c.find('TITLE')
         d2['title'] = 'None'
@@ -194,7 +199,7 @@ def get_singular_form(w):
     return w
 
 
-def get_tissue(sample_id, d_sample, experiment_id, d_experiment):
+def get_tissue(run_id, d_run, sample_id, d_sample, experiment_id, d_experiment, study_id, d_study):
     ''' Extract tissue name from s.  s may contain several tissue names, return them ordered by frequency.  '''
 
     tissue = ''
@@ -209,9 +214,17 @@ def get_tissue(sample_id, d_sample, experiment_id, d_experiment):
         s += ' ' + d_sample[sample_id]['description']
 
     if experiment_id in d_experiment:
+        s += ' ' + d_experiment[experiment_id]['title']
         s += ' ' + d_experiment[experiment_id]['protocol']
         s += ' ' + d_experiment[experiment_id]['attribute']
 
+    if run_id in d_run:
+        s += ' ' + d_run[run_id]['title']
+
+    if study_id in d_study:
+        s += ' ' + d_study[study_id]['title']
+        s += ' ' + d_study[study_id]['description']
+
     lst = ['seedling', 'seedlings', 'root', 'roots', 'leaves', 'leaf', 'flower', 'flowers', 'floral', 'shoot', 'shoots', 'apex', 'apices', 'stamen', 'stem', 'stems', 'seed', 'seeds', 'petal', 'petals', 'sepal', 'sepals', 'embryo', 'embryos', 'embryonic', 'cotyledon', 'cotyledons', 'xylem', 'hair', 'hairs', 'phloem', 'pericycle', 'primordia', 'columella', 'cortex', 'meristem', 'meristems', 'cambium', 'epidermis', 'epidermal', 'phloem', 'mesophyll', 'apical', 'lateral', 'intercalary', 'parenchyma', 'collenchyma', 'sclerenchyma', 'bud', 'buds', 'endosperm', 'colletotrichum', 'stele', 'vacuoles', 'vacuole', 'vacuolar', 'tip', 'tips', 'pollen', 'hypocotyl', 'hypocotyls', 'tube', 'tubes', 'basal', 'stomatal', 'stomata', 'surface', 'progeny', 'ovules', 'carpel', 'carpels', 'gynoecium', 'pistil', 'pistils', 'anthers', 'anther', 'endodermis', 'dicotyledonous', 'hyphae', 'adabaxial', 'axial', 'cauline', 'rosette', 'pedicle', 'pedicel', 'inflorescence', 'petiole', 'lamina', 'vascular', 'bundle', 'sheath', 'microspore'] # possible tissue names, lower case.  refer to /home/hui/network/test/rnaseq.word.count.txt for distinct words in rna seq. rnaseq.word.count.txt is generated by /home/hui/network/test/count_word.py
 
     # build a count dictionary, where key is a word
@@ -300,11 +313,13 @@ if __name__ == '__main__':
     json_dict = {}
     for k in sorted(d_run_keys):
         d = {}
-        k2 = d_run[k]['experiment_id']
+        k2 = d_experiment[k2]['sample_id']
+        k3 = d_run[k]['experiment_id']
+        k4 = d_experiment[k2]['study_id'] if k2 in d_experiment else 'PRJ_UNKNOWN'
         d['tissue'] = d['library_strategy'] = d['library_source'] = d['sample_id'] = '.'
         if k2 in d_experiment:
             d['sample_id'] = d_experiment[k2]['sample_id']
-            d['tissue'] = get_tissue(d_experiment[k2]['sample_id'], d_sample, k2, d_experiment)
+            d['tissue'] = get_tissue(k, d_run,  k2, d_sample, k3, d_experiment, k4, d_study)
             d['library_strategy'] = d_experiment[k2]['library_strategy']
             d['library_source'] = d_experiment[k2]['library_source']
         d['detail'] = 'TBA'
author	Lan Hui <lanhui@zjnu.edu.cn>	2025-04-16 15:26:22 +0800
committer	Lan Hui <lanhui@zjnu.edu.cn>	2025-04-16 15:26:22 +0800
commit	ada33aee714635628d9bdab3cba9f3ec3f2f92a4 (patch)
tree	bb42c6ca75050a77f36ef61d927b8881415406d2 /Code/parse_ena_xml.py
parent	fb2b2e547139739e183a797d4f092974ed82ae00 (diff)