Get more complete tissue information from ena_sample.xml

author: Lan Hui <lanhui@zjnu.edu.cn> 2025-06-03 16:50:29 +0800
committer: Lan Hui <lanhui@zjnu.edu.cn> 2025-06-03 16:50:29 +0800
commit: def5d8276577805f5c37bb5e694572646c71a120 (patch)
tree: fa3e2d2eef8c060b02eeaf5b1d7e476c957dcef3
parent: c1dcc3ecf65c081468794eeb134c5fdad6fb4081 (diff)
1 files changed, 24 insertions, 4 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index f0ddfb3..cbabc3c 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -193,6 +193,12 @@ def parse_sample(fname):
         else:
             d2['external_id'] = '.'
 
+        acc = c.find('./IDENTIFIERS/SECONDARY_ID')
+        if acc != None:
+            secondary_id = acc.text
+        else:
+            secondary_id = 'None'
+
         desc = c.find('DESCRIPTION')
         d2['description'] = 'None'
         if desc != None and desc.text != None:
@@ -214,6 +220,7 @@ def parse_sample(fname):
         d2['tissue'] = clean_tissue_info(tissue_type) # remove space, lower letters, and remove punctuations
 
         d[primary_id] = d2
+        d[secondary_id] = d2
 
     return d
 
@@ -286,7 +293,7 @@ def parse_experiment(fname):
         if desc != None and desc.text != None:
             d2['description'] = desc.text
 
-        sample = c.find('./DESIGN/SAMPLE_DESCRIPTOR/IDENTIFIERS/EXTERNAL_ID')
+        sample = c.find('./DESIGN/SAMPLE_DESCRIPTOR/IDENTIFIERS/PRIMARY_ID')
         d2['sample_id'] = 'None'
         if sample != None and sample.text != None:
             d2['sample_id'] = sample.text
@@ -407,6 +414,7 @@ if __name__ == '__main__':
     d_study      = parse_study(ENA_RECORDS_STUDY)                    # STUDY
     print(f'%% {ENA_RECORDS_STUDY}:  {len(d_study)} entries')
 
+
     cmd = 'export PYTHONIOENCODING=UTF-8'  # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection
     os.system(cmd)
 
@@ -450,20 +458,30 @@ if __name__ == '__main__':
     json_dict = {}
     count_transcriptomic = 0
     count_tissue = 0
+    count_no_sample_id = 0
     for k in sorted(d_run_keys):
+        # k - run id, k2 - experiment id, k3 - sample id, k4 - study id
         d = {}
         k2 = d_run[k]['experiment_id']
         d['experiment_id'] = k2
         d['tissue'] = ''
         d['sample_id'] = d['study_id'] = d['library_strategy'] = d['library_source'] = d['detail']  = ''
+        k3 = k4 = 'None'
         if k2 in d_experiment:
             k3 = d_experiment[k2]['sample_id']
+            if k3 == 'None':
+                k3 = d_run[k]['sample_id']
+                count_no_sample_id += 1
             k4 = d_experiment[k2]['study_id']
-            d['sample_id'] = k3
-            d['study_id'] = k4
-            d['tissue'] = get_tissue(k, d_run, k2, d_experiment, k3, d_sample, k4, d_study)
             d['library_strategy'] = d_experiment[k2]['library_strategy']
             d['library_source'] = d_experiment[k2]['library_source']
+        else:
+            k3 = d_run[k]['sample_id']
+
+        d['sample_id'] = k3
+        d['study_id'] = k4
+        d['tissue'] = get_tissue(k, d_run, k2, d_experiment, k3, d_sample, k4, d_study)
+
         json_dict[k] = d
         if d['library_source'] == 'TRANSCRIPTOMIC':
             count_transcriptomic += 1
@@ -473,6 +491,8 @@ if __name__ == '__main__':
     percent = 100*count_tissue/count_transcriptomic
     print(f'%% RNA-seq: {count_transcriptomic}, of which {count_tissue} having tissue info ({percent} percent)')
 
+    print(f'%% Sample id not in d_experiment count: {count_no_sample_id}')
+
     temp_fname = RNA_SEQ_INFO_DATABASE_JSON + '.temp'
     with open(temp_fname, 'w') as f:
         json.dump(json_dict, f, indent=4)
author	Lan Hui <lanhui@zjnu.edu.cn>	2025-06-03 16:50:29 +0800
committer	Lan Hui <lanhui@zjnu.edu.cn>	2025-06-03 16:50:29 +0800
commit	def5d8276577805f5c37bb5e694572646c71a120 (patch)
tree	fa3e2d2eef8c060b02eeaf5b1d7e476c957dcef3
parent	c1dcc3ecf65c081468794eeb134c5fdad6fb4081 (diff)