From 0a09eb64a364a1bf67e06c77cce34672beecaec1 Mon Sep 17 00:00:00 2001
From: Lan Hui <lanhui@zjnu.edu.cn>
Date: Wed, 16 Apr 2025 15:48:13 +0800
Subject: Output some stats, beginning with %%

---
 Code/parse_ena_xml.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'Code')

diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index c9faefe..8d45481 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -266,14 +266,13 @@ if __name__ == '__main__':
     # ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, SAMPLE, EXPERIMENT.  So each
     # of the following function is call for each type of xml file.  The input files were downloaded from https://www.ebi.ac.uk/ena/browser/view/Taxon:3702
     d_run        = parse_run(ENA_RECORDS_READ_RUN)                   # RUN
-    print(f'{ENA_RECORDS_READ_RUN}:  {len(d_run)} entries')
+    print(f'%% {ENA_RECORDS_READ_RUN}:  {len(d_run)} entries')
     d_experiment = parse_experiment(ENA_RECORDS_READ_EXPERIMENT)     # EXPERIMENT, including library strategy (RNA-Seq, WSG, etc) and library source (TRANSCRIPTIOMIC, GENOMIC, etc)
-    print(f'{ENA_RECORDS_READ_EXPERIMENT}:  {len(d_experiment)} entries')
-    #print(d_experiment['ERX9699060'])
+    print(f'%% {ENA_RECORDS_READ_EXPERIMENT}:  {len(d_experiment)} entries')
     d_sample     = parse_sample(ENA_RECORDS_SAMPLE)                  # SAMPLE
-    print(f'{ENA_RECORDS_SAMPLE}:  {len(d_sample)} entries')
+    print(f'%% {ENA_RECORDS_SAMPLE}:  {len(d_sample)} entries')
     d_study      = parse_study(ENA_RECORDS_STUDY)                    # STUDY
-    print(f'{ENA_RECORDS_STUDY}:  {len(d_study)} entries')
+    print(f'%% {ENA_RECORDS_STUDY}:  {len(d_study)} entries')
 
     cmd = 'export PYTHONIOENCODING=UTF-8'  # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection
     os.system(cmd)
@@ -311,12 +310,14 @@ if __name__ == '__main__':
 
     # Make a json file as well. this file is used to display rna-seq information in scatterplots.
     json_dict = {}
+    count_transcriptomic = 0
+    count_tissue = 0
     for k in sorted(d_run_keys):
         d = {}
         k2 = d_run[k]['experiment_id']
         k3 = d_experiment[k2]['sample_id'] if k2 in d_experiment else 'SAM_UNKNOWN'
         k4 = d_experiment[k2]['study_id'] if k2 in d_experiment else 'PRJ_UNKNOWN'
-        d['tissue'] = d['library_strategy'] = d['library_source'] = d['sample_id'] = '.'
+        d['tissue'] = d['library_strategy'] = d['library_source'] = d['sample_id'] = ''
         if k2 in d_experiment:
             d['sample_id'] = d_experiment[k2]['sample_id']
             d['tissue'] = get_tissue(k, d_run, k2, d_experiment, k3, d_sample, k4, d_study)
@@ -324,7 +325,13 @@ if __name__ == '__main__':
             d['library_source'] = d_experiment[k2]['library_source']
         d['detail'] = 'TBA'
         json_dict[k] = d
-        print(d)
+        if d['library_source'] == 'TRANSCRIPTOMIC':
+            count_transcriptomic += 1
+            if d['tissue']:
+                count_tissue += 1
+
+    percent = 100*count_tissue/count_transcriptomic if count_transcriptomic else 0.0  # no TRANSCRIPTOMIC runs: report 0.0 instead of raising ZeroDivisionError
+    print(f'%% RNA-seq: {count_transcriptomic}, of which {count_tissue} having tissue info ({percent} percent)')
 
     fname = '../Data/information/rnaseq_info_database.json.temp'
     with open(fname, 'w') as f:
-- 
cgit v1.2.1