Parse ENA records XML files. It seems that the XML structure of the experiment, study and sample records has changed.

author: Hui Lan <lanhui@zjnu.edu.cn> 2025-04-13 16:08:17 +0800
committer: Hui Lan <lanhui@zjnu.edu.cn> 2025-04-13 16:08:17 +0800
commit: 7d161d428463ac865459c251a820d85085a2c5fb (patch)
tree: b334d1d0b44e47b81b1ba6043250429da6c5f559 /Code
parent: 36891c55666c009a3c2e106badd81bf97d971abe (diff)
3 files changed, 40 insertions, 84 deletions
diff --git a/Code/configure.py b/Code/configure.py
index a798e4d..34446c3 100644
--- a/Code/configure.py
+++ b/Code/configure.py
@@ -8,6 +8,11 @@ KMER            = 31
 # From download_and_map.py
 DAILY_MAP_NUMBER = 10   # download this many samples each time.  I have tested the values of 3, 4, 5, 8.
 MIN_FASTQ_FILE_SIZE = 200000000    # in bytes, approximately 200MB
+INFO_DIR = '../Data/information/'
+ENA_RECORDS_READ_RUN = '../Data/information/ena_read_run.xml'
+ENA_RECORDS_READ_EXPERIMENT = '../Data/information/ena_read_experiment.xml'
+ENA_RECORDS_SAMPLE = '../Data/information/ena_sample.xml'
+ENA_RECORDS_STUDY = '../Data/information/ena_study.xml'
 RNA_SEQ_INFO_FILE = '../Data/information/rnaseq_info_database.json'  # some data downloaded from ENA are not RNA-seq (they are ChIP-seq). Use this file to tell whether the file is RNA-seq
 DOWNLOADED_SRA_ID_LOG_FILE = '../Data/log/download_log.txt' # a list of downloaded SRA IDs
 IGNORED_SRA_ID_LOG_FILE = '../Data/log/download_log_small_sized_ids.txt'  # store SRA IDs with small file size.
diff --git a/Code/download_ena_records.py b/Code/download_ena_records.py
index 9ec7623..842fc52 100644
--- a/Code/download_ena_records.py
+++ b/Code/download_ena_records.py
@@ -5,8 +5,10 @@
 import os
 import sys
 import time
+import shutil
 from configure import TEMP_DIR, UPDATE_NETWORK_LOG_FILE
 from log import write_log_file
+from configure import INFO_DIR
 
 accession = 3702 # arabidopsis
 types = ['read_run', 'read_experiment', 'sample', 'study']
@@ -19,7 +21,14 @@ for t in types:
     os.system(cmd)
     time.sleep(5)
 
-write_log_file('[download_ena_records.py] ENA records updated. Check folder %s' % (TEMP_DIR), UPDATE_NETWORK_LOG_FILE)
+for t in types:
+    fname = os.path.join(TEMP_DIR, 'ena_'+t+'.xml')
+    if os.path.exists(fname):
+        print(f'Move {fname} to {INFO_DIR}')
+        shutil.move(fname, INFO_DIR)
+    time.sleep(5)
+
+write_log_file('[download_ena_records.py] ENA records updated. Check folder %s' % (INFO_DIR), UPDATE_NETWORK_LOG_FILE)
 
 #https://www.ebi.ac.uk/ena/browser/api/xml/links/taxon?accession=3702&result=read_run
 #https://www.ebi.ac.uk/ena/browser/api/xml/links/taxon?accession=3702&result=read_experiment
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
index 4c54bc5..1614d7d 100644
--- a/Code/parse_ena_xml.py
+++ b/Code/parse_ena_xml.py
@@ -28,7 +28,7 @@
 import os, json, re, operator
 import xml.etree.ElementTree
 import sys
-
+from configure import ENA_RECORDS_READ_RUN, ENA_RECORDS_READ_EXPERIMENT, ENA_RECORDS_SAMPLE, ENA_RECORDS_STUDY
 MAX_DESCRIPTION_LENGTH = 6000 # max number to characters to keep in json file
 
 
@@ -70,15 +70,15 @@ def parse_study(fname):
     d = {}
     root = xml.etree.ElementTree.parse(fname).getroot()
 
-
     for c in root.findall('PROJECT'):
+        primary_id = c.get('accession')
+
         d2 = {}
         acc = c.find('./IDENTIFIERS/SECONDARY_ID')
         if acc != None:
             d2['secondary_id'] = acc.text
         else:
             d2['secondary_id'] = '.'
-        d2['primary_id'] = c.get('accession')
         
         desc = c.find('DESCRIPTION')
         d2['description'] = 'None'
@@ -90,28 +90,8 @@ def parse_study(fname):
         if title != None:
             d2['title'] = title.text
 
-        run_id = ''
-        for i in c.findall('./PROJECT_LINKS/PROJECT_LINK/XREF_LINK/ID'):
-            s = i.text
-            if 'RR' in s:
-                run_id = s;
-                break
-        lst = run_id.split(',')
-        for x in lst:
-            lst2 = x.split('-')
-            if len(lst2) == 1 and lst2[0] != '':
-                k = lst2[0]
-                d[k] = d2 # k is run id, such as SRR, ERR or DRR
-            elif len(lst2) == 2:
-                ss = lst2[0]
-                ee = lst2[1]
-                first_three_letters = ss[0:3]
-                sz = len(ss) - 3
-                ss_t = int(ss[3:])
-                ee_t = int(ee[3:])
-                for j in range(ss_t, ee_t+1, 1):
-                    k = first_three_letters + str(j).zfill(sz)
-                    d[k] = d2
+        d[primary_id] = d2
+
     return d
 
 
@@ -119,16 +99,16 @@ def parse_sample(fname):
     d = {}
     root = xml.etree.ElementTree.parse(fname).getroot()
 
-
     for c in root.findall('SAMPLE'):
+        primary_id = c.get('accession')
+
         d2 = {}
         acc = c.find('./IDENTIFIERS/EXTERNAL_ID')
         if acc != None:
             d2['external_id'] = acc.text
         else:
             d2['external_id'] = '.'
-        d2['primary_id'] = c.get('accession')
-        
+
         desc = c.find('DESCRIPTION')
         d2['description'] = 'None'
         if desc != None and desc.text != None:
@@ -145,28 +125,8 @@ def parse_sample(fname):
                 tissue_type += i.text + ' '
         d2['tissue'] = tissue_type.strip()
 
-        run_id = ''
-        for i in c.findall('./SAMPLE_LINKS/SAMPLE_LINK/XREF_LINK/ID'):
-            s = i.text
-            if 'RR' in s:
-                run_id = s;
-                break
-        lst = run_id.split(',')
-        for x in lst:
-            lst2 = x.split('-') # e.g., SRR520490-SRR520491
-            if len(lst2) == 1 and lst2[0] != '':
-                k = lst2[0]
-                d[k] = d2 # k is run id, such as SRR, ERR or DRR
-            elif len(lst2) == 2:
-                ss = lst2[0]
-                ee = lst2[1]
-                first_three_letters = ss[0:3]
-                sz = len(ss) - 3
-                ss_t = int(ss[3:])
-                ee_t = int(ee[3:])
-                for j in range(ss_t, ee_t+1, 1):
-                    k = first_three_letters + str(j).zfill(sz)
-                    d[k] = d2
+        d[primary_id] = d2
+
     return d
 
 
@@ -176,8 +136,9 @@ def parse_experiment(fname):
     root = xml.etree.ElementTree.parse(fname).getroot()
     
     for c in root.findall('EXPERIMENT'):
+        primary_id = c.get('accession')
+
         d2 = {}
-        d2['primary_id'] = c.get('accession')
         
         title = c.find('TITLE')
         d2['title'] = 'None'
@@ -198,30 +159,9 @@ def parse_experiment(fname):
         d2['library_source'] = 'None!'
         if source != None and source.text != None:
             d2['library_source'] = source.text
-            
-        
-        run_id = ''
-        for i in c.findall('./EXPERIMENT_LINKS/EXPERIMENT_LINK/XREF_LINK/ID'):
-            s = i.text
-            if 'RR' in s:
-                run_id = s;
-                break
-        lst = run_id.split(',')
-        for x in lst:
-            lst2 = x.split('-') # e.g., SRR520490-SRR520491
-            if len(lst2) == 1 and lst2[0] != '':
-                k = lst2[0]
-                d[k] = d2 # k is run id, such as SRR, ERR or DRR
-            elif len(lst2) == 2:
-                ss = lst2[0]
-                ee = lst2[1]
-                first_three_letters = ss[0:3]
-                sz = len(ss) - 3
-                ss_t = int(ss[3:])
-                ee_t = int(ee[3:])
-                for j in range(ss_t, ee_t+1, 1):
-                    k = first_three_letters + str(j).zfill(sz)
-                    d[k] = d2
+
+        d[primary_id] = d2
+
     return d
 
 
@@ -258,19 +198,21 @@ def get_tissue(s):
     return result.rstrip(';')
 
 
-
-
-
 ## main
 if __name__ == '__main__':
     
     # ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, SAMPLE, EXPERIMENT.  So each
     # of the following function is call for each type of xml file.  The input files were downloaded from https://www.ebi.ac.uk/ena/browser/view/Taxon:3702
-    d_run        = parse_run('../Data/information/ena_3702_read_run.xml')                   # RUN
-    d_sample     = parse_sample('../Data/information/ena_3702_sample.xml')                  # SAMPLE
-    d_study      = parse_study('../Data/information/ena_3702_read_study.xml')               # STUDY
-    d_experiment = parse_experiment('../Data/information/ena_3702_read_experiment.xml')     # EXPERIMENT, including library strategy (RNA-Seq, WSG, etc) and library source (TRANSCRIPTIOMIC, GENOMIC, etc)
-    
+    d_run        = parse_run(ENA_RECORDS_READ_RUN)                   # RUN
+    print(f'{ENA_RECORDS_READ_RUN}:  {len(d_run)} entries')
+    d_experiment = parse_experiment(ENA_RECORDS_READ_EXPERIMENT)     # EXPERIMENT, including library strategy (RNA-Seq, WGS, etc) and library source (TRANSCRIPTOMIC, GENOMIC, etc)
+    print(f'{ENA_RECORDS_READ_EXPERIMENT}:  {len(d_experiment)} entries')
+    #print(d_experiment['ERX9699060'])
+    d_sample     = parse_sample(ENA_RECORDS_SAMPLE)                  # SAMPLE
+    print(f'{ENA_RECORDS_SAMPLE}:  {len(d_sample)} entries')
+    d_study      = parse_study(ENA_RECORDS_STUDY)                    # STUDY
+    print(f'{ENA_RECORDS_STUDY}:  {len(d_study)} entries')
+
     cmd = 'export PYTHONIOENCODING=UTF-8'  # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection
     os.system(cmd)
author	Hui Lan <lanhui@zjnu.edu.cn>	2025-04-13 16:08:17 +0800
committer	Hui Lan <lanhui@zjnu.edu.cn>	2025-04-13 16:08:17 +0800
commit	7d161d428463ac865459c251a820d85085a2c5fb (patch)
tree	b334d1d0b44e47b81b1ba6043250429da6c5f559 /Code
parent	36891c55666c009a3c2e106badd81bf97d971abe (diff)