diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2025-04-13 16:08:17 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2025-04-13 16:08:17 +0800 |
commit | 7d161d428463ac865459c251a820d85085a2c5fb (patch) | |
tree | b334d1d0b44e47b81b1ba6043250429da6c5f559 /Code/parse_ena_xml.py | |
parent | 36891c55666c009a3c2e106badd81bf97d971abe (diff) |
Parse ENA records XML files. It seems that the XML structure for experiment, study and sample has changed.
Diffstat (limited to 'Code/parse_ena_xml.py')
-rw-r--r-- | Code/parse_ena_xml.py | 108 |
1 files changed, 25 insertions, 83 deletions
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py index 4c54bc5..1614d7d 100644 --- a/Code/parse_ena_xml.py +++ b/Code/parse_ena_xml.py @@ -28,7 +28,7 @@ import os, json, re, operator import xml.etree.ElementTree import sys - +from configure import ENA_RECORDS_READ_RUN, ENA_RECORDS_READ_EXPERIMENT, ENA_RECORDS_SAMPLE, ENA_RECORDS_STUDY MAX_DESCRIPTION_LENGTH = 6000 # max number to characters to keep in json file @@ -70,15 +70,15 @@ def parse_study(fname): d = {} root = xml.etree.ElementTree.parse(fname).getroot() - for c in root.findall('PROJECT'): + primary_id = c.get('accession') + d2 = {} acc = c.find('./IDENTIFIERS/SECONDARY_ID') if acc != None: d2['secondary_id'] = acc.text else: d2['secondary_id'] = '.' - d2['primary_id'] = c.get('accession') desc = c.find('DESCRIPTION') d2['description'] = 'None' @@ -90,28 +90,8 @@ def parse_study(fname): if title != None: d2['title'] = title.text - run_id = '' - for i in c.findall('./PROJECT_LINKS/PROJECT_LINK/XREF_LINK/ID'): - s = i.text - if 'RR' in s: - run_id = s; - break - lst = run_id.split(',') - for x in lst: - lst2 = x.split('-') - if len(lst2) == 1 and lst2[0] != '': - k = lst2[0] - d[k] = d2 # k is run id, such as SRR, ERR or DRR - elif len(lst2) == 2: - ss = lst2[0] - ee = lst2[1] - first_three_letters = ss[0:3] - sz = len(ss) - 3 - ss_t = int(ss[3:]) - ee_t = int(ee[3:]) - for j in range(ss_t, ee_t+1, 1): - k = first_three_letters + str(j).zfill(sz) - d[k] = d2 + d[primary_id] = d2 + return d @@ -119,16 +99,16 @@ def parse_sample(fname): d = {} root = xml.etree.ElementTree.parse(fname).getroot() - for c in root.findall('SAMPLE'): + primary_id = c.get('accession') + d2 = {} acc = c.find('./IDENTIFIERS/EXTERNAL_ID') if acc != None: d2['external_id'] = acc.text else: d2['external_id'] = '.' 
- d2['primary_id'] = c.get('accession') - + desc = c.find('DESCRIPTION') d2['description'] = 'None' if desc != None and desc.text != None: @@ -145,28 +125,8 @@ def parse_sample(fname): tissue_type += i.text + ' ' d2['tissue'] = tissue_type.strip() - run_id = '' - for i in c.findall('./SAMPLE_LINKS/SAMPLE_LINK/XREF_LINK/ID'): - s = i.text - if 'RR' in s: - run_id = s; - break - lst = run_id.split(',') - for x in lst: - lst2 = x.split('-') # e.g., SRR520490-SRR520491 - if len(lst2) == 1 and lst2[0] != '': - k = lst2[0] - d[k] = d2 # k is run id, such as SRR, ERR or DRR - elif len(lst2) == 2: - ss = lst2[0] - ee = lst2[1] - first_three_letters = ss[0:3] - sz = len(ss) - 3 - ss_t = int(ss[3:]) - ee_t = int(ee[3:]) - for j in range(ss_t, ee_t+1, 1): - k = first_three_letters + str(j).zfill(sz) - d[k] = d2 + d[primary_id] = d2 + return d @@ -176,8 +136,9 @@ def parse_experiment(fname): root = xml.etree.ElementTree.parse(fname).getroot() for c in root.findall('EXPERIMENT'): + primary_id = c.get('accession') + d2 = {} - d2['primary_id'] = c.get('accession') title = c.find('TITLE') d2['title'] = 'None' @@ -198,30 +159,9 @@ def parse_experiment(fname): d2['library_source'] = 'None!' 
if source != None and source.text != None: d2['library_source'] = source.text - - - run_id = '' - for i in c.findall('./EXPERIMENT_LINKS/EXPERIMENT_LINK/XREF_LINK/ID'): - s = i.text - if 'RR' in s: - run_id = s; - break - lst = run_id.split(',') - for x in lst: - lst2 = x.split('-') # e.g., SRR520490-SRR520491 - if len(lst2) == 1 and lst2[0] != '': - k = lst2[0] - d[k] = d2 # k is run id, such as SRR, ERR or DRR - elif len(lst2) == 2: - ss = lst2[0] - ee = lst2[1] - first_three_letters = ss[0:3] - sz = len(ss) - 3 - ss_t = int(ss[3:]) - ee_t = int(ee[3:]) - for j in range(ss_t, ee_t+1, 1): - k = first_three_letters + str(j).zfill(sz) - d[k] = d2 + + d[primary_id] = d2 + return d @@ -258,19 +198,21 @@ def get_tissue(s): return result.rstrip(';') - - - ## main if __name__ == '__main__': # ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, SAMPLE, EXPERIMENT. So each # of the following function is call for each type of xml file. The input files were downloaded from https://www.ebi.ac.uk/ena/browser/view/Taxon:3702 - d_run = parse_run('../Data/information/ena_3702_read_run.xml') # RUN - d_sample = parse_sample('../Data/information/ena_3702_sample.xml') # SAMPLE - d_study = parse_study('../Data/information/ena_3702_read_study.xml') # STUDY - d_experiment = parse_experiment('../Data/information/ena_3702_read_experiment.xml') # EXPERIMENT, including library strategy (RNA-Seq, WSG, etc) and library source (TRANSCRIPTIOMIC, GENOMIC, etc) - + d_run = parse_run(ENA_RECORDS_READ_RUN) # RUN + print(f'{ENA_RECORDS_READ_RUN}: {len(d_run)} entries') + d_experiment = parse_experiment(ENA_RECORDS_READ_EXPERIMENT) # EXPERIMENT, including library strategy (RNA-Seq, WSG, etc) and library source (TRANSCRIPTIOMIC, GENOMIC, etc) + print(f'{ENA_RECORDS_READ_EXPERIMENT}: {len(d_experiment)} entries') + #print(d_experiment['ERX9699060']) + d_sample = parse_sample(ENA_RECORDS_SAMPLE) # SAMPLE + print(f'{ENA_RECORDS_SAMPLE}: {len(d_sample)} 
entries') + d_study = parse_study(ENA_RECORDS_STUDY) # STUDY + print(f'{ENA_RECORDS_STUDY}: {len(d_study)} entries') + cmd = 'export PYTHONIOENCODING=UTF-8' # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection os.system(cmd) |