diff options
Diffstat (limited to 'Code')
| -rw-r--r-- | Code/configure.py | 5 | ||||
| -rw-r--r-- | Code/download_ena_records.py | 11 | ||||
| -rw-r--r-- | Code/parse_ena_xml.py | 108 | 
3 files changed, 40 insertions, 84 deletions
| diff --git a/Code/configure.py b/Code/configure.py index a798e4d..34446c3 100644 --- a/Code/configure.py +++ b/Code/configure.py @@ -8,6 +8,11 @@ KMER            = 31  # From download_and_map.py  DAILY_MAP_NUMBER = 10   # download this many samples each time.  I have tested the values of 3, 4, 5, 8.  MIN_FASTQ_FILE_SIZE = 200000000    # in bytes, approximately 200MB +INFO_DIR = '../Data/information/' +ENA_RECORDS_READ_RUN = '../Data/information/ena_read_run.xml' +ENA_RECORDS_READ_EXPERIMENT = '../Data/information/ena_read_experiment.xml' +ENA_RECORDS_SAMPLE = '../Data/information/ena_sample.xml' +ENA_RECORDS_STUDY = '../Data/information/ena_study.xml'  RNA_SEQ_INFO_FILE = '../Data/information/rnaseq_info_database.json'  # some data downloaded from ENA are not RNA-seq (they are ChIP-seq). Use this file to tell whether the file is RNA-seq  DOWNLOADED_SRA_ID_LOG_FILE = '../Data/log/download_log.txt' # a list of downloaded SRA IDs  IGNORED_SRA_ID_LOG_FILE = '../Data/log/download_log_small_sized_ids.txt'  # store SRA IDs with small file size. diff --git a/Code/download_ena_records.py b/Code/download_ena_records.py index 9ec7623..842fc52 100644 --- a/Code/download_ena_records.py +++ b/Code/download_ena_records.py @@ -5,8 +5,10 @@  import os  import sys  import time +import shutil  from configure import TEMP_DIR, UPDATE_NETWORK_LOG_FILE  from log import write_log_file +from configure import INFO_DIR  accession = 3702 # arabidopsis  types = ['read_run', 'read_experiment', 'sample', 'study'] @@ -19,7 +21,14 @@ for t in types:      os.system(cmd)      time.sleep(5) -write_log_file('[download_ena_records.py] ENA records updated. Check folder %s' % (TEMP_DIR), UPDATE_NETWORK_LOG_FILE) +for t in types: +    fname = os.path.join(TEMP_DIR, 'ena_'+t+'.xml') +    if os.path.exists(fname): +        print(f'Move {fname} to {INFO_DIR}') +        shutil.move(fname, INFO_DIR) +    time.sleep(5) + +write_log_file('[download_ena_records.py] ENA records updated. Check folder %s' % (INFO_DIR), UPDATE_NETWORK_LOG_FILE)  #https://www.ebi.ac.uk/ena/browser/api/xml/links/taxon?accession=3702&result=read_run  #https://www.ebi.ac.uk/ena/browser/api/xml/links/taxon?accession=3702&result=read_experiment diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py index 4c54bc5..1614d7d 100644 --- a/Code/parse_ena_xml.py +++ b/Code/parse_ena_xml.py @@ -28,7 +28,7 @@  import os, json, re, operator  import xml.etree.ElementTree  import sys - +from configure import ENA_RECORDS_READ_RUN, ENA_RECORDS_READ_EXPERIMENT, ENA_RECORDS_SAMPLE, ENA_RECORDS_STUDY  MAX_DESCRIPTION_LENGTH = 6000 # max number to characters to keep in json file @@ -70,15 +70,15 @@ def parse_study(fname):      d = {}      root = xml.etree.ElementTree.parse(fname).getroot() -      for c in root.findall('PROJECT'): +        primary_id = c.get('accession') +          d2 = {}          acc = c.find('./IDENTIFIERS/SECONDARY_ID')          if acc != None:              d2['secondary_id'] = acc.text          else:              d2['secondary_id'] = '.' -        d2['primary_id'] = c.get('accession')          desc = c.find('DESCRIPTION')          d2['description'] = 'None' @@ -90,28 +90,8 @@ def parse_study(fname):          if title != None:              d2['title'] = title.text -        run_id = '' -        for i in c.findall('./PROJECT_LINKS/PROJECT_LINK/XREF_LINK/ID'): -            s = i.text -            if 'RR' in s: -                run_id = s; -                break -        lst = run_id.split(',') -        for x in lst: -            lst2 = x.split('-') -            if len(lst2) == 1 and lst2[0] != '': -                k = lst2[0] -                d[k] = d2 # k is run id, such as SRR, ERR or DRR -            elif len(lst2) == 2: -                ss = lst2[0] -                ee = lst2[1] -                first_three_letters = ss[0:3] -                sz = len(ss) - 3 -                ss_t = int(ss[3:]) -                ee_t = int(ee[3:]) -                for j in range(ss_t, ee_t+1, 1): -                    k = first_three_letters + str(j).zfill(sz) -                    d[k] = d2 +        d[primary_id] = d2 +      return d @@ -119,16 +99,16 @@ def parse_sample(fname):      d = {}      root = xml.etree.ElementTree.parse(fname).getroot() -      for c in root.findall('SAMPLE'): +        primary_id = c.get('accession') +          d2 = {}          acc = c.find('./IDENTIFIERS/EXTERNAL_ID')          if acc != None:              d2['external_id'] = acc.text          else:              d2['external_id'] = '.' -        d2['primary_id'] = c.get('accession') -         +          desc = c.find('DESCRIPTION')          d2['description'] = 'None'          if desc != None and desc.text != None: @@ -145,28 +125,8 @@ def parse_sample(fname):                  tissue_type += i.text + ' '          d2['tissue'] = tissue_type.strip() -        run_id = '' -        for i in c.findall('./SAMPLE_LINKS/SAMPLE_LINK/XREF_LINK/ID'): -            s = i.text -            if 'RR' in s: -                run_id = s; -                break -        lst = run_id.split(',') -        for x in lst: -            lst2 = x.split('-') # e.g., SRR520490-SRR520491 -            if len(lst2) == 1 and lst2[0] != '': -                k = lst2[0] -                d[k] = d2 # k is run id, such as SRR, ERR or DRR -            elif len(lst2) == 2: -                ss = lst2[0] -                ee = lst2[1] -                first_three_letters = ss[0:3] -                sz = len(ss) - 3 -                ss_t = int(ss[3:]) -                ee_t = int(ee[3:]) -                for j in range(ss_t, ee_t+1, 1): -                    k = first_three_letters + str(j).zfill(sz) -                    d[k] = d2 +        d[primary_id] = d2 +      return d @@ -176,8 +136,9 @@ def parse_experiment(fname):      root = xml.etree.ElementTree.parse(fname).getroot()      for c in root.findall('EXPERIMENT'): +        primary_id = c.get('accession') +          d2 = {} -        d2['primary_id'] = c.get('accession')          title = c.find('TITLE')          d2['title'] = 'None' @@ -198,30 +159,9 @@ def parse_experiment(fname):          d2['library_source'] = 'None!'          if source != None and source.text != None:              d2['library_source'] = source.text -             -         -        run_id = '' -        for i in c.findall('./EXPERIMENT_LINKS/EXPERIMENT_LINK/XREF_LINK/ID'): -            s = i.text -            if 'RR' in s: -                run_id = s; -                break -        lst = run_id.split(',') -        for x in lst: -            lst2 = x.split('-') # e.g., SRR520490-SRR520491 -            if len(lst2) == 1 and lst2[0] != '': -                k = lst2[0] -                d[k] = d2 # k is run id, such as SRR, ERR or DRR -            elif len(lst2) == 2: -                ss = lst2[0] -                ee = lst2[1] -                first_three_letters = ss[0:3] -                sz = len(ss) - 3 -                ss_t = int(ss[3:]) -                ee_t = int(ee[3:]) -                for j in range(ss_t, ee_t+1, 1): -                    k = first_three_letters + str(j).zfill(sz) -                    d[k] = d2 + +        d[primary_id] = d2 +      return d @@ -258,19 +198,21 @@ def get_tissue(s):      return result.rstrip(';') - - -  ## main  if __name__ == '__main__':      # ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, SAMPLE, EXPERIMENT.  So each      # of the following function is call for each type of xml file.  The input files were downloaded from https://www.ebi.ac.uk/ena/browser/view/Taxon:3702 -    d_run        = parse_run('../Data/information/ena_3702_read_run.xml')                   # RUN -    d_sample     = parse_sample('../Data/information/ena_3702_sample.xml')                  # SAMPLE -    d_study      = parse_study('../Data/information/ena_3702_read_study.xml')               # STUDY -    d_experiment = parse_experiment('../Data/information/ena_3702_read_experiment.xml')     # EXPERIMENT, including library strategy (RNA-Seq, WSG, etc) and library source (TRANSCRIPTIOMIC, GENOMIC, etc) -     +    d_run        = parse_run(ENA_RECORDS_READ_RUN)                   # RUN +    print(f'{ENA_RECORDS_READ_RUN}:  {len(d_run)} entries') +    d_experiment = parse_experiment(ENA_RECORDS_READ_EXPERIMENT)     # EXPERIMENT, including library strategy (RNA-Seq, WSG, etc) and library source (TRANSCRIPTIOMIC, GENOMIC, etc) +    print(f'{ENA_RECORDS_READ_EXPERIMENT}:  {len(d_experiment)} entries') +    #print(d_experiment['ERX9699060']) +    d_sample     = parse_sample(ENA_RECORDS_SAMPLE)                  # SAMPLE +    print(f'{ENA_RECORDS_SAMPLE}:  {len(d_sample)} entries') +    d_study      = parse_study(ENA_RECORDS_STUDY)                    # STUDY +    print(f'{ENA_RECORDS_STUDY}:  {len(d_study)} entries') +      cmd = 'export PYTHONIOENCODING=UTF-8'  # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection      os.system(cmd) | 
