# Code/download_ena_metadata.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43

# Usage: python download_ena_metadata.py
#        Modify LIBRARY_STRATEGY, MIN_READ_COUNT in this file, as filters.
#
# Purpose: download read description from ENA website, in the form of xml, to be parsed by parse_ena_xml.py.
#
# 22 Feb 2017, slcu, hui
# 12 Apr 2017, slcu, hui

import os, sys

# Search filters and result types used to build the ENA warehouse query below.
TAXID            = '3702'    # organism Tax ID for Arabidopsis, used in the tax_eq() filter. Change it for other organisms.
LIBRARY_STRATEGY = 'RNA-Seq' # can be ChIP-Seq, or others, see http://www.ebi.ac.uk/ena/submit/reads-library-strategy
MIN_READ_COUNT   = 1000000   # only download samples having at least this many reads (read_count >= this value)
RESULT_LIST      = ['read_run', 'read_study', 'read_experiment'] # one XML file is downloaded per entry. Don't modify. See http://www.ebi.ac.uk/ena/data/warehouse/usage


def convert_name(s):
    """Return the short tag used in output file names for library strategy *s*.

    The comparison ignores case. 'rna-seq' maps to 'rnaseq' and 'chip-seq'
    maps to 'chipseq'; every other value maps to 'unknownseq'.
    """
    short_names = {
        'rna-seq': 'rnaseq',
        'chip-seq': 'chipseq',
    }
    key = s.lower()
    return short_names.get(key, 'unknownseq')


# Download one XML file per entry of RESULT_LIST. Each download is written to
# the scratch file result.xml, has its lines starting with 'Entry:' removed,
# and is then renamed to ena_<strategy>_<result>.xml. When any step fails the
# result is reported on stderr and skipped, so an empty or partial download is
# never listed as a finished file.
fname_lst = []  # names of the files that were actually produced
for result in RESULT_LIST:
    cmd = 'wget \"http://www.ebi.ac.uk/ena/data/warehouse/search?query=%%22tax_eq(%s)%%20AND%%20library_strategy=%%22%s%%22%%20AND%%20read_count%%3E=%s%%22&result=%s&display=xml\" -O result.xml' % (TAXID, LIBRARY_STRATEGY, MIN_READ_COUNT, result) # make a RESTful download link, see http://www.ebi.ac.uk/ena/browse/search-rest
    print(cmd)
    # os.system returns a non-zero status when the command fails.
    if os.system(cmd) != 0:
        sys.stderr.write('Warning: download of %s failed, skipping it.\n' % (result))
        # wget -O creates result.xml even when the download fails; drop it.
        if os.path.exists('result.xml'):
            os.remove('result.xml')
        continue
    fname_components = ['ena', convert_name(LIBRARY_STRATEGY), result]
    fname = '_'.join(fname_components)
    fname = fname.replace(' ', '_') + '.xml'
    # remove lines with 'Entry:'
    cmd = 'sed -i.bak \'/^Entry:/d\' result.xml'
    if os.system(cmd) != 0:
        sys.stderr.write('Warning: could not clean result.xml for %s, skipping it.\n' % (result))
        if os.path.exists('result.xml'):
            os.remove('result.xml')
        continue
    # rename file
    cmd = 'mv result.xml %s' % (fname)
    if os.system(cmd) != 0:
        sys.stderr.write('Warning: could not rename result.xml to %s.\n' % (fname))
        continue
    fname_lst.append(fname)

# sed -i.bak only ever creates result.xml.bak here, so remove just that file
# instead of every *.xml.bak in the working directory.
cmd = 'rm -f result.xml.bak'
os.system(cmd)
if fname_lst:
    print('Done. Check %s. Move them to Data/information.' % (' '.join(fname_lst)))
else:
    sys.stderr.write('No file was downloaded. Check the warnings above.\n')