1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
# Usage: python download_ena_metadata.py
# Edit TAXID, LIBRARY_STRATEGY and MIN_READ_COUNT in this file to change the query filters.
#
# Purpose: download read descriptions from the ENA website as XML, to be parsed by parse_ena_xml.py.
#
# 22 Feb 2017, slcu, hui
# 12 Apr 2017, slcu, hui
import os, sys
# Organism Tax ID used in the query; 3702 is Arabidopsis. Change it for other organisms.
TAXID = "3702"

# Library strategy filter. Can be ChIP-Seq, or others, see
# http://www.ebi.ac.uk/ena/submit/reads-library-strategy
LIBRARY_STRATEGY = "RNA-Seq"

# Only download for samples having at least this many reads.
MIN_READ_COUNT = 1000000

# Result types requested from ENA, one download per entry. Don't modify.
# See http://www.ebi.ac.uk/ena/data/warehouse/usage
RESULT_LIST = ["read_run", "read_study", "read_experiment"]
def convert_name(s):
    """Return the short tag used in output file names for a library strategy.

    The comparison ignores case: 'RNA-Seq' gives 'rnaseq', 'ChIP-Seq' gives
    'chipseq', and every other value gives 'unknownseq'.

    s -- library strategy name as a string, e.g. the LIBRARY_STRATEGY setting.
    """
    known_tags = {
        'rna-seq': 'rnaseq',
        'chip-seq': 'chipseq',
    }
    # Lower-case once, then look the strategy up; unknown names share one tag.
    return known_tags.get(s.lower(), 'unknownseq')
# Main script body: for each result type in RESULT_LIST, fetch the matching
# ENA records as XML with wget, drop the lines starting with 'Entry:', and
# save the rest as ena_<strategy>_<result>.xml in the current directory.
import subprocess  # only needed by this section, so imported here

fname_lst = []   # files that were downloaded and cleaned successfully
failed_lst = []  # result types whose download failed
for result in RESULT_LIST:
    # make a RESTful download link, see http://www.ebi.ac.uk/ena/browse/search-rest
    # NOTE(review): confirm this warehouse endpoint is still served by ENA.
    url = 'http://www.ebi.ac.uk/ena/data/warehouse/search?query=%%22tax_eq(%s)%%20AND%%20library_strategy=%%22%s%%22%%20AND%%20read_count%%3E=%s%%22&result=%s&display=xml' % (TAXID, LIBRARY_STRATEGY, MIN_READ_COUNT, result)
    print('wget "%s" -O result.xml' % url)
    try:
        # Argument list (no shell), so the URL needs no quoting or escaping.
        status = subprocess.call(['wget', url, '-O', 'result.xml'])
    except OSError as err:
        # wget is not installed or cannot be executed.
        print('Cannot run wget: %s' % err)
        status = 1

    if status != 0:
        # A failed download may leave an empty or partial result.xml behind;
        # remove it instead of renaming it into a result file.
        print('Download failed for %s (wget exit status %s).' % (result, status))
        if os.path.exists('result.xml'):
            os.remove('result.xml')
        failed_lst.append(result)
        continue

    fname_components = ['ena', convert_name(LIBRARY_STRATEGY), result]
    fname = '_'.join(fname_components)
    fname = fname.replace(' ', '_') + '.xml'

    # Remove lines with 'Entry:' while copying into the final file name.
    # Bytes mode keeps every other line exactly as downloaded, and no backup
    # file is created, so there is nothing to clean up afterwards.
    with open('result.xml', 'rb') as src, open(fname, 'wb') as dst:
        for line in src:
            if not line.startswith(b'Entry:'):
                dst.write(line)
    os.remove('result.xml')
    fname_lst.append(fname)

if fname_lst:
    print('Done. Check %s. Move them to Data/information.' % (' '.join(fname_lst)))
if failed_lst:
    # Report failures and exit non-zero so calling scripts can detect them.
    print('Failed to download: %s.' % (' '.join(failed_lst)))
    sys.exit(1)
|