# Usage: python download_ena_metadata.py
#        Modify LIBRARY_STRATEGY, MIN_READ_COUNT in this file, as filters.
#
# Purpose: download read description from ENA website, in the form of xml,
#          to be parsed by parse_ena_xml.py.
#
# 22 Feb 2017, slcu, hui
# 12 Apr 2017, slcu, hui

import os
import subprocess
import sys

TAXID = '3702'  # organism Tax ID for Arabidopsis. Change it for other organisms.
LIBRARY_STRATEGY = 'RNA-Seq'  # can be ChIP-Seq, or others, see http://www.ebi.ac.uk/ena/submit/reads-library-strategy
MIN_READ_COUNT = 1000000  # only download for samples having at least this many reads
RESULT_LIST = ['read_run', 'read_study', 'read_experiment']  # don't modify. See http://www.ebi.ac.uk/ena/data/warehouse/usage

# Scratch file every download is written to before it is cleaned and renamed.
TMP_FILE = 'result.xml'


def convert_name(s):
    """Return the short tag used in output file names for library strategy *s*.

    The comparison is case-insensitive. 'RNA-Seq' maps to 'rnaseq',
    'ChIP-Seq' maps to 'chipseq', and anything else maps to 'unknownseq'.
    """
    known = {'rna-seq': 'rnaseq', 'chip-seq': 'chipseq'}
    return known.get(s.lower(), 'unknownseq')


def build_query_url(result):
    """Return the ENA warehouse search URL for one entry of RESULT_LIST.

    The query is filtered by the module-level TAXID, LIBRARY_STRATEGY and
    MIN_READ_COUNT and asks for XML output.
    See http://www.ebi.ac.uk/ena/browse/search-rest
    """
    # NOTE(review): confirm this endpoint is still served by ENA before relying on it.
    # '%%' is a literal percent sign, so '%%22' becomes the URL-encoded quote '%22'.
    return ('http://www.ebi.ac.uk/ena/data/warehouse/search'
            '?query=%%22tax_eq(%s)%%20AND%%20library_strategy=%%22%s%%22'
            '%%20AND%%20read_count%%3E=%s%%22&result=%s&display=xml'
            % (TAXID, LIBRARY_STRATEGY, MIN_READ_COUNT, result))


def output_filename(result):
    """Return the final file name for *result*, e.g. 'ena_rnaseq_read_run.xml'."""
    stem = '_'.join(['ena', convert_name(LIBRARY_STRATEGY), result])
    return stem.replace(' ', '_') + '.xml'


def _download(url, dest):
    """Fetch *url* into *dest* with wget; return True only if wget exits with 0."""
    try:
        # An argument list (no shell) keeps characters in the URL from being
        # interpreted by a shell.
        status = subprocess.call(['wget', url, '-O', dest])
    except OSError as err:
        # Raised when the wget executable itself cannot be started.
        sys.stderr.write('Cannot run wget: %s\n' % err)
        return False
    return status == 0


def _strip_entry_lines(src, dst):
    """Copy *src* to *dst*, dropping every line that starts with 'Entry:'."""
    # Binary mode: the file is copied byte-for-byte apart from the dropped lines.
    with open(src, 'rb') as fin, open(dst, 'wb') as fout:
        for line in fin:
            if not line.startswith(b'Entry:'):
                fout.write(line)


def main():
    """Download one XML file per entry of RESULT_LIST into the current directory.

    A result whose download fails is skipped and reported on stderr; the
    script then exits with a non-zero status after processing the rest.
    """
    fname_lst = []
    failed = []
    for result in RESULT_LIST:
        url = build_query_url(result)
        print('wget "%s" -O %s' % (url, TMP_FILE))
        if not _download(url, TMP_FILE):
            sys.stderr.write('Download failed for %s; skipping.\n' % result)
            failed.append(result)
            # wget -O may leave an empty or partial file behind; do not keep it.
            if os.path.exists(TMP_FILE):
                os.remove(TMP_FILE)
            continue

        fname = output_filename(result)
        _strip_entry_lines(TMP_FILE, fname)
        os.remove(TMP_FILE)
        fname_lst.append(fname)

    if fname_lst:
        print('Done. Check %s. Move them to Data/information.' % (' '.join(fname_lst)))
    if failed:
        sys.exit('Failed to download: %s' % ' '.join(failed))


if __name__ == '__main__':
    main()