diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
commit | 97fdefab064f63642fa3ece05b807d29b459df31 (patch) | |
tree | a058530023224f3e35b1783996f3530c80c04bc5 /Code/download_ena_metadata.py |
brain: add python and R code to local repository.
Diffstat (limited to 'Code/download_ena_metadata.py')
-rw-r--r-- | Code/download_ena_metadata.py | 43 |
1 files changed, 43 insertions, 0 deletions
# Usage: python download_ena_metadata.py
# Modify LIBRARY_STRATEGY, MIN_READ_COUNT in this file, as filters.
#
# Purpose: download read description from ENA website, in the form of xml,
# to be parsed by parse_ena_xml.py.
#
# 22 Feb 2017, slcu, hui
# 12 Apr 2017, slcu, hui

import os
import subprocess
import sys

try:
    from urllib.parse import quote  # Python 3
except ImportError:
    from urllib import quote  # Python 2

TAXID = '3702'  # organism Tax ID for arabidopsis. Change it for other organisms.
LIBRARY_STRATEGY = 'RNA-Seq'  # can be ChIP-Seq, or others, see http://www.ebi.ac.uk/ena/submit/reads-library-strategy
MIN_READ_COUNT = 1000000  # only download for samples having at least this many reads
RESULT_LIST = ['read_run', 'read_study', 'read_experiment']  # don't modify. See http://www.ebi.ac.uk/ena/data/warehouse/usage

SEARCH_URL = 'http://www.ebi.ac.uk/ena/data/warehouse/search'  # RESTful search endpoint, see http://www.ebi.ac.uk/ena/browse/search-rest
TMP_FILE = 'result.xml'  # scratch file that wget writes into before filtering


def convert_name(s):
    """Return the short tag used in output file names for library strategy *s*.

    The comparison is case-insensitive.  'RNA-Seq' maps to 'rnaseq',
    'ChIP-Seq' maps to 'chipseq', and anything else maps to 'unknownseq'.
    """
    lowered = s.lower()
    if lowered == 'rna-seq':
        return 'rnaseq'
    if lowered == 'chip-seq':
        return 'chipseq'
    return 'unknownseq'


def build_search_url(taxid, library_strategy, min_read_count, result):
    """Return the ENA warehouse search URL for one result type.

    The query is percent-encoded with quote() so that values containing
    spaces or other reserved characters still give a valid URL.  The
    characters '(', ')' and '=' are left unescaped, as in the hand-written
    URL this replaces.
    """
    query = '"tax_eq(%s) AND library_strategy="%s" AND read_count>=%s"' % (
        taxid, library_strategy, min_read_count)
    return '%s?query=%s&result=%s&display=xml' % (
        SEARCH_URL, quote(query, safe='()='), result)


def make_output_name(library_strategy, result):
    """Return the final file name, e.g. 'ena_rnaseq_read_run.xml'."""
    stem = '_'.join(['ena', convert_name(library_strategy), result])
    return stem.replace(' ', '_') + '.xml'


def strip_entry_lines(src, dst):
    """Copy *src* to *dst*, dropping every line that starts with 'Entry:'.

    Works on bytes so the XML is passed through unchanged regardless of its
    encoding, and streams line by line so large downloads are not held in
    memory.  No backup file is created.
    """
    with open(src, 'rb') as fin, open(dst, 'wb') as fout:
        for line in fin:
            if not line.startswith(b'Entry:'):
                fout.write(line)


def download(url, dest):
    """Fetch *url* into *dest* with wget.  Return True on success.

    The URL is passed as a separate argument (no shell), so it needs no
    shell quoting.  A missing wget binary is reported rather than raised.
    """
    print('wget "%s" -O %s' % (url, dest))
    try:
        return subprocess.call(['wget', url, '-O', dest]) == 0
    except OSError as err:
        sys.stderr.write('Cannot run wget: %s\n' % err)
        return False


def main():
    """Download and clean one XML file per entry in RESULT_LIST.

    Returns 0 when every download succeeded and 1 otherwise, so that a
    failed download is no longer reported as success.
    """
    fname_lst = []
    failed = []
    for result in RESULT_LIST:
        url = build_search_url(TAXID, LIBRARY_STRATEGY, MIN_READ_COUNT, result)
        fname = make_output_name(LIBRARY_STRATEGY, result)
        if download(url, TMP_FILE):
            # remove lines with 'Entry:' while writing the final file
            strip_entry_lines(TMP_FILE, fname)
            fname_lst.append(fname)
        else:
            failed.append(result)
        # wget -O creates the file even when the download fails; clean it up
        if os.path.exists(TMP_FILE):
            os.remove(TMP_FILE)

    if fname_lst:
        print('Done. Check %s. Move them to Data/information.' % (' '.join(fname_lst)))
    if failed:
        sys.stderr.write('Download failed for: %s\n' % ', '.join(failed))
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main())