summary | refs | log | tree | commit | diff
path: root/Code/download_ena_metadata.py
diff options
context:
space:
mode:
author Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
committer Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
commit 97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree a058530023224f3e35b1783996f3530c80c04bc5 /Code/download_ena_metadata.py
brain: add python and R code to local repository.
Diffstat (limited to 'Code/download_ena_metadata.py')
-rw-r--r-- Code/download_ena_metadata.py 43
1 files changed, 43 insertions, 0 deletions
diff --git a/Code/download_ena_metadata.py b/Code/download_ena_metadata.py
new file mode 100644
index 0000000..c88bbfc
--- /dev/null
+++ b/Code/download_ena_metadata.py
@@ -0,0 +1,43 @@
+# Usage: python download_ena_metadata.py
+# Modify LIBRARY_STRATEGY, MIN_READ_COUNT in this file, as filters.
+#
+# Purpose: download read descriptions from the ENA website, in XML form, to be parsed by parse_ena_xml.py.
+#
+# 22 Feb 2017, slcu, hui
+# 12 Apr 2017, slcu, hui
+
+import os, sys
+
+TAXID = '3702' # organism Tax ID for Arabidopsis. Change it for other organisms.
+LIBRARY_STRATEGY = 'RNA-Seq' # can be ChIP-Seq, or others, see http://www.ebi.ac.uk/ena/submit/reads-library-strategy
+MIN_READ_COUNT = 1000000 # only download for samples having at least this many reads
+RESULT_LIST = ['read_run', 'read_study', 'read_experiment'] # don't modify. See http://www.ebi.ac.uk/ena/data/warehouse/usage
+
+
+def convert_name(s):
+ if s.lower() == 'rna-seq':
+ return 'rnaseq'
+ if s.lower() == 'chip-seq':
+ return 'chipseq'
+ return 'unknownseq'
+
+
+fname_lst = []
+for result in RESULT_LIST:
+ cmd = 'wget \"http://www.ebi.ac.uk/ena/data/warehouse/search?query=%%22tax_eq(%s)%%20AND%%20library_strategy=%%22%s%%22%%20AND%%20read_count%%3E=%s%%22&result=%s&display=xml\" -O result.xml' % (TAXID, LIBRARY_STRATEGY, MIN_READ_COUNT, result) # make a RESTful download link, see http://www.ebi.ac.uk/ena/browse/search-rest
+ print(cmd)
+ os.system(cmd)
+ fname_components = ['ena', convert_name(LIBRARY_STRATEGY), result]
+ fname = '_'.join(fname_components)
+ fname = fname.replace(' ', '_') + '.xml'
+ # remove lines with 'Entry:'
+ cmd = 'sed -i.bak \'/^Entry:/d\' result.xml'
+ os.system(cmd)
+ # rename file
+ cmd = 'mv result.xml %s' % (fname)
+ os.system(cmd)
+ fname_lst.append(fname)
+
+cmd = 'rm -f *.xml.bak'
+os.system(cmd)
+print('Done. Check %s. Move them to Data/information.' % (' '.join(fname_lst)))