brain: add python and R code to local repository.

author: Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
committer: Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
commit: 97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree: a058530023224f3e35b1783996f3530c80c04bc5 /Code/download_ena_metadata.py
1 files changed, 43 insertions, 0 deletions
diff --git a/Code/download_ena_metadata.py b/Code/download_ena_metadata.py
new file mode 100644
index 0000000..c88bbfc
--- /dev/null
+++ b/Code/download_ena_metadata.py
@@ -0,0 +1,43 @@
+# Usage: python download_ena_metadata.py
+#        Edit TAXID, LIBRARY_STRATEGY and MIN_READ_COUNT in this file to change the filters.
+#
+# Purpose: download read descriptions from the ENA website as XML files, to be parsed by parse_ena_xml.py.
+#
+# 22 Feb 2017, slcu, hui
+# 12 Apr 2017, slcu, hui
+
+import os, sys
+
+TAXID            = '3702'    # organism Tax ID for Arabidopsis, used in tax_eq() in the query. Change it for other organisms.
+LIBRARY_STRATEGY = 'RNA-Seq' # can be ChIP-Seq, or others, see http://www.ebi.ac.uk/ena/submit/reads-library-strategy
+MIN_READ_COUNT   = 1000000   # only download for samples having at least this many reads (read_count>= in the query)
+RESULT_LIST      = ['read_run', 'read_study', 'read_experiment'] # ENA result types, one download each; don't modify. See http://www.ebi.ac.uk/ena/data/warehouse/usage
+
+
+def convert_name(s):  # map a library-strategy label to a short lowercase tag without the hyphen
+    if s.lower() == 'rna-seq':  # comparison ignores case, so 'RNA-Seq' matches
+        return 'rnaseq'
+    if s.lower() == 'chip-seq':
+        return 'chipseq'
+    return 'unknownseq'  # fallback tag for every other strategy
+
+
+fname_lst = []  # names of the XML files produced, reported in the final message
+for result in RESULT_LIST:  # one download per ENA result type
+    cmd = 'wget \"http://www.ebi.ac.uk/ena/data/warehouse/search?query=%%22tax_eq(%s)%%20AND%%20library_strategy=%%22%s%%22%%20AND%%20read_count%%3E=%s%%22&result=%s&display=xml\" -O result.xml' % (TAXID, LIBRARY_STRATEGY, MIN_READ_COUNT, result) # build the wget call for the RESTful search link ('%%' yields a literal '%' for the URL escapes); output always goes to result.xml. See http://www.ebi.ac.uk/ena/browse/search-rest
+    print(cmd)  # echo the command before running it
+    os.system(cmd)    # NOTE(review): wget's exit status is ignored, so a failed download is still processed below
+    fname_components = ['ena', convert_name(LIBRARY_STRATEGY), result]  # e.g. ena_rnaseq_read_run
+    fname = '_'.join(fname_components)
+    fname = fname.replace(' ', '_') + '.xml'  # replace any spaces with underscores, then add the extension
+    # delete lines starting with 'Entry:' in place; sed -i.bak leaves a backup named result.xml.bak
+    cmd = 'sed -i.bak \'/^Entry:/d\' result.xml'
+    os.system(cmd)
+    # rename result.xml to its final name so the next iteration can reuse result.xml
+    cmd = 'mv result.xml %s' % (fname)
+    os.system(cmd)
+    fname_lst.append(fname)
+
+cmd = 'rm -f *.xml.bak'  # remove the backup written by sed -i.bak (and any other *.xml.bak in the working directory)
+os.system(cmd)
+print('Done. Check %s. Move them to Data/information.' % (' '.join(fname_lst)))  # list the generated files for the user
author	Hui Lan <lanhui@zjnu.edu.cn>	2019-12-04 19:03:19 +0800
committer	Hui Lan <lanhui@zjnu.edu.cn>	2019-12-04 19:03:19 +0800
commit	97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree	a058530023224f3e35b1783996f3530c80c04bc5 /Code/download_ena_metadata.py