brain: add python and R code to local repository.

author: Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
committer: Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
commit: 97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree: a058530023224f3e35b1783996f3530c80c04bc5 /Code/rnaseq_or_chipseq.py
1 files changed, 92 insertions, 0 deletions
diff --git a/Code/rnaseq_or_chipseq.py b/Code/rnaseq_or_chipseq.py
new file mode 100644
index 0000000..7d955eb
--- /dev/null
+++ b/Code/rnaseq_or_chipseq.py
@@ -0,0 +1,92 @@
+# Usage: python rnaseq_or_chipseq.py all.short.reads.txt > table.txt
+#
+# Purpose: check if a RUN id in all.short.reads.txt is RNA-seq or ChIP-seq or other. Adpated from parse_ena_xml.py.
+#
+# Note: to avoid encoding error while writing to output.txt, first type this command: export PYTHONIOENCODING=UTF-8.
+#
+# 13 Apr 2017, slcu, hui
+
+import os, json, re, operator, sys
+import xml.etree.ElementTree
+
+MAX_DESCRIPTION_LENGTH = 600 # max number to characters to keep in json file
+
+def parse_run(fname):
+    d = {}
+    
+    root = xml.etree.ElementTree.parse(fname).getroot()
+    
+    for c in root.findall('RUN'):
+        acc = c.get('accession')
+        d[acc] = {}
+        
+        alias = c.get('alias')
+        d[acc]['alias'] = alias
+        
+        experiment = c.find('EXPERIMENT_REF').get('accession')
+        d[acc]['experiment_id'] = experiment
+        
+        title = c.find('TITLE').text
+        d[acc]['title'] = title
+        
+        d[acc]['study_id'] = '.'
+        for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+            s = i.text
+            #print(s)
+            if 'RP' in s: # run project
+                d[acc]['study_id'] = s
+                break
+        d[acc]['sample_id'] = '.'
+        for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+            s = i.text
+            if 'RS' in s: # run project
+                d[acc]['sample_id'] = s
+                break
+
+    return d
+
+def get_key(s):
+    if '_' in s:
+        return s[:s.find('_')]
+    else:
+        return s[:s.find('.')]
+    
+def make_downloaded_dict(fname):
+    f = open(fname)
+    d = {}
+    for line in f:
+        line = line.strip()
+        if line != '' and not line.startswith('#'):
+            if line.startswith('-'):
+                fn = line.split(' ')[-1]
+            else:
+                fn = os.path.basename(line)                
+            k = get_key(fn)
+            if not k in d:
+                d[k] = [fn]
+            else:
+                d[k].append(fn)
+                d[k] = sorted(list(set(d[k])))
+    return d
+
+## main
+
+cmd = 'export PYTHONIOENCODING=UTF-8'  # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection
+os.system(cmd)
+
+d = make_downloaded_dict(sys.argv[1])
+
+# ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, EXPERIMENT.  So each
+# of the following function is for each type of xml file.
+d_rnaseq_run    = parse_run('../Data/information/ena_rnaseq_read_run.xml')   # RUN
+d_chipseq_run   = parse_run('../Data/information/ena_ChIP-Seq_read_run.xml') # RUN
+
+for k in sorted(d.keys()):
+    s = k + '\t' + ' '.join(d[k])
+    if k in d_rnaseq_run:
+        s += '\t' + 'RNA-seq'
+    elif k in d_chipseq_run:
+        s += '\t' + 'ChIP-seq'
+    else:
+        s += '\t' + '.'
+    print(s)
author	Hui Lan <lanhui@zjnu.edu.cn>	2019-12-04 19:03:19 +0800
committer	Hui Lan <lanhui@zjnu.edu.cn>	2019-12-04 19:03:19 +0800
commit	97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree	a058530023224f3e35b1783996f3530c80c04bc5 /Code/rnaseq_or_chipseq.py