summaryrefslogtreecommitdiff
path: root/Code/rnaseq_or_chipseq.py
diff options
context:
space:
mode:
authorHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
committerHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
commit97fdefab064f63642fa3ece05b807d29b459df31 (patch)
treea058530023224f3e35b1783996f3530c80c04bc5 /Code/rnaseq_or_chipseq.py
brain: add python and R code to local repository.
Diffstat (limited to 'Code/rnaseq_or_chipseq.py')
-rw-r--r--Code/rnaseq_or_chipseq.py92
1 files changed, 92 insertions, 0 deletions
diff --git a/Code/rnaseq_or_chipseq.py b/Code/rnaseq_or_chipseq.py
new file mode 100644
index 0000000..7d955eb
--- /dev/null
+++ b/Code/rnaseq_or_chipseq.py
@@ -0,0 +1,92 @@
+# Usage: python rnaseq_or_chipseq.py all.short.reads.txt > table.txt
+#
+# Purpose: check if a RUN id in all.short.reads.txt is RNA-seq or ChIP-seq or other. Adpated from parse_ena_xml.py.
+#
+# Note: to avoid encoding error while writing to output.txt, first type this command: export PYTHONIOENCODING=UTF-8.
+#
+# 13 Apr 2017, slcu, hui
+
+import os, json, re, operator, sys
+import xml.etree.ElementTree
+
+MAX_DESCRIPTION_LENGTH = 600 # max number to characters to keep in json file
+
+def parse_run(fname):
+ d = {}
+
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+ for c in root.findall('RUN'):
+ acc = c.get('accession')
+ d[acc] = {}
+
+ alias = c.get('alias')
+ d[acc]['alias'] = alias
+
+ experiment = c.find('EXPERIMENT_REF').get('accession')
+ d[acc]['experiment_id'] = experiment
+
+ title = c.find('TITLE').text
+ d[acc]['title'] = title
+
+ d[acc]['study_id'] = '.'
+ for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+ s = i.text
+ #print(s)
+ if 'RP' in s: # run project
+ d[acc]['study_id'] = s
+ break
+ d[acc]['sample_id'] = '.'
+ for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+ s = i.text
+ if 'RS' in s: # run project
+ d[acc]['sample_id'] = s
+ break
+
+ return d
+
+def get_key(s):
+ if '_' in s:
+ return s[:s.find('_')]
+ else:
+ return s[:s.find('.')]
+
+def make_downloaded_dict(fname):
+ f = open(fname)
+ d = {}
+ for line in f:
+ line = line.strip()
+ if line != '' and not line.startswith('#'):
+ if line.startswith('-'):
+ fn = line.split(' ')[-1]
+ else:
+ fn = os.path.basename(line)
+ k = get_key(fn)
+ if not k in d:
+ d[k] = [fn]
+ else:
+ d[k].append(fn)
+ d[k] = sorted(list(set(d[k])))
+ return d
+
+## main
+
+cmd = 'export PYTHONIOENCODING=UTF-8' # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection
+os.system(cmd)
+
+d = make_downloaded_dict(sys.argv[1])
+
+# ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, EXPERIMENT. So each
+# of the following function is for each type of xml file.
+d_rnaseq_run = parse_run('../Data/information/ena_rnaseq_read_run.xml') # RUN
+d_chipseq_run = parse_run('../Data/information/ena_ChIP-Seq_read_run.xml') # RUN
+
+for k in sorted(d.keys()):
+ s = k + '\t' + ' '.join(d[k])
+ if k in d_rnaseq_run:
+ s += '\t' + 'RNA-seq'
+ elif k in d_chipseq_run:
+ s += '\t' + 'ChIP-seq'
+ else:
+ s += '\t' + '.'
+ print(s)