diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
commit | 97fdefab064f63642fa3ece05b807d29b459df31 (patch) | |
tree | a058530023224f3e35b1783996f3530c80c04bc5 /Code/rnaseq_or_chipseq.py |
brain: add python and R code to local repository.
Diffstat (limited to 'Code/rnaseq_or_chipseq.py')
-rw-r--r-- | Code/rnaseq_or_chipseq.py | 92 |
1 files changed, 92 insertions, 0 deletions
diff --git a/Code/rnaseq_or_chipseq.py b/Code/rnaseq_or_chipseq.py new file mode 100644 index 0000000..7d955eb --- /dev/null +++ b/Code/rnaseq_or_chipseq.py @@ -0,0 +1,92 @@ +# Usage: python rnaseq_or_chipseq.py all.short.reads.txt > table.txt +# +# Purpose: check if a RUN id in all.short.reads.txt is RNA-seq or ChIP-seq or other. Adpated from parse_ena_xml.py. +# +# Note: to avoid encoding error while writing to output.txt, first type this command: export PYTHONIOENCODING=UTF-8. +# +# 13 Apr 2017, slcu, hui + +import os, json, re, operator, sys +import xml.etree.ElementTree + +MAX_DESCRIPTION_LENGTH = 600 # max number to characters to keep in json file + +def parse_run(fname): + d = {} + + root = xml.etree.ElementTree.parse(fname).getroot() + + for c in root.findall('RUN'): + acc = c.get('accession') + d[acc] = {} + + alias = c.get('alias') + d[acc]['alias'] = alias + + experiment = c.find('EXPERIMENT_REF').get('accession') + d[acc]['experiment_id'] = experiment + + title = c.find('TITLE').text + d[acc]['title'] = title + + d[acc]['study_id'] = '.' + for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'): + s = i.text + #print(s) + if 'RP' in s: # run project + d[acc]['study_id'] = s + break + d[acc]['sample_id'] = '.' + for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'): + s = i.text + if 'RS' in s: # run project + d[acc]['sample_id'] = s + break + + return d + +def get_key(s): + if '_' in s: + return s[:s.find('_')] + else: + return s[:s.find('.')] + +def make_downloaded_dict(fname): + f = open(fname) + d = {} + for line in f: + line = line.strip() + if line != '' and not line.startswith('#'): + if line.startswith('-'): + fn = line.split(' ')[-1] + else: + fn = os.path.basename(line) + k = get_key(fn) + if not k in d: + d[k] = [fn] + else: + d[k].append(fn) + d[k] = sorted(list(set(d[k]))) + return d + +## main + +cmd = 'export PYTHONIOENCODING=UTF-8' # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection +os.system(cmd) + +d = make_downloaded_dict(sys.argv[1]) + +# ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, EXPERIMENT. So each +# of the following function is for each type of xml file. +d_rnaseq_run = parse_run('../Data/information/ena_rnaseq_read_run.xml') # RUN +d_chipseq_run = parse_run('../Data/information/ena_ChIP-Seq_read_run.xml') # RUN + +for k in sorted(d.keys()): + s = k + '\t' + ' '.join(d[k]) + if k in d_rnaseq_run: + s += '\t' + 'RNA-seq' + elif k in d_chipseq_run: + s += '\t' + 'ChIP-seq' + else: + s += '\t' + '.' + print(s) |