diff options
| author | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 | 
|---|---|---|
| committer | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 | 
| commit | 97fdefab064f63642fa3ece05b807d29b459df31 (patch) | |
| tree | a058530023224f3e35b1783996f3530c80c04bc5 /Code/rnaseq_or_chipseq.py | |
brain: add python and R code to local repository.
Diffstat (limited to 'Code/rnaseq_or_chipseq.py')
| -rw-r--r-- | Code/rnaseq_or_chipseq.py | 92 | 
1 files changed, 92 insertions, 0 deletions
| diff --git a/Code/rnaseq_or_chipseq.py b/Code/rnaseq_or_chipseq.py new file mode 100644 index 0000000..7d955eb --- /dev/null +++ b/Code/rnaseq_or_chipseq.py @@ -0,0 +1,92 @@ +# Usage: python rnaseq_or_chipseq.py all.short.reads.txt > table.txt +# +# Purpose: check if a RUN id in all.short.reads.txt is RNA-seq or ChIP-seq or other. Adpated from parse_ena_xml.py. +# +# Note: to avoid encoding error while writing to output.txt, first type this command: export PYTHONIOENCODING=UTF-8. +# +# 13 Apr 2017, slcu, hui + +import os, json, re, operator, sys +import xml.etree.ElementTree + +MAX_DESCRIPTION_LENGTH = 600 # max number to characters to keep in json file + +def parse_run(fname): +    d = {} +     +    root = xml.etree.ElementTree.parse(fname).getroot() +     +    for c in root.findall('RUN'): +        acc = c.get('accession') +        d[acc] = {} +         +        alias = c.get('alias') +        d[acc]['alias'] = alias +         +        experiment = c.find('EXPERIMENT_REF').get('accession') +        d[acc]['experiment_id'] = experiment +         +        title = c.find('TITLE').text +        d[acc]['title'] = title +         +        d[acc]['study_id'] = '.' +        for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'): +            s = i.text +            #print(s) +            if 'RP' in s: # run project +                d[acc]['study_id'] = s +                break +        d[acc]['sample_id'] = '.' +        for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'): +            s = i.text +            if 'RS' in s: # run project +                d[acc]['sample_id'] = s +                break + +    return d + +def get_key(s): +    if '_' in s: +        return s[:s.find('_')] +    else: +        return s[:s.find('.')] +     +def make_downloaded_dict(fname): +    f = open(fname) +    d = {} +    for line in f: +        line = line.strip() +        if line != '' and not line.startswith('#'): +            if line.startswith('-'): +                fn = line.split(' ')[-1] +            else: +                fn = os.path.basename(line)                 +            k = get_key(fn) +            if not k in d: +                d[k] = [fn] +            else: +                d[k].append(fn) +                d[k] = sorted(list(set(d[k]))) +    return d + +## main + +cmd = 'export PYTHONIOENCODING=UTF-8'  # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection +os.system(cmd) + +d = make_downloaded_dict(sys.argv[1]) + +# ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, EXPERIMENT.  So each +# of the following function is for each type of xml file. +d_rnaseq_run    = parse_run('../Data/information/ena_rnaseq_read_run.xml')   # RUN +d_chipseq_run   = parse_run('../Data/information/ena_ChIP-Seq_read_run.xml') # RUN + +for k in sorted(d.keys()): +    s = k + '\t' + ' '.join(d[k]) +    if k in d_rnaseq_run: +        s += '\t' + 'RNA-seq' +    elif k in d_chipseq_run: +        s += '\t' + 'ChIP-seq' +    else: +        s += '\t' + '.' +    print(s) | 
