Code/rnaseq_or_chipseq.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92

# Usage: python rnaseq_or_chipseq.py all.short.reads.txt > table.txt
#
# Purpose: check if a RUN id in all.short.reads.txt is RNA-seq or ChIP-seq or other. Adpated from parse_ena_xml.py.
#
# Note: to avoid encoding error while writing to output.txt, first type this command: export PYTHONIOENCODING=UTF-8.
#
# 13 Apr 2017, slcu, hui

import os, json, re, operator, sys
import xml.etree.ElementTree

MAX_DESCRIPTION_LENGTH = 600 # max number to characters to keep in json file

def parse_run(fname):
    d = {}
    
    root = xml.etree.ElementTree.parse(fname).getroot()
    
    for c in root.findall('RUN'):
        acc = c.get('accession')
        d[acc] = {}
        
        alias = c.get('alias')
        d[acc]['alias'] = alias
        
        experiment = c.find('EXPERIMENT_REF').get('accession')
        d[acc]['experiment_id'] = experiment
        
        title = c.find('TITLE').text
        d[acc]['title'] = title
        
        d[acc]['study_id'] = '.'
        for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
            s = i.text
            #print(s)
            if 'RP' in s: # run project
                d[acc]['study_id'] = s
                break
        d[acc]['sample_id'] = '.'
        for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
            s = i.text
            if 'RS' in s: # run project
                d[acc]['sample_id'] = s
                break

    return d

def get_key(s):
    if '_' in s:
        return s[:s.find('_')]
    else:
        return s[:s.find('.')]
    
def make_downloaded_dict(fname):
    f = open(fname)
    d = {}
    for line in f:
        line = line.strip()
        if line != '' and not line.startswith('#'):
            if line.startswith('-'):
                fn = line.split(' ')[-1]
            else:
                fn = os.path.basename(line)                
            k = get_key(fn)
            if not k in d:
                d[k] = [fn]
            else:
                d[k].append(fn)
                d[k] = sorted(list(set(d[k])))
    return d

## main

cmd = 'export PYTHONIOENCODING=UTF-8'  # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection
os.system(cmd)

d = make_downloaded_dict(sys.argv[1])

# ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, EXPERIMENT.  So each
# of the following function is for each type of xml file.
d_rnaseq_run    = parse_run('../Data/information/ena_rnaseq_read_run.xml')   # RUN
d_chipseq_run   = parse_run('../Data/information/ena_ChIP-Seq_read_run.xml') # RUN

for k in sorted(d.keys()):
    s = k + '\t' + ' '.join(d[k])
    if k in d_rnaseq_run:
        s += '\t' + 'RNA-seq'
    elif k in d_chipseq_run:
        s += '\t' + 'ChIP-seq'
    else:
        s += '\t' + '.'
    print(s)