blob: 7d955ebde1830b2c979d2df2cd70a38d9a4e4219 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
# Usage: python rnaseq_or_chipseq.py all.short.reads.txt > table.txt
#
# Purpose: check if a RUN id in all.short.reads.txt is RNA-seq or ChIP-seq or other. Adpated from parse_ena_xml.py.
#
# Note: to avoid encoding error while writing to output.txt, first type this command: export PYTHONIOENCODING=UTF-8.
#
# 13 Apr 2017, slcu, hui
import os, json, re, operator, sys
import xml.etree.ElementTree
MAX_DESCRIPTION_LENGTH = 600 # max number to characters to keep in json file
def parse_run(fname):
d = {}
root = xml.etree.ElementTree.parse(fname).getroot()
for c in root.findall('RUN'):
acc = c.get('accession')
d[acc] = {}
alias = c.get('alias')
d[acc]['alias'] = alias
experiment = c.find('EXPERIMENT_REF').get('accession')
d[acc]['experiment_id'] = experiment
title = c.find('TITLE').text
d[acc]['title'] = title
d[acc]['study_id'] = '.'
for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
s = i.text
#print(s)
if 'RP' in s: # run project
d[acc]['study_id'] = s
break
d[acc]['sample_id'] = '.'
for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
s = i.text
if 'RS' in s: # run project
d[acc]['sample_id'] = s
break
return d
def get_key(s):
if '_' in s:
return s[:s.find('_')]
else:
return s[:s.find('.')]
def make_downloaded_dict(fname):
f = open(fname)
d = {}
for line in f:
line = line.strip()
if line != '' and not line.startswith('#'):
if line.startswith('-'):
fn = line.split(' ')[-1]
else:
fn = os.path.basename(line)
k = get_key(fn)
if not k in d:
d[k] = [fn]
else:
d[k].append(fn)
d[k] = sorted(list(set(d[k])))
return d
## main
cmd = 'export PYTHONIOENCODING=UTF-8' # since xml files contains non-ascii characters, use this command to avoid encoding error during redirection
os.system(cmd)
d = make_downloaded_dict(sys.argv[1])
# ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, EXPERIMENT. So each
# of the following function is for each type of xml file.
d_rnaseq_run = parse_run('../Data/information/ena_rnaseq_read_run.xml') # RUN
d_chipseq_run = parse_run('../Data/information/ena_ChIP-Seq_read_run.xml') # RUN
for k in sorted(d.keys()):
s = k + '\t' + ' '.join(d[k])
if k in d_rnaseq_run:
s += '\t' + 'RNA-seq'
elif k in d_chipseq_run:
s += '\t' + 'ChIP-seq'
else:
s += '\t' + '.'
print(s)
|