1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
|
# Usage: python make_parameter_rnaseq.py [id-list.txt] > parameter_for_buildRmatrix.txt
# Edit QUANT_PATH, set NON_ZERO_RATIO.
#
# Purpose: automatically generate parameter_for_buildRmatrix.txt
#
# Update: 26 Feb 2017, slcu, hui
# Update: 19 Sep 2019, slcu, hui [add read_ena_data_info_json]
import sys, os, glob, json
import fnmatch, re
from datetime import datetime
from configure import RNA_SEQ_INFO_FILE, UPDATE_NETWORK_LOG_FILE
NON_ZERO_RATIO = 0.2 # omit *_quant.txt files with too many zeros.
QUANT_PATH = ['../Data/R/Mapped/public', '../Data/R/Mapped/inhouse', '../Data/R/Mapped/other'] # places where all _quant.txt reside. _quant.txt in sub-directories will also be used.
def get_quant_file_list(fname):
f = open(fname)
result = []
for line in f:
line = line.strip()
lst = line.split()
result.append(lst[0])
f.close()
return result
def extract_id(s, src):
if src == 'inhouse' or src == 'other':
dirname = os.path.dirname(s)
lst = dirname.split('/')
parent_dir = lst[-1]
libno = filter(str.isdigit, parent_dir) # extrac library number
libno = libno.zfill(3)
sample_no = os.path.basename(s) # extract sample number
first_match = re.findall('_S\d+_', sample_no)[0]
sample_no = filter(str.isdigit, first_match).zfill(2)
return '0000' + libno + sample_no
if src == 'sra':
index = s.find('_quant')
if index > 0:
return s[:index]
else:
return 'NA'
return 'NA'
def zfill2(s, n):
return s + 'X' * (n-len(s))
def make_id(success_flag, lab_id, myid): # should also work for non-SRA id
if lab_id == 'SRR' or lab_id == 'ERR' or lab_id == 'DRR':
result = 'R' + success_flag + lab_id + zfill2(myid, 9)
else: # inhouse or other
result = 'R' + success_flag + lab_id + myid
return result
def glob_files_include_path(directory, pattern):
''' return all file names (with paths) given directory and pattern '''
result = []
for root, dirnames, filenames in os.walk(directory):
for filename in fnmatch.filter(filenames, pattern):
result.append(os.path.join(root, filename))
return result
def glob_files_include_paths(directory_lst, pattern):
''' return all file names (with paths) given directory and pattern '''
result = []
for directory in directory_lst:
result.extend(glob_files_include_path(os.path.abspath(directory), pattern))
return result
def non_zero_ratio(fname):
non_zero_count = 0
total_count = 0
f = open(fname)
lines = f.readlines()
f.close()
for line in lines[1:]:
line = line.strip()
lst = line.split()
if len(lst) < 4: # this should not occur. Report error if occurred.
return -1
tpm = lst[3]
if not tpm == '0' and not 'nan' in tpm:
non_zero_count += 1
total_count += 1
return 1.0 * non_zero_count / total_count
def get_rna_word_count(s):
''' If s looks like description of an RNA-seq experiment, return 1 otherwise return 0. '''
count = 0
s = s.lower()
if 'rna-seq' in s or 'rnaseq' in s or 'transcriptome' in s or 'transcript' in s or 'mrna' in s:
count = 1
return count
def read_ena_data_info_json(fname):
d = {}
with open(fname) as json_data:
json_dict = json.load(json_data)
for run_id in json_dict:
d[run_id] = 1
return d
def write_log_file(s, fname):
f = open(fname, 'a')
curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
s = '[' + curr_time + ']: ' + s
if not '\n' in s:
s += '\n'
f.write(s)
f.close()
### main
if not os.path.exists(RNA_SEQ_INFO_FILE):
print('make_parameter_rnaseq.py: you must provide %s. See parse_ena_xml.py on how to make it.' % (RNA_SEQ_INFO_FILE))
sys.exit()
rna_data_info_dict = read_ena_data_info_json(RNA_SEQ_INFO_FILE)
print('%%GENE_LIST=../Data/information/gene-list.txt\n%%HOLDON=NO\n') # head
if len(sys.argv) > 1: # only get these samples specified in id-list.txt
quant_files = get_quant_file_list(sys.argv[1])
else:
quant_files = glob_files_include_paths(QUANT_PATH, '*_quant.txt')
include_count = 0
total = 0
already_added_dict = {}
for fn in sorted(quant_files):
total += 1
nzr = non_zero_ratio(fn)
if nzr > NON_ZERO_RATIO: # files with too many zeros are ignored
fn2 = os.path.basename(fn)
data_name = 'None'
if 'inhouse' in fn:
myid = extract_id(fn, 'inhouse')
myid2 = make_id('0','001', myid) # lab id is 001, phil's lab
data_name = 'LAB001' + '_LIB' + myid[-5:]
desc = 'in-house'
include_me = True
elif 'public' in fn:
myid = extract_id(fn2, 'sra')
myid2 = make_id('0',myid[0:3], myid[3:])
data_name = myid
desc = 'SRA'
include_me = True if myid in rna_data_info_dict and rna_data_info_dict[myid] >= 0 else False # IMPORTANT
elif 'other' in fn:
myid = extract_id(fn, 'other')
# get lab id
dirname = os.path.dirname(fn)
lst = dirname.split('/')
lab_id = lst[-2]
myid2 = make_id('0', lab_id, myid) # lab id is 001, phil's lab
data_name = 'LAB' + lab_id + '_LIB' + myid[-5:]
desc = 'Other lab'
include_me = True
if include_me and not myid2 in already_added_dict:
print('@%s' % (myid2))
print('DATA_NAME:%s' % (data_name))
print('DATA_FORMAT:%s'% ('txt'))
print('DESCRIPTION:%s' % (desc))
print('LOCATION:%s' % (fn))
print('NOTE: non zero ratio is %4.2f' % (nzr))
print('')
include_count += 1
already_added_dict[myid2] = 'yes'
elif nzr < 0:
write_log_file('[make_parameter_rnaseq.py] Warning: incomplete line in file %s' % (fn), UPDATE_NETWORK_LOG_FILE)
else:
#print('%s has too many zeroes. ignore.' % (fn))
pass
print('#Done. Processed %d files. Included %d files.' % (total, include_count))
|