Code/make_parameter_rnaseq.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178

# Usage: python make_parameter_rnaseq.py [id-list.txt] > parameter_for_buildRmatrix.txt
#        Edit QUANT_PATH, set NON_ZERO_RATIO.
#
# Purpose: automatically generate parameter_for_buildRmatrix.txt
#
# Update: 26 Feb 2017, slcu, hui
# Update: 19 Sep 2019, slcu, hui [add  read_ena_data_info_json]

import sys, os, glob, json
import fnmatch, re
from datetime import datetime
from configure import RNA_SEQ_INFO_FILE, UPDATE_NETWORK_LOG_FILE

NON_ZERO_RATIO = 0.2 # omit *_quant.txt files with too many zeros.
QUANT_PATH = ['../Data/R/Mapped/public', '../Data/R/Mapped/inhouse', '../Data/R/Mapped/other']  # places where all _quant.txt reside. _quant.txt in sub-directories will also be used.

def get_quant_file_list(fname):
    f = open(fname)
    result = []
    for line in f:
        line = line.strip()
        lst = line.split()
        result.append(lst[0])
    f.close()
    return result

def extract_id(s, src):
    if src == 'inhouse' or src == 'other':
        dirname = os.path.dirname(s)
        lst = dirname.split('/')
        parent_dir = lst[-1]
        libno = filter(str.isdigit, parent_dir) # extrac library number
        libno = libno.zfill(3)
        sample_no = os.path.basename(s) # extract sample number
        first_match = re.findall('_S\d+_', sample_no)[0]
        sample_no = filter(str.isdigit, first_match).zfill(2)
        return '0000' + libno + sample_no
    if src == 'sra':
        index = s.find('_quant')
        if index > 0:
            return s[:index]
        else:
            return 'NA'
    return 'NA'

def zfill2(s, n):
    return s + 'X' * (n-len(s))
    
def make_id(success_flag, lab_id, myid): # should also work for non-SRA id
    if lab_id == 'SRR' or lab_id == 'ERR' or lab_id == 'DRR':
        result = 'R' + success_flag + lab_id + zfill2(myid, 9)
    else: # inhouse or other
        result = 'R' + success_flag + lab_id + myid
    return result
    
def glob_files_include_path(directory, pattern):
    ''' return all file names (with paths) given directory and pattern '''
    result = []
    for root, dirnames, filenames in os.walk(directory):
        for filename in fnmatch.filter(filenames, pattern):
            result.append(os.path.join(root, filename))
    return result

def glob_files_include_paths(directory_lst, pattern):
    ''' return all file names (with paths) given directory and pattern '''
    result = []
    for directory in directory_lst:
        result.extend(glob_files_include_path(os.path.abspath(directory), pattern))
    return result

def non_zero_ratio(fname):
    non_zero_count = 0
    total_count = 0
    f = open(fname)
    lines = f.readlines()
    f.close()
    for line in lines[1:]:
        line = line.strip()
        lst = line.split()
        if len(lst) < 4: # this should not occur. Report error if occurred.
            return -1
        tpm = lst[3]
        if not tpm == '0' and not 'nan' in tpm:
            non_zero_count += 1
        total_count += 1
    return 1.0 * non_zero_count / total_count

def get_rna_word_count(s):
    ''' If s looks like description of an RNA-seq experiment, return 1 otherwise return 0. '''
    count = 0
    s = s.lower()
    if 'rna-seq' in s or 'rnaseq' in s or 'transcriptome' in s or 'transcript' in s or 'mrna' in s:
        count = 1
    return count

def read_ena_data_info_json(fname):
    d = {}
    with open(fname) as json_data:
        json_dict = json.load(json_data)
        for run_id in json_dict:
           d[run_id] = 1
    return d


def write_log_file(s, fname):
    f = open(fname, 'a')
    curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
    s = '[' + curr_time + ']: ' + s
    if not '\n' in s:
        s += '\n'
    f.write(s)
    f.close()

### main
if not os.path.exists(RNA_SEQ_INFO_FILE):
    print('make_parameter_rnaseq.py: you must provide %s. See parse_ena_xml.py on how to make it.' % (RNA_SEQ_INFO_FILE))
    sys.exit()
    
rna_data_info_dict = read_ena_data_info_json(RNA_SEQ_INFO_FILE)


print('%%GENE_LIST=../Data/information/gene-list.txt\n%%HOLDON=NO\n') # head

if len(sys.argv) > 1: # only get these samples specified in id-list.txt
    quant_files = get_quant_file_list(sys.argv[1])
else:
    quant_files = glob_files_include_paths(QUANT_PATH, '*_quant.txt')


include_count = 0
total = 0
already_added_dict = {}
for fn in sorted(quant_files):
    total += 1

    nzr = non_zero_ratio(fn)
    if nzr > NON_ZERO_RATIO: # files with too many zeros are ignored
        fn2 = os.path.basename(fn)
        data_name = 'None'
        if 'inhouse' in fn:
            myid = extract_id(fn, 'inhouse')
            myid2 = make_id('0','001', myid) # lab id is 001, phil's lab
            data_name = 'LAB001' + '_LIB' + myid[-5:]
            desc = 'in-house'
            include_me = True
        elif 'public' in fn:
            myid = extract_id(fn2, 'sra')                
            myid2 = make_id('0',myid[0:3], myid[3:])
            data_name = myid
            desc = 'SRA'
            include_me = True if myid in rna_data_info_dict and rna_data_info_dict[myid] >= 0 else False # IMPORTANT
        elif 'other' in fn:
            myid = extract_id(fn, 'other')
            # get lab id
            dirname = os.path.dirname(fn)
            lst = dirname.split('/')
            lab_id = lst[-2]
            myid2 = make_id('0', lab_id, myid) # lab id is 001, phil's lab
            data_name = 'LAB' + lab_id + '_LIB' + myid[-5:]
            desc = 'Other lab'
            include_me = True
        if include_me and not myid2 in already_added_dict:
            print('@%s' % (myid2))
            print('DATA_NAME:%s' % (data_name))
            print('DATA_FORMAT:%s'% ('txt'))
            print('DESCRIPTION:%s' % (desc))
            print('LOCATION:%s' % (fn))
            print('NOTE: non zero ratio is %4.2f' % (nzr))
            print('')
            include_count += 1
            already_added_dict[myid2] = 'yes'
    elif nzr < 0:
        write_log_file('[make_parameter_rnaseq.py] Warning: incomplete line in file %s' % (fn), UPDATE_NETWORK_LOG_FILE)        
    else:
        #print('%s has too many zeroes. ignore.' % (fn))
        pass

print('#Done.  Processed %d files.  Included %d files.' % (total, include_count))