Code/merge_edges.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272

# Purpose: When edges.txt.* contains multiple lines representing the
#          same edge, merge them and keep only one edge.
#
# Usage: python merge_edges.py
#
#        This script is used to produce a single file edges.txt for
#        the brain web application.  It searches in EDGE_POOL_DIR for
#        edge files (with 10 columns) from many sources, most likely
#        having duplicated edges.  It removes duplication and computes
#        strength for each edge.
#
# Note: make sure fname is edges.txt.
#
# Rationale: to save place, I am no longer going to use a full list of
# RNA-seq experiment IDs in the fifth column.  Use a number, i.e., the
# number of RNA-seq IDs, instead.  If no IDs are available, this
# number is 1 (very conservative).  However, I am still going to keep
# a full list of ChIP-seq experiment IDs (the sixth column).
#
# Created on 3 August 2019 by Hui Lan <lanhui@zjnu.edu.cn>


import os, operator, sys, math, datetime, glob
from configure import EDGE_POOL_DIR, MERGED_EDGE_FILE, UPDATE_NETWORK_LOG_FILE
import sqlite3


def write_log_file(s, fname):
    if not os.path.exists(fname):
        return None
    f = open(fname, 'a')
    curr_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
    s = '[' + curr_time + ']: ' + s
    if not '\n' in s:
        s += '\n'
    f.write(s)
    f.close()


def get_number_of_RNAseq_ids(s):
    if s == '.':
        return 1
    if s.isdigit():
        return int(s)
    return len(s.split())


def add_dashes_to_date(s):
    return s[:4] + '-' + s[4:6] + '-' + s[6:]


def split_id_and_name(s):
    lst = s.split()
    result = lst[0]
    if lst[0] != lst[1]:
        result = lst[0] + ' (' + lst[1] + ')'
    return result


def make_html_list(s):
    if s.strip() == '':
        return 'None'
    result = '<ul>'
    for method in s.split(','):
        result += '<li>%s</li>' % (method)
    result += '</ul>'
    return result
        

def make_html_page(lst, fname):
    tf     = lst[1].split()[0] # ID only, no name
    target = lst[0].split()[0] 

    head = '<title>%s</title>\n' % ('Target is ' + lst[0] + ' and TF is ' + lst[1])
    head +=  '<link href="./c3.min.css" rel="stylesheet" />\n<script src="./d3.min.js"></script>\n<script src="./c3.min.js"></script>\n<script src="./scatterplot.js"></script>'
    s = '<html>'
    s += '<head>%s</head>\n' % (head)
    body =  '<p>TF: %s</p>\n' % (split_id_and_name(lst[1]))
    body += '<p>Target: %s</p>\n' % (split_id_and_name(lst[0]))
    body += '<p>Association strength: %s</p>\n' % (lst[8])
    body += '<p>Edge made on %s. </p>\n' % (add_dashes_to_date(lst[7]))
    body += '<p>Methods: %s</p>\n' % (make_html_list(lst[9]))
    body += '<p>Evidence of binding: %s.</p>\n' % (lst[5] if lst[5] != '.' else 'TBA')
    body += '<a id="myLink" href="javascript:void(0);" onclick="drawScatterPlot(\'json/%s.json\', \'json/%s.json\', \'rnaseq_info_database.json\', [\'.\']);">Gene expression scatter plot</a>\n' % (tf, target)
    body += '<p>For more detailed analysis, <a href="gene-expression-level-scatterplot-by-XuMengqi.zip">download</a> our gene expression scatter plotting tool.  No installation is required.  Input: <a href="json/%s.json">TF gene expression</a>  <a href="json/%s.json">Target gene expression</a>  <a href="rnaseq_info_database.json">RNA-seq annotation</a></p>\n' % (tf, target)    
    body += '<p id="chart"></p>\n'
    s += '<body>%s</body>\n' % (body)
    s += '</html>'
    f = open(fname, 'w')
    f.write(s)
    f.close()


def fill_database(lst, conn):
    ''' Store all edge information in a SQLite database, which can be retrieved in the Webapp.'''
    tf     = lst[1].split()[0] # ID only, no name
    tf_name = split_id_and_name(lst[1])    
    target = lst[0].split()[0]
    target_name = split_id_and_name(lst[0])
    strength = lst[8]
    edge_date = add_dashes_to_date(lst[7])
    method = make_html_list(lst[9])
    evidence = lst[5] if lst[5] != '.' else 'TBA'
    conn.execute('CREATE TABLE IF NOT EXISTS edge (target_id text, target_name text, tf_id text, tf_name text, strength text, date text, method text, evidence text)')
    conn.execute('INSERT INTO edge (target_id, target_name, tf_id, tf_name, strength, date, method, evidence) VALUES (?,?,?,?,?,?,?,?)', (target, target_name, tf, tf_name, strength, edge_date, method, evidence))
    conn.commit()


def compute_time_difference_in_days(t1, t2):
    ''' t1 and t2 has this format: yyyymmdd. '''
    if not t1.isnumeric() and length(t1) != 8:
        raise Exception('t1 format wrong in compute_time_difference_in_days.')
    if not t2.isnumeric() and length(t2) != 8:
        raise Exception('t2 format wrong in compute_time_difference_in_days.')
    
    t1 = datetime.date(int(t1[:4]), int(t1[4:6]), int(t1[6:]))
    t2 = datetime.date(int(t2[:4]), int(t2[4:6]), int(t2[6:]))
    return (t1 - t2).days


def make_new_edge(lst_tuple):
    lst = sorted(lst_tuple, reverse=True, key = lambda x: abs(float(x[2]))) # sort tuples according to absolute value of score
    best_edge = list( lst[0] )  # use the first tuple as a basis edge

    # see section 'Ranking edges using frecency' in the brain documentation 
    F = len(lst_tuple)

    RN_lst = []
    r_lst = []
    most_recent_edge_date = '00000000'
    method_or_tissue = []
    cids = ''
    for t in lst:
        r_lst.append( abs(float(t[2])) )
        rids = t[4]
        if t[5] > cids:
            cids = t[5]
        RN_lst.append( get_number_of_RNAseq_ids(rids) )
        date = t[7]
        if date > most_recent_edge_date:
            most_recent_edge_date = date
        method_or_tissue.append(t[9])
    S = 365 * 10
    curr_date = datetime.datetime.now().strftime('%Y%m%d')
    time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date)
    strength = sum(r_lst)/len(r_lst) * math.log(sum(RN_lst)/len(RN_lst)+1, 10) * math.log(F+1, 2) * math.exp(time_diff/S)
    best_edge[4] = '%d' % max(RN_lst)
    best_edge[5] = cids
    best_edge[7] = most_recent_edge_date
    best_edge[8] = '%.2f' % strength
    best_edge[9] = ','.join(sorted(list(set(method_or_tissue)))) # unique methods or tissues, in string format
    return best_edge


def get_unique_cids(lst):
    ''' Return a list of unique, sorted ChIP-seq IDs. '''
    cids = []
    for x in lst:
        sublst = x.split()
        cids.extend(sublst)
    result = sorted(list(set(cids)))
    if len(result) > 1 and result[0] == '.':
        result.pop(0)
    return ' '.join(result)


def make_new_edge2(d):
    best_edge = list(d['best_edge'])
    S = 365 * 10
    curr_date = datetime.datetime.now().strftime('%Y%m%d')
    most_recent_edge_date = d['stat']['most_recent_edge_date']
    time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date)
    F = d['stat']['F']
    strength = (d['stat']['sumr']/F) * math.log(d['stat']['sumRN']/F + 1, 10) * math.log(F+1, 2) * math.exp(time_diff/S)
    best_edge[4] = '%d' % d['stat']['maxRN']
    best_edge[5] = get_unique_cids(d['stat']['allCID'])
    best_edge[7] = most_recent_edge_date
    best_edge[8] = '%.2f' % strength
    method_or_tissue = d['stat']['method_or_tissue']
    best_edge[9] = ','.join(sorted(list(set(method_or_tissue))))
    return best_edge
    

##main

d = {} # d will contain all edges computed so far, where the key is TargetGeneID_TFGeneID, and the value is a list of tuples.  Each tuple is a historical edge.
file_count = 0
for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, 'edges*.*'))):
    file_count += 1
    print('[merge_edges.py]: including %s.' % (fname))
    #write_log_file('[merge_edges.py] including file %s.' % (fname) , UPDATE_NETWORK_LOG_FILE)
    f = open(fname)

    for line in f:
        line = line.strip()
        if len(line.split('\t')) == 10:  # a valid edge line has 10 fields, separated by a TAB
            lst = line.split('\t')
            target = lst[0]
            tf = lst[1]
            score = lst[2]
            type_of_score = lst[3]
            rids = lst[4]
            cids = lst[5]
            ll = lst[6]
            date = lst[7]
            strength = lst[8]
            method_or_tissue = lst[9]

            key = target.split()[0] + '_' + tf.split()[0] # target or tf has two fields, Gene ID and Gene Name, split()[0] means using Gene ID only.
            t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue)
	
            if not key in d:
                d[key] = {'best_edge':t, 'stat':{'sumr':abs(float(score)) , 'allCID':[cids], 'maxRN':get_number_of_RNAseq_ids(rids), 'sumRN':get_number_of_RNAseq_ids(rids), 'F':1, 'most_recent_edge_date':date, 'method_or_tissue':[method_or_tissue]}}
            else:
                # update best edge
                old_score = float(d[key]['best_edge'][2])
                new_score = float(score)
                if abs(new_score) > abs(old_score):
                    d[key]['best_edge']  = t

                # update stat information
                d[key]['stat']['sumr'] += abs(new_score)
                d[key]['stat']['sumRN'] += get_number_of_RNAseq_ids(rids)
                d[key]['stat']['F'] += 1

                if get_number_of_RNAseq_ids(rids) > d[key]['stat']['maxRN']:
                    d[key]['stat']['maxRN'] = get_number_of_RNAseq_ids(rids)
                if date > d[key]['stat']['most_recent_edge_date']:
                    d[key]['stat']['most_recent_edge_date'] = date
                if not cids in d[key]['stat']['allCID']:
                    d[key]['stat']['allCID'].append(cids)
                if not method_or_tissue in d[key]['stat']['method_or_tissue']:
                    d[key]['stat']['method_or_tissue'].append(method_or_tissue)

    f.close()

write_log_file('[merge_edges.py] BRAIN has collected edges from %d files.' % (file_count) , UPDATE_NETWORK_LOG_FILE)            

# make html pages
folder_path = '../Data/temp/html_edges'
if not os.path.isdir(folder_path):
    os.mkdir(folder_path)


write_log_file('[merge_edges.py]: Make text edge file %s ...' % (MERGED_EDGE_FILE), UPDATE_NETWORK_LOG_FILE)
fout = open(MERGED_EDGE_FILE, 'w')
for k in d:
    lst = make_new_edge2(d[k])
    fout.write('\t'.join(lst) + '\n')
fout.close()


db_fname = os.path.join(folder_path, 'edges.sqlite')
write_log_file('[merge_edges.py]: Make SQLite database file %s.  May take a while ...' % (db_fname), UPDATE_NETWORK_LOG_FILE)


if os.path.exists(db_fname):
    os.remove(db_fname)

conn = sqlite3.connect(db_fname)
for k in d:
    lst = make_new_edge2(d[k])
    # Make an html page for each edge (taking Big disk space).  This will take about 5GB disk space
    # for 1.3 million edges, not very disk space friendly.  So I use a database-driven dynamic method
    # to save space.
    # pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html
    # make_html_page(lst, folder_path + '/' + pagename)  
    # Write to a SQLite database file called edges.sqlite, which will be used for the Webapp.
    # edges.sqlite will be put under static/edges/ for querying.
    fill_database(lst, conn)

conn.close()