Code/merge_edges.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190

# Purpose: When edges.txt contains multiple lines representing the
#          same edge, choose only one edge.
#
# Usage: python merge_edges.py
#
#        This script is used to produce the edges.txt for the brain
#        web application.  It searches in EDGE_POOL_DIR for edge files
#        (with 10 columns) from many sources, most likely with
#        duplicated edges.  It removes duplication and computes
#        strength for each edge.
#
# Note: make sure fname is edges.txt Rationale: to save place, I am no
# longer going to use a full list of RNA-seq experiment IDs in the
# fifth column. Use a number instead.  This number is the length of
# RNA-seq experiment IDs.  If no IDs are available, this number is 1.
# However, I am still going to keep a full list of ChIP-seq experiment
# IDs (the sixth column).
#
# Created on 3 August 2019 by Hui Lan <lanhui@zjnu.edu.cn>
# Last modified on 5 August 2019 by Hui Lan <lanhui@zjnu.edu.cn>

import os, operator, sys, math, datetime, glob
from configure import EDGE_POOL_DIR, MERGED_EDGE_FILE

def get_number_of_RNAseq_ids(s):
    if s == '.':
        return 1
    if s.isdigit():
        return int(s)
    return len(s.split())


def add_dashes_to_date(s):
    return s[:4] + '-' + s[4:6] + '-' + s[6:]


def split_id_and_name(s):
    lst = s.split()
    result = lst[0]
    if lst[0] != lst[1]:
        result = lst[0] + ' (' + lst[1] + ')'
    return result


def make_html_list(s):
    if s.strip() == '':
        return 'None'
    result = '<ul>'
    for method in s.split(','):
        result += '<li>%s</li>' % (method)
    result += '</ul>'
    return result
        

def make_html_page(lst, fname):
    tf     = lst[1].split()[0] # ID only, no name
    target = lst[0].split()[0] 

    head = '<title>%s</title>\n' % ('Target is ' + lst[0] + ' and TF is ' + lst[1])
    head +=  '<link href="./c3.min.css" rel="stylesheet" />\n<script src="./d3.min.js"></script>\n<script src="./c3.min.js"></script>\n<script src="./scatterplot.js"></script>'
    s = '<html>'
    s += '<head>%s</head>\n' % (head)
    body =  '<p>TF is %s. </p>\n' % (split_id_and_name(lst[1]))
    body += '<p>Target is %s. </p>\n' % (split_id_and_name(lst[0]))
    body += '<p>Association strength: %s.</p>\n' % (lst[8])
    body += '<p>Edge made on %s. </p>\n' % (add_dashes_to_date(lst[7]))
    body += '<p>Methods: %s</p>\n' % (make_html_list(lst[9]))
    body += '<p>Evidence of binding: %s.</p>\n' % (lst[5] if lst[5] != '.' else 'TBA')
    body += '<a id="myLink" href="javascript:void(0);" onclick="drawScatterPlot(\'json/%s.json\', \'json/%s.json\', \'rnaseq_info_database.json\', [\'.\']);">Click for gene expression scatter plot</a>\n' % (tf, target)
    body += '<p>For more detailed analysis, <a href="gene-expression-level-scatterplot-by-XuMengqi.zip">download</a> our gene expression scatter plotting tool.  No installation is required.  Input data: <a href="json/%s.json">TF gene expression</a>  <a href="json/%s.json">Target gene expression</a>  <a href="rnaseq_info_database.json">RNA-seq annotation</a></p>\n' % (tf, target)    
    body += '<p id="chart"></p>\n'
##    if 'AT2G44304' in lst[0] and 'AT2G24700' in lst[1]:
##        print(lst)
##        sys.exit()
        
    s += '<body>%s</body>\n' % (body)
    s += '</html>'
    f = open(fname, 'w')
    f.write(s)
    f.close()


def compute_time_difference_in_days(t1, t2):
    ''' t1 and t2 has this format: yyyymmdd. '''
    if not t1.isnumeric() and length(t1) != 8:
        raise Exception('t1 format wrong in compute_time_difference_in_days.')
    if not t2.isnumeric() and length(t2) != 8:
        raise Exception('t2 format wrong in compute_time_difference_in_days.')
    
    t1 = datetime.date(int(t1[:4]), int(t1[4:6]), int(t1[6:]))
    t2 = datetime.date(int(t2[:4]), int(t2[4:6]), int(t2[6:]))
    return (t1 - t2).days


def make_new_edge(lst_tuple):
    lst = sorted(lst_tuple, reverse=True, key = lambda x: abs(float(x[2]))) # sort according to absolute value of score
    best_edge = list( lst[0] )

    # see section 'Ranking edges using frecency' in the brain documentation
    F = len(lst_tuple)

    RN_lst = []
    r_lst = []
    most_recent_edge_date = '00000000'
    method_or_tissue = []
    cids = ''
    for t in lst:
        r_lst.append( abs(float(t[2])) )
        rids = t[4]
        if t[5] > cids:
            cids = t[5]
        RN_lst.append( get_number_of_RNAseq_ids(rids) )
        date = t[7]
        if date > most_recent_edge_date:
            most_recent_edge_date = date
        method_or_tissue.append(t[9])
    S = 365 * 10
    curr_date = datetime.datetime.now().strftime('%Y%m%d')
    #time_diff = int(most_recent_edge_date) - int(curr_date)
    time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date)
    strength = sum(r_lst)/len(r_lst) * math.log(sum(RN_lst)/len(RN_lst)+1, 10) * math.log(F+1, 2) * math.exp(time_diff/S)
    best_edge[4] = '%d' % max(RN_lst)
    best_edge[5] = cids
    best_edge[8] = '%.2f' % strength
    best_edge[9] = ','.join(sorted(list(set(method_or_tissue)))) # unique methods or tissues, in string format

##    if 'AT2G44304' in best_edge[0] and 'AT2G24700' in best_edge[1]:
##        print(strength)
##        print(best_edge)
##        sys.exit()
        
    return best_edge

    
##main

d = {}
duniq = {}
for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, '*.*'))):
    print('[merge_edges.py]: including %s.' % (fname))
    f = open(fname)

    for line in f:
        line = line.strip()
        if len(line.split('\t')) == 10 and not line in duniq:
            duniq[line] = 1
            lst = line.split('\t')
            target = lst[0]
            tf = lst[1]
            score = lst[2]
            type_of_score = lst[3]
            rids = lst[4]
            cids = lst[5]
            ll = lst[6]
            date = lst[7]
            strength = lst[8]
            method_or_tissue = lst[9]

            key = target + tf
            t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue)
	
            if not key in d:
                d[key] = [t]
            else:
                d[key].append(t)

    f.close()

    
# make html pages
folder_path = '../Data/temp/html_edges'
if not os.path.isdir(folder_path):
    os.mkdir(folder_path)


print('[merge_edges.py]: Make text edge file...')
fout = open(MERGED_EDGE_FILE, 'w')
for k in d:
    lst = make_new_edge(d[k])
    fout.write('\t'.join(lst) + '\n')
fout.close()


print('[merge_edges.py]: Make html edge files.  May take a while...')
for k in d:
    lst = make_new_edge(d[k])
    pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html
    make_html_page(lst, folder_path + '/' + pagename)