summaryrefslogtreecommitdiff
path: root/Code/merge_edges.py
diff options
context:
space:
mode:
Diffstat (limited to 'Code/merge_edges.py')
-rw-r--r--Code/merge_edges.py190
1 files changed, 190 insertions, 0 deletions
diff --git a/Code/merge_edges.py b/Code/merge_edges.py
new file mode 100644
index 0000000..ef870fb
--- /dev/null
+++ b/Code/merge_edges.py
@@ -0,0 +1,190 @@
+# Purpose: When edges.txt contains multiple lines representing the
+# same edge, choose only one edge.
+#
+# Usage: python merge_edges.py
+#
+# This script is used to produce the edges.txt for the brain
+# web application. It searches in EDGE_POOL_DIR for edge files
+# (with 10 columns) from many sources, most likely with
+# duplicated edges. It removes duplication and computes
+# strength for each edge.
+#
+# Note: make sure fname is edges.txt Rationale: to save place, I am no
+# longer going to use a full list of RNA-seq experiment IDs in the
+# fifth column. Use a number instead. This number is the length of
+# RNA-seq experiment IDs. If no IDs are available, this number is 1.
+# However, I am still going to keep a full list of ChIP-seq experiment
+# IDs (the sixth column).
+#
+# Created on 3 August 2019 by Hui Lan <lanhui@zjnu.edu.cn>
+# Last modified on 5 August 2019 by Hui Lan <lanhui@zjnu.edu.cn>
+
+import os, operator, sys, math, datetime, glob
+from configure import EDGE_POOL_DIR, MERGED_EDGE_FILE
+
+def get_number_of_RNAseq_ids(s):
+ if s == '.':
+ return 1
+ if s.isdigit():
+ return int(s)
+ return len(s.split())
+
+
+def add_dashes_to_date(s):
+ return s[:4] + '-' + s[4:6] + '-' + s[6:]
+
+
+def split_id_and_name(s):
+ lst = s.split()
+ result = lst[0]
+ if lst[0] != lst[1]:
+ result = lst[0] + ' (' + lst[1] + ')'
+ return result
+
+
+def make_html_list(s):
+ if s.strip() == '':
+ return 'None'
+ result = '<ul>'
+ for method in s.split(','):
+ result += '<li>%s</li>' % (method)
+ result += '</ul>'
+ return result
+
+
+def make_html_page(lst, fname):
+ tf = lst[1].split()[0] # ID only, no name
+ target = lst[0].split()[0]
+
+ head = '<title>%s</title>\n' % ('Target is ' + lst[0] + ' and TF is ' + lst[1])
+ head += '<link href="./c3.min.css" rel="stylesheet" />\n<script src="./d3.min.js"></script>\n<script src="./c3.min.js"></script>\n<script src="./scatterplot.js"></script>'
+ s = '<html>'
+ s += '<head>%s</head>\n' % (head)
+ body = '<p>TF is %s. </p>\n' % (split_id_and_name(lst[1]))
+ body += '<p>Target is %s. </p>\n' % (split_id_and_name(lst[0]))
+ body += '<p>Association strength: %s.</p>\n' % (lst[8])
+ body += '<p>Edge made on %s. </p>\n' % (add_dashes_to_date(lst[7]))
+ body += '<p>Methods: %s</p>\n' % (make_html_list(lst[9]))
+ body += '<p>Evidence of binding: %s.</p>\n' % (lst[5] if lst[5] != '.' else 'TBA')
+ body += '<a id="myLink" href="javascript:void(0);" onclick="drawScatterPlot(\'json/%s.json\', \'json/%s.json\', \'rnaseq_info_database.json\', [\'.\']);">Click for gene expression scatter plot</a>\n' % (tf, target)
+ body += '<p>For more detailed analysis, <a href="gene-expression-level-scatterplot-by-XuMengqi.zip">download</a> our gene expression scatter plotting tool. No installation is required. Input data: <a href="json/%s.json">TF gene expression</a> <a href="json/%s.json">Target gene expression</a> <a href="rnaseq_info_database.json">RNA-seq annotation</a></p>\n' % (tf, target)
+ body += '<p id="chart"></p>\n'
+## if 'AT2G44304' in lst[0] and 'AT2G24700' in lst[1]:
+## print(lst)
+## sys.exit()
+
+ s += '<body>%s</body>\n' % (body)
+ s += '</html>'
+ f = open(fname, 'w')
+ f.write(s)
+ f.close()
+
+
+def compute_time_difference_in_days(t1, t2):
+ ''' t1 and t2 has this format: yyyymmdd. '''
+ if not t1.isnumeric() and length(t1) != 8:
+ raise Exception('t1 format wrong in compute_time_difference_in_days.')
+ if not t2.isnumeric() and length(t2) != 8:
+ raise Exception('t2 format wrong in compute_time_difference_in_days.')
+
+ t1 = datetime.date(int(t1[:4]), int(t1[4:6]), int(t1[6:]))
+ t2 = datetime.date(int(t2[:4]), int(t2[4:6]), int(t2[6:]))
+ return (t1 - t2).days
+
+
+def make_new_edge(lst_tuple):
+ lst = sorted(lst_tuple, reverse=True, key = lambda x: abs(float(x[2]))) # sort according to absolute value of score
+ best_edge = list( lst[0] )
+
+ # see section 'Ranking edges using frecency' in the brain documentation
+ F = len(lst_tuple)
+
+ RN_lst = []
+ r_lst = []
+ most_recent_edge_date = '00000000'
+ method_or_tissue = []
+ cids = ''
+ for t in lst:
+ r_lst.append( abs(float(t[2])) )
+ rids = t[4]
+ if t[5] > cids:
+ cids = t[5]
+ RN_lst.append( get_number_of_RNAseq_ids(rids) )
+ date = t[7]
+ if date > most_recent_edge_date:
+ most_recent_edge_date = date
+ method_or_tissue.append(t[9])
+ S = 365 * 10
+ curr_date = datetime.datetime.now().strftime('%Y%m%d')
+ #time_diff = int(most_recent_edge_date) - int(curr_date)
+ time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date)
+ strength = sum(r_lst)/len(r_lst) * math.log(sum(RN_lst)/len(RN_lst)+1, 10) * math.log(F+1, 2) * math.exp(time_diff/S)
+ best_edge[4] = '%d' % max(RN_lst)
+ best_edge[5] = cids
+ best_edge[8] = '%.2f' % strength
+ best_edge[9] = ','.join(sorted(list(set(method_or_tissue)))) # unique methods or tissues, in string format
+
+## if 'AT2G44304' in best_edge[0] and 'AT2G24700' in best_edge[1]:
+## print(strength)
+## print(best_edge)
+## sys.exit()
+
+ return best_edge
+
+
+
+##main
+
+d = {}
+duniq = {}
+for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, '*.*'))):
+ print('[merge_edges.py]: including %s.' % (fname))
+ f = open(fname)
+
+ for line in f:
+ line = line.strip()
+ if len(line.split('\t')) == 10 and not line in duniq:
+ duniq[line] = 1
+ lst = line.split('\t')
+ target = lst[0]
+ tf = lst[1]
+ score = lst[2]
+ type_of_score = lst[3]
+ rids = lst[4]
+ cids = lst[5]
+ ll = lst[6]
+ date = lst[7]
+ strength = lst[8]
+ method_or_tissue = lst[9]
+
+ key = target + tf
+ t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue)
+
+ if not key in d:
+ d[key] = [t]
+ else:
+ d[key].append(t)
+
+ f.close()
+
+
+
+# make html pages
+folder_path = '../Data/temp/html_edges'
+if not os.path.isdir(folder_path):
+ os.mkdir(folder_path)
+
+
+print('[merge_edges.py]: Make text edge file...')
+fout = open(MERGED_EDGE_FILE, 'w')
+for k in d:
+ lst = make_new_edge(d[k])
+ fout.write('\t'.join(lst) + '\n')
+fout.close()
+
+
+print('[merge_edges.py]: Make html edge files. May take a while...')
+for k in d:
+ lst = make_new_edge(d[k])
+ pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html
+ make_html_page(lst, folder_path + '/' + pagename)