1 files changed, 180 insertions, 180 deletions
diff --git a/Code/merge_edges.py b/Code/merge_edges.py
index e0b1c61..936faf5 100644
--- a/Code/merge_edges.py
+++ b/Code/merge_edges.py
@@ -1,180 +1,180 @@
-# Purpose: When edges.txt.* contains multiple lines representing the
-#          same edge, merge them and keep only one edge.
-#
-# Usage: python merge_edges.py
-#
-#        This script is used to produce a single file edges.txt for
-#        the brain web application.  It searches in EDGE_POOL_DIR for
-#        edge files (with 10 columns) from many sources, most likely
-#        having duplicated edges.  It removes duplication and computes
-#        strength for each edge.
-#
-# Note: make sure fname is edges.txt.
-#
-# Rationale: to save place, I am no longer going to use a full list of
-# RNA-seq experiment IDs in the fifth column.  Use a number, i.e., the
-# number of RNA-seq IDs, instead.  If no IDs are available, this
-# number is 1 (very conservative).  However, I am still going to keep
-# a full list of ChIP-seq experiment IDs (the sixth column).
-#
-# Created on 3 August 2019 by Hui Lan <lanhui@zjnu.edu.cn>
-
-
-import os, operator, sys, math, datetime, glob
-from configure import EDGE_POOL_DIR, MERGED_EDGE_FILE
-
-def get_number_of_RNAseq_ids(s):
-    if s == '.':
-        return 1
-    if s.isdigit():
-        return int(s)
-    return len(s.split())
-
-
-def add_dashes_to_date(s):
-    return s[:4] + '-' + s[4:6] + '-' + s[6:]
-
-
-def split_id_and_name(s):
-    lst = s.split()
-    result = lst[0]
-    if lst[0] != lst[1]:
-        result = lst[0] + ' (' + lst[1] + ')'
-    return result
-
-
-def make_html_list(s):
-    if s.strip() == '':
-        return 'None'
-    result = '<ul>'
-    for method in s.split(','):
-        result += '<li>%s</li>' % (method)
-    result += '</ul>'
-    return result
-        
-
-def make_html_page(lst, fname):
-    tf     = lst[1].split()[0] # ID only, no name
-    target = lst[0].split()[0] 
-
-    head = '<title>%s</title>\n' % ('Target is ' + lst[0] + ' and TF is ' + lst[1])
-    head +=  '<link href="./c3.min.css" rel="stylesheet" />\n<script src="./d3.min.js"></script>\n<script src="./c3.min.js"></script>\n<script src="./scatterplot.js"></script>'
-    s = '<html>'
-    s += '<head>%s</head>\n' % (head)
-    body =  '<p>TF is %s. </p>\n' % (split_id_and_name(lst[1]))
-    body += '<p>Target is %s. </p>\n' % (split_id_and_name(lst[0]))
-    body += '<p>Association strength: %s.</p>\n' % (lst[8])
-    body += '<p>Edge made on %s. </p>\n' % (add_dashes_to_date(lst[7]))
-    body += '<p>Methods: %s</p>\n' % (make_html_list(lst[9]))
-    body += '<p>Evidence of binding: %s.</p>\n' % (lst[5] if lst[5] != '.' else 'TBA')
-    body += '<a id="myLink" href="javascript:void(0);" onclick="drawScatterPlot(\'json/%s.json\', \'json/%s.json\', \'rnaseq_info_database.json\', [\'.\']);">Click for gene expression scatter plot</a>\n' % (tf, target)
-    body += '<p>For more detailed analysis, <a href="gene-expression-level-scatterplot-by-XuMengqi.zip">download</a> our gene expression scatter plotting tool.  No installation is required.  Input data: <a href="json/%s.json">TF gene expression</a>  <a href="json/%s.json">Target gene expression</a>  <a href="rnaseq_info_database.json">RNA-seq annotation</a></p>\n' % (tf, target)    
-    body += '<p id="chart"></p>\n'
-    s += '<body>%s</body>\n' % (body)
-    s += '</html>'
-    f = open(fname, 'w')
-    f.write(s)
-    f.close()
-
-
-def compute_time_difference_in_days(t1, t2):
-    ''' t1 and t2 has this format: yyyymmdd. '''
-    if not t1.isnumeric() and length(t1) != 8:
-        raise Exception('t1 format wrong in compute_time_difference_in_days.')
-    if not t2.isnumeric() and length(t2) != 8:
-        raise Exception('t2 format wrong in compute_time_difference_in_days.')
-    
-    t1 = datetime.date(int(t1[:4]), int(t1[4:6]), int(t1[6:]))
-    t2 = datetime.date(int(t2[:4]), int(t2[4:6]), int(t2[6:]))
-    return (t1 - t2).days
-
-
-def make_new_edge(lst_tuple):
-    lst = sorted(lst_tuple, reverse=True, key = lambda x: abs(float(x[2]))) # sort according to absolute value of score
-    best_edge = list( lst[0] )
-
-    # see section 'Ranking edges using frecency' in the brain documentation
-    F = len(lst_tuple)
-
-    RN_lst = []
-    r_lst = []
-    most_recent_edge_date = '00000000'
-    method_or_tissue = []
-    cids = ''
-    for t in lst:
-        r_lst.append( abs(float(t[2])) )
-        rids = t[4]
-        if t[5] > cids:
-            cids = t[5]
-        RN_lst.append( get_number_of_RNAseq_ids(rids) )
-        date = t[7]
-        if date > most_recent_edge_date:
-            most_recent_edge_date = date
-        method_or_tissue.append(t[9])
-    S = 365 * 10
-    curr_date = datetime.datetime.now().strftime('%Y%m%d')
-    time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date)
-    strength = sum(r_lst)/len(r_lst) * math.log(sum(RN_lst)/len(RN_lst)+1, 10) * math.log(F+1, 2) * math.exp(time_diff/S)
-    best_edge[4] = '%d' % max(RN_lst)
-    best_edge[5] = cids
-    best_edge[8] = '%.2f' % strength
-    best_edge[9] = ','.join(sorted(list(set(method_or_tissue)))) # unique methods or tissues, in string format
-    return best_edge
-
-    
-
-##main
-
-d = {}
-duniq = {}
-for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, '*.*'))):
-    print('[merge_edges.py]: including %s.' % (fname))
-    f = open(fname)
-
-    for line in f:
-        line = line.strip()
-        if len(line.split('\t')) == 10 and not line in duniq:
-            duniq[line] = 1
-            lst = line.split('\t')
-            target = lst[0]
-            tf = lst[1]
-            score = lst[2]
-            type_of_score = lst[3]
-            rids = lst[4]
-            cids = lst[5]
-            ll = lst[6]
-            date = lst[7]
-            strength = lst[8]
-            method_or_tissue = lst[9]
-
-            key = target + tf
-            t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue)
-	
-            if not key in d:
-                d[key] = [t]
-            else:
-                d[key].append(t)
-
-    f.close()
-
-    
-
-# make html pages
-folder_path = '../Data/temp/html_edges'
-if not os.path.isdir(folder_path):
-    os.mkdir(folder_path)
-
-
-print('[merge_edges.py]: Make text edge file...')
-fout = open(MERGED_EDGE_FILE, 'w')
-for k in d:
-    lst = make_new_edge(d[k])
-    fout.write('\t'.join(lst) + '\n')
-fout.close()
-
-
-print('[merge_edges.py]: Make html edge files.  May take a while...')
-for k in d:
-    lst = make_new_edge(d[k])
-    pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html
-    make_html_page(lst, folder_path + '/' + pagename)
+# Purpose: When edges.txt.* contains multiple lines representing the
+#          same edge, merge them and keep only one edge.
+#
+# Usage: python merge_edges.py
+#
+#        This script is used to produce a single file edges.txt for
+#        the brain web application.  It searches in EDGE_POOL_DIR for
+#        edge files (with 10 columns) from many sources, most likely
+#        having duplicated edges.  It removes duplication and computes
+#        strength for each edge.
+#
+# Note: make sure fname is edges.txt.
+#
+# Rationale: to save place, I am no longer going to use a full list of
+# RNA-seq experiment IDs in the fifth column.  Use a number, i.e., the
+# number of RNA-seq IDs, instead.  If no IDs are available, this
+# number is 1 (very conservative).  However, I am still going to keep
+# a full list of ChIP-seq experiment IDs (the sixth column).
+#
+# Created on 3 August 2019 by Hui Lan <lanhui@zjnu.edu.cn>
+
+
+import os, operator, sys, math, datetime, glob
+from configure import EDGE_POOL_DIR, MERGED_EDGE_FILE
+
+def get_number_of_RNAseq_ids(s):
+    if s == '.':
+        return 1
+    if s.isdigit():
+        return int(s)
+    return len(s.split())
+
+
+def add_dashes_to_date(s):
+    return s[:4] + '-' + s[4:6] + '-' + s[6:]
+
+
+def split_id_and_name(s):
+    lst = s.split()
+    result = lst[0]
+    if lst[0] != lst[1]:
+        result = lst[0] + ' (' + lst[1] + ')'
+    return result
+
+
+def make_html_list(s):
+    if s.strip() == '':
+        return 'None'
+    result = '<ul>'
+    for method in s.split(','):
+        result += '<li>%s</li>' % (method)
+    result += '</ul>'
+    return result
+        
+
+def make_html_page(lst, fname):
+    tf     = lst[1].split()[0] # ID only, no name
+    target = lst[0].split()[0] 
+
+    head = '<title>%s</title>\n' % ('Target is ' + lst[0] + ' and TF is ' + lst[1])
+    head +=  '<link href="./c3.min.css" rel="stylesheet" />\n<script src="./d3.min.js"></script>\n<script src="./c3.min.js"></script>\n<script src="./scatterplot.js"></script>'
+    s = '<html>'
+    s += '<head>%s</head>\n' % (head)
+    body =  '<p>TF is %s. </p>\n' % (split_id_and_name(lst[1]))
+    body += '<p>Target is %s. </p>\n' % (split_id_and_name(lst[0]))
+    body += '<p>Association strength: %s.</p>\n' % (lst[8])
+    body += '<p>Edge made on %s. </p>\n' % (add_dashes_to_date(lst[7]))
+    body += '<p>Methods: %s</p>\n' % (make_html_list(lst[9]))
+    body += '<p>Evidence of binding: %s.</p>\n' % (lst[5] if lst[5] != '.' else 'TBA')
+    body += '<a id="myLink" href="javascript:void(0);" onclick="drawScatterPlot(\'json/%s.json\', \'json/%s.json\', \'rnaseq_info_database.json\', [\'.\']);">Click for gene expression scatter plot</a>\n' % (tf, target)
+    body += '<p>For more detailed analysis, <a href="gene-expression-level-scatterplot-by-XuMengqi.zip">download</a> our gene expression scatter plotting tool.  No installation is required.  Input data: <a href="json/%s.json">TF gene expression</a>  <a href="json/%s.json">Target gene expression</a>  <a href="rnaseq_info_database.json">RNA-seq annotation</a></p>\n' % (tf, target)    
+    body += '<p id="chart"></p>\n'
+    s += '<body>%s</body>\n' % (body)
+    s += '</html>'
+    f = open(fname, 'w')
+    f.write(s)
+    f.close()
+
+
+def compute_time_difference_in_days(t1, t2):
+    ''' t1 and t2 has this format: yyyymmdd. '''
+    if not t1.isnumeric() and length(t1) != 8:
+        raise Exception('t1 format wrong in compute_time_difference_in_days.')
+    if not t2.isnumeric() and length(t2) != 8:
+        raise Exception('t2 format wrong in compute_time_difference_in_days.')
+    
+    t1 = datetime.date(int(t1[:4]), int(t1[4:6]), int(t1[6:]))
+    t2 = datetime.date(int(t2[:4]), int(t2[4:6]), int(t2[6:]))
+    return (t1 - t2).days
+
+
+def make_new_edge(lst_tuple):
+    lst = sorted(lst_tuple, reverse=True, key = lambda x: abs(float(x[2]))) # sort according to absolute value of score
+    best_edge = list( lst[0] )
+
+    # see section 'Ranking edges using frecency' in the brain documentation
+    F = len(lst_tuple)
+
+    RN_lst = []
+    r_lst = []
+    most_recent_edge_date = '00000000'
+    method_or_tissue = []
+    cids = ''
+    for t in lst:
+        r_lst.append( abs(float(t[2])) )
+        rids = t[4]
+        if t[5] > cids:
+            cids = t[5]
+        RN_lst.append( get_number_of_RNAseq_ids(rids) )
+        date = t[7]
+        if date > most_recent_edge_date:
+            most_recent_edge_date = date
+        method_or_tissue.append(t[9])
+    S = 365 * 10
+    curr_date = datetime.datetime.now().strftime('%Y%m%d')
+    time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date)
+    strength = sum(r_lst)/len(r_lst) * math.log(sum(RN_lst)/len(RN_lst)+1, 10) * math.log(F+1, 2) * math.exp(time_diff/S)
+    best_edge[4] = '%d' % max(RN_lst)
+    best_edge[5] = cids
+    best_edge[8] = '%.2f' % strength
+    best_edge[9] = ','.join(sorted(list(set(method_or_tissue)))) # unique methods or tissues, in string format
+    return best_edge
+
+    
+
+##main
+
+d = {}
+duniq = {}
+for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, '*.*'))):
+    print('[merge_edges.py]: including %s.' % (fname))
+    f = open(fname)
+
+    for line in f:
+        line = line.strip()
+        if len(line.split('\t')) == 10 and not line in duniq:
+            duniq[line] = 1
+            lst = line.split('\t')
+            target = lst[0]
+            tf = lst[1]
+            score = lst[2]
+            type_of_score = lst[3]
+            rids = lst[4]
+            cids = lst[5]
+            ll = lst[6]
+            date = lst[7]
+            strength = lst[8]
+            method_or_tissue = lst[9]
+
+            key = target + tf
+            t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue)
+	
+            if not key in d:
+                d[key] = [t]
+            else:
+                d[key].append(t)
+
+    f.close()
+
+    
+
+# make html pages
+folder_path = '../Data/temp/html_edges'
+if not os.path.isdir(folder_path):
+    os.mkdir(folder_path)
+
+
+print('[merge_edges.py]: Make text edge file...')
+fout = open(MERGED_EDGE_FILE, 'w')
+for k in d:
+    lst = make_new_edge(d[k])
+    fout.write('\t'.join(lst) + '\n')
+fout.close()
+
+
+print('[merge_edges.py]: Make html edge files.  May take a while...')
+for k in d:
+    lst = make_new_edge(d[k])
+    pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html
+    make_html_page(lst, folder_path + '/' + pagename)