From dc8de851f113162b56e6dbf423d775aeac4d9d89 Mon Sep 17 00:00:00 2001 From: Hui Lan Date: Wed, 4 Dec 2019 19:23:24 +0800 Subject: merge_edges.py: run dos2unix on merge_edges.py to remove ^M characters. --- Code/merge_edges.py | 360 ++++++++++++++++++++++++++-------------------------- 1 file changed, 180 insertions(+), 180 deletions(-) diff --git a/Code/merge_edges.py b/Code/merge_edges.py index e0b1c61..936faf5 100644 --- a/Code/merge_edges.py +++ b/Code/merge_edges.py @@ -1,180 +1,180 @@ -# Purpose: When edges.txt.* contains multiple lines representing the -# same edge, merge them and keep only one edge. -# -# Usage: python merge_edges.py -# -# This script is used to produce a single file edges.txt for -# the brain web application. It searches in EDGE_POOL_DIR for -# edge files (with 10 columns) from many sources, most likely -# having duplicated edges. It removes duplication and computes -# strength for each edge. -# -# Note: make sure fname is edges.txt. -# -# Rationale: to save place, I am no longer going to use a full list of -# RNA-seq experiment IDs in the fifth column. Use a number, i.e., the -# number of RNA-seq IDs, instead. If no IDs are available, this -# number is 1 (very conservative). However, I am still going to keep -# a full list of ChIP-seq experiment IDs (the sixth column). -# -# Created on 3 August 2019 by Hui Lan - - -import os, operator, sys, math, datetime, glob -from configure import EDGE_POOL_DIR, MERGED_EDGE_FILE - -def get_number_of_RNAseq_ids(s): - if s == '.': - return 1 - if s.isdigit(): - return int(s) - return len(s.split()) - - -def add_dashes_to_date(s): - return s[:4] + '-' + s[4:6] + '-' + s[6:] - - -def split_id_and_name(s): - lst = s.split() - result = lst[0] - if lst[0] != lst[1]: - result = lst[0] + ' (' + lst[1] + ')' - return result - - -def make_html_list(s): - if s.strip() == '': - return 'None' - result = '' - return result - - -def make_html_page(lst, fname): - tf = lst[1].split()[0] # ID only, no name - target = lst[0].split()[0] - - head = '%s\n' % ('Target is ' + lst[0] + ' and TF is ' + lst[1]) - head += '\n\n\n' - s = '' - s += '%s\n' % (head) - body = '

TF is %s.

\n' % (split_id_and_name(lst[1])) - body += '

Target is %s.

\n' % (split_id_and_name(lst[0])) - body += '

Association strength: %s.

\n' % (lst[8]) - body += '

Edge made on %s.

\n' % (add_dashes_to_date(lst[7])) - body += '

Methods: %s

\n' % (make_html_list(lst[9])) - body += '

Evidence of binding: %s.

\n' % (lst[5] if lst[5] != '.' else 'TBA') - body += 'Click for gene expression scatter plot\n' % (tf, target) - body += '

For more detailed analysis, download our gene expression scatter plotting tool. No installation is required. Input data: TF gene expression Target gene expression RNA-seq annotation

\n' % (tf, target) - body += '

\n' - s += '%s\n' % (body) - s += '' - f = open(fname, 'w') - f.write(s) - f.close() - - -def compute_time_difference_in_days(t1, t2): - ''' t1 and t2 has this format: yyyymmdd. ''' - if not t1.isnumeric() and length(t1) != 8: - raise Exception('t1 format wrong in compute_time_difference_in_days.') - if not t2.isnumeric() and length(t2) != 8: - raise Exception('t2 format wrong in compute_time_difference_in_days.') - - t1 = datetime.date(int(t1[:4]), int(t1[4:6]), int(t1[6:])) - t2 = datetime.date(int(t2[:4]), int(t2[4:6]), int(t2[6:])) - return (t1 - t2).days - - -def make_new_edge(lst_tuple): - lst = sorted(lst_tuple, reverse=True, key = lambda x: abs(float(x[2]))) # sort according to absolute value of score - best_edge = list( lst[0] ) - - # see section 'Ranking edges using frecency' in the brain documentation - F = len(lst_tuple) - - RN_lst = [] - r_lst = [] - most_recent_edge_date = '00000000' - method_or_tissue = [] - cids = '' - for t in lst: - r_lst.append( abs(float(t[2])) ) - rids = t[4] - if t[5] > cids: - cids = t[5] - RN_lst.append( get_number_of_RNAseq_ids(rids) ) - date = t[7] - if date > most_recent_edge_date: - most_recent_edge_date = date - method_or_tissue.append(t[9]) - S = 365 * 10 - curr_date = datetime.datetime.now().strftime('%Y%m%d') - time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date) - strength = sum(r_lst)/len(r_lst) * math.log(sum(RN_lst)/len(RN_lst)+1, 10) * math.log(F+1, 2) * math.exp(time_diff/S) - best_edge[4] = '%d' % max(RN_lst) - best_edge[5] = cids - best_edge[8] = '%.2f' % strength - best_edge[9] = ','.join(sorted(list(set(method_or_tissue)))) # unique methods or tissues, in string format - return best_edge - - - -##main - -d = {} -duniq = {} -for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, '*.*'))): - print('[merge_edges.py]: including %s.' % (fname)) - f = open(fname) - - for line in f: - line = line.strip() - if len(line.split('\t')) == 10 and not line in duniq: - duniq[line] = 1 - lst = line.split('\t') - target = lst[0] - tf = lst[1] - score = lst[2] - type_of_score = lst[3] - rids = lst[4] - cids = lst[5] - ll = lst[6] - date = lst[7] - strength = lst[8] - method_or_tissue = lst[9] - - key = target + tf - t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue) - - if not key in d: - d[key] = [t] - else: - d[key].append(t) - - f.close() - - - -# make html pages -folder_path = '../Data/temp/html_edges' -if not os.path.isdir(folder_path): - os.mkdir(folder_path) - - -print('[merge_edges.py]: Make text edge file...') -fout = open(MERGED_EDGE_FILE, 'w') -for k in d: - lst = make_new_edge(d[k]) - fout.write('\t'.join(lst) + '\n') -fout.close() - - -print('[merge_edges.py]: Make html edge files. May take a while...') -for k in d: - lst = make_new_edge(d[k]) - pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html - make_html_page(lst, folder_path + '/' + pagename) +# Purpose: When edges.txt.* contains multiple lines representing the +# same edge, merge them and keep only one edge. +# +# Usage: python merge_edges.py +# +# This script is used to produce a single file edges.txt for +# the brain web application. It searches in EDGE_POOL_DIR for +# edge files (with 10 columns) from many sources, most likely +# having duplicated edges. It removes duplication and computes +# strength for each edge. +# +# Note: make sure fname is edges.txt. +# +# Rationale: to save place, I am no longer going to use a full list of +# RNA-seq experiment IDs in the fifth column. Use a number, i.e., the +# number of RNA-seq IDs, instead. If no IDs are available, this +# number is 1 (very conservative). However, I am still going to keep +# a full list of ChIP-seq experiment IDs (the sixth column). +# +# Created on 3 August 2019 by Hui Lan + + +import os, operator, sys, math, datetime, glob +from configure import EDGE_POOL_DIR, MERGED_EDGE_FILE + +def get_number_of_RNAseq_ids(s): + if s == '.': + return 1 + if s.isdigit(): + return int(s) + return len(s.split()) + + +def add_dashes_to_date(s): + return s[:4] + '-' + s[4:6] + '-' + s[6:] + + +def split_id_and_name(s): + lst = s.split() + result = lst[0] + if lst[0] != lst[1]: + result = lst[0] + ' (' + lst[1] + ')' + return result + + +def make_html_list(s): + if s.strip() == '': + return 'None' + result = '
    ' + for method in s.split(','): + result += '
  • %s
  • ' % (method) + result += '
' + return result + + +def make_html_page(lst, fname): + tf = lst[1].split()[0] # ID only, no name + target = lst[0].split()[0] + + head = '%s\n' % ('Target is ' + lst[0] + ' and TF is ' + lst[1]) + head += '\n\n\n' + s = '' + s += '%s\n' % (head) + body = '

TF is %s.

\n' % (split_id_and_name(lst[1])) + body += '

Target is %s.

\n' % (split_id_and_name(lst[0])) + body += '

Association strength: %s.

\n' % (lst[8]) + body += '

Edge made on %s.

\n' % (add_dashes_to_date(lst[7])) + body += '

Methods: %s

\n' % (make_html_list(lst[9])) + body += '

Evidence of binding: %s.

\n' % (lst[5] if lst[5] != '.' else 'TBA') + body += 'Click for gene expression scatter plot\n' % (tf, target) + body += '

For more detailed analysis, download our gene expression scatter plotting tool. No installation is required. Input data: TF gene expression Target gene expression RNA-seq annotation

\n' % (tf, target) + body += '

\n' + s += '%s\n' % (body) + s += '' + f = open(fname, 'w') + f.write(s) + f.close() + + +def compute_time_difference_in_days(t1, t2): + ''' t1 and t2 has this format: yyyymmdd. ''' + if not t1.isnumeric() and length(t1) != 8: + raise Exception('t1 format wrong in compute_time_difference_in_days.') + if not t2.isnumeric() and length(t2) != 8: + raise Exception('t2 format wrong in compute_time_difference_in_days.') + + t1 = datetime.date(int(t1[:4]), int(t1[4:6]), int(t1[6:])) + t2 = datetime.date(int(t2[:4]), int(t2[4:6]), int(t2[6:])) + return (t1 - t2).days + + +def make_new_edge(lst_tuple): + lst = sorted(lst_tuple, reverse=True, key = lambda x: abs(float(x[2]))) # sort according to absolute value of score + best_edge = list( lst[0] ) + + # see section 'Ranking edges using frecency' in the brain documentation + F = len(lst_tuple) + + RN_lst = [] + r_lst = [] + most_recent_edge_date = '00000000' + method_or_tissue = [] + cids = '' + for t in lst: + r_lst.append( abs(float(t[2])) ) + rids = t[4] + if t[5] > cids: + cids = t[5] + RN_lst.append( get_number_of_RNAseq_ids(rids) ) + date = t[7] + if date > most_recent_edge_date: + most_recent_edge_date = date + method_or_tissue.append(t[9]) + S = 365 * 10 + curr_date = datetime.datetime.now().strftime('%Y%m%d') + time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date) + strength = sum(r_lst)/len(r_lst) * math.log(sum(RN_lst)/len(RN_lst)+1, 10) * math.log(F+1, 2) * math.exp(time_diff/S) + best_edge[4] = '%d' % max(RN_lst) + best_edge[5] = cids + best_edge[8] = '%.2f' % strength + best_edge[9] = ','.join(sorted(list(set(method_or_tissue)))) # unique methods or tissues, in string format + return best_edge + + + +##main + +d = {} +duniq = {} +for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, '*.*'))): + print('[merge_edges.py]: including %s.' % (fname)) + f = open(fname) + + for line in f: + line = line.strip() + if len(line.split('\t')) == 10 and not line in duniq: + duniq[line] = 1 + lst = line.split('\t') + target = lst[0] + tf = lst[1] + score = lst[2] + type_of_score = lst[3] + rids = lst[4] + cids = lst[5] + ll = lst[6] + date = lst[7] + strength = lst[8] + method_or_tissue = lst[9] + + key = target + tf + t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue) + + if not key in d: + d[key] = [t] + else: + d[key].append(t) + + f.close() + + + +# make html pages +folder_path = '../Data/temp/html_edges' +if not os.path.isdir(folder_path): + os.mkdir(folder_path) + + +print('[merge_edges.py]: Make text edge file...') +fout = open(MERGED_EDGE_FILE, 'w') +for k in d: + lst = make_new_edge(d[k]) + fout.write('\t'.join(lst) + '\n') +fout.close() + + +print('[merge_edges.py]: Make html edge files. May take a while...') +for k in d: + lst = make_new_edge(d[k]) + pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html + make_html_page(lst, folder_path + '/' + pagename) -- cgit v1.2.1