diff options
Diffstat (limited to 'Code')
| -rw-r--r-- | Code/merge_edges.py | 360 | 
1 files changed, 180 insertions, 180 deletions
| diff --git a/Code/merge_edges.py b/Code/merge_edges.py index e0b1c61..936faf5 100644 --- a/Code/merge_edges.py +++ b/Code/merge_edges.py @@ -1,180 +1,180 @@ -# Purpose: When edges.txt.* contains multiple lines representing the
 -#          same edge, merge them and keep only one edge.
 -#
 -# Usage: python merge_edges.py
 -#
 -#        This script is used to produce a single file edges.txt for
 -#        the brain web application.  It searches in EDGE_POOL_DIR for
 -#        edge files (with 10 columns) from many sources, most likely
 -#        having duplicated edges.  It removes duplication and computes
 -#        strength for each edge.
 -#
 -# Note: make sure fname is edges.txt.
 -#
 -# Rationale: to save place, I am no longer going to use a full list of
 -# RNA-seq experiment IDs in the fifth column.  Use a number, i.e., the
 -# number of RNA-seq IDs, instead.  If no IDs are available, this
 -# number is 1 (very conservative).  However, I am still going to keep
 -# a full list of ChIP-seq experiment IDs (the sixth column).
 -#
 -# Created on 3 August 2019 by Hui Lan <lanhui@zjnu.edu.cn>
 -
 -
 -import os, operator, sys, math, datetime, glob
 -from configure import EDGE_POOL_DIR, MERGED_EDGE_FILE
 -
 -def get_number_of_RNAseq_ids(s):
 -    if s == '.':
 -        return 1
 -    if s.isdigit():
 -        return int(s)
 -    return len(s.split())
 -
 -
 -def add_dashes_to_date(s):
 -    return s[:4] + '-' + s[4:6] + '-' + s[6:]
 -
 -
 -def split_id_and_name(s):
 -    lst = s.split()
 -    result = lst[0]
 -    if lst[0] != lst[1]:
 -        result = lst[0] + ' (' + lst[1] + ')'
 -    return result
 -
 -
 -def make_html_list(s):
 -    if s.strip() == '':
 -        return 'None'
 -    result = '<ul>'
 -    for method in s.split(','):
 -        result += '<li>%s</li>' % (method)
 -    result += '</ul>'
 -    return result
 -        
 -
 -def make_html_page(lst, fname):
 -    tf     = lst[1].split()[0] # ID only, no name
 -    target = lst[0].split()[0] 
 -
 -    head = '<title>%s</title>\n' % ('Target is ' + lst[0] + ' and TF is ' + lst[1])
 -    head +=  '<link href="./c3.min.css" rel="stylesheet" />\n<script src="./d3.min.js"></script>\n<script src="./c3.min.js"></script>\n<script src="./scatterplot.js"></script>'
 -    s = '<html>'
 -    s += '<head>%s</head>\n' % (head)
 -    body =  '<p>TF is %s. </p>\n' % (split_id_and_name(lst[1]))
 -    body += '<p>Target is %s. </p>\n' % (split_id_and_name(lst[0]))
 -    body += '<p>Association strength: %s.</p>\n' % (lst[8])
 -    body += '<p>Edge made on %s. </p>\n' % (add_dashes_to_date(lst[7]))
 -    body += '<p>Methods: %s</p>\n' % (make_html_list(lst[9]))
 -    body += '<p>Evidence of binding: %s.</p>\n' % (lst[5] if lst[5] != '.' else 'TBA')
 -    body += '<a id="myLink" href="javascript:void(0);" onclick="drawScatterPlot(\'json/%s.json\', \'json/%s.json\', \'rnaseq_info_database.json\', [\'.\']);">Click for gene expression scatter plot</a>\n' % (tf, target)
 -    body += '<p>For more detailed analysis, <a href="gene-expression-level-scatterplot-by-XuMengqi.zip">download</a> our gene expression scatter plotting tool.  No installation is required.  Input data: <a href="json/%s.json">TF gene expression</a>  <a href="json/%s.json">Target gene expression</a>  <a href="rnaseq_info_database.json">RNA-seq annotation</a></p>\n' % (tf, target)    
 -    body += '<p id="chart"></p>\n'
 -    s += '<body>%s</body>\n' % (body)
 -    s += '</html>'
 -    f = open(fname, 'w')
 -    f.write(s)
 -    f.close()
 -
 -
 -def compute_time_difference_in_days(t1, t2):
 -    ''' t1 and t2 has this format: yyyymmdd. '''
 -    if not t1.isnumeric() and length(t1) != 8:
 -        raise Exception('t1 format wrong in compute_time_difference_in_days.')
 -    if not t2.isnumeric() and length(t2) != 8:
 -        raise Exception('t2 format wrong in compute_time_difference_in_days.')
 -    
 -    t1 = datetime.date(int(t1[:4]), int(t1[4:6]), int(t1[6:]))
 -    t2 = datetime.date(int(t2[:4]), int(t2[4:6]), int(t2[6:]))
 -    return (t1 - t2).days
 -
 -
 -def make_new_edge(lst_tuple):
 -    lst = sorted(lst_tuple, reverse=True, key = lambda x: abs(float(x[2]))) # sort according to absolute value of score
 -    best_edge = list( lst[0] )
 -
 -    # see section 'Ranking edges using frecency' in the brain documentation
 -    F = len(lst_tuple)
 -
 -    RN_lst = []
 -    r_lst = []
 -    most_recent_edge_date = '00000000'
 -    method_or_tissue = []
 -    cids = ''
 -    for t in lst:
 -        r_lst.append( abs(float(t[2])) )
 -        rids = t[4]
 -        if t[5] > cids:
 -            cids = t[5]
 -        RN_lst.append( get_number_of_RNAseq_ids(rids) )
 -        date = t[7]
 -        if date > most_recent_edge_date:
 -            most_recent_edge_date = date
 -        method_or_tissue.append(t[9])
 -    S = 365 * 10
 -    curr_date = datetime.datetime.now().strftime('%Y%m%d')
 -    time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date)
 -    strength = sum(r_lst)/len(r_lst) * math.log(sum(RN_lst)/len(RN_lst)+1, 10) * math.log(F+1, 2) * math.exp(time_diff/S)
 -    best_edge[4] = '%d' % max(RN_lst)
 -    best_edge[5] = cids
 -    best_edge[8] = '%.2f' % strength
 -    best_edge[9] = ','.join(sorted(list(set(method_or_tissue)))) # unique methods or tissues, in string format
 -    return best_edge
 -
 -    
 -
 -##main
 -
 -d = {}
 -duniq = {}
 -for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, '*.*'))):
 -    print('[merge_edges.py]: including %s.' % (fname))
 -    f = open(fname)
 -
 -    for line in f:
 -        line = line.strip()
 -        if len(line.split('\t')) == 10 and not line in duniq:
 -            duniq[line] = 1
 -            lst = line.split('\t')
 -            target = lst[0]
 -            tf = lst[1]
 -            score = lst[2]
 -            type_of_score = lst[3]
 -            rids = lst[4]
 -            cids = lst[5]
 -            ll = lst[6]
 -            date = lst[7]
 -            strength = lst[8]
 -            method_or_tissue = lst[9]
 -
 -            key = target + tf
 -            t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue)
 -	
 -            if not key in d:
 -                d[key] = [t]
 -            else:
 -                d[key].append(t)
 -
 -    f.close()
 -
 -    
 -
 -# make html pages
 -folder_path = '../Data/temp/html_edges'
 -if not os.path.isdir(folder_path):
 -    os.mkdir(folder_path)
 -
 -
 -print('[merge_edges.py]: Make text edge file...')
 -fout = open(MERGED_EDGE_FILE, 'w')
 -for k in d:
 -    lst = make_new_edge(d[k])
 -    fout.write('\t'.join(lst) + '\n')
 -fout.close()
 -
 -
 -print('[merge_edges.py]: Make html edge files.  May take a while...')
 -for k in d:
 -    lst = make_new_edge(d[k])
 -    pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html
 -    make_html_page(lst, folder_path + '/' + pagename)
 +# Purpose: When edges.txt.* contains multiple lines representing the +#          same edge, merge them and keep only one edge. +# +# Usage: python merge_edges.py +# +#        This script is used to produce a single file edges.txt for +#        the brain web application.  It searches in EDGE_POOL_DIR for +#        edge files (with 10 columns) from many sources, most likely +#        having duplicated edges.  It removes duplication and computes +#        strength for each edge. +# +# Note: make sure fname is edges.txt. +# +# Rationale: to save place, I am no longer going to use a full list of +# RNA-seq experiment IDs in the fifth column.  Use a number, i.e., the +# number of RNA-seq IDs, instead.  If no IDs are available, this +# number is 1 (very conservative).  However, I am still going to keep +# a full list of ChIP-seq experiment IDs (the sixth column). +# +# Created on 3 August 2019 by Hui Lan <lanhui@zjnu.edu.cn> + + +import os, operator, sys, math, datetime, glob +from configure import EDGE_POOL_DIR, MERGED_EDGE_FILE + +def get_number_of_RNAseq_ids(s): +    if s == '.': +        return 1 +    if s.isdigit(): +        return int(s) +    return len(s.split()) + + +def add_dashes_to_date(s): +    return s[:4] + '-' + s[4:6] + '-' + s[6:] + + +def split_id_and_name(s): +    lst = s.split() +    result = lst[0] +    if lst[0] != lst[1]: +        result = lst[0] + ' (' + lst[1] + ')' +    return result + + +def make_html_list(s): +    if s.strip() == '': +        return 'None' +    result = '<ul>' +    for method in s.split(','): +        result += '<li>%s</li>' % (method) +    result += '</ul>' +    return result +         + +def make_html_page(lst, fname): +    tf     = lst[1].split()[0] # ID only, no name +    target = lst[0].split()[0]  + +    head = '<title>%s</title>\n' % ('Target is ' + lst[0] + ' and TF is ' + lst[1]) +    head +=  '<link href="./c3.min.css" rel="stylesheet" />\n<script src="./d3.min.js"></script>\n<script src="./c3.min.js"></script>\n<script src="./scatterplot.js"></script>' +    s = '<html>' +    s += '<head>%s</head>\n' % (head) +    body =  '<p>TF is %s. </p>\n' % (split_id_and_name(lst[1])) +    body += '<p>Target is %s. </p>\n' % (split_id_and_name(lst[0])) +    body += '<p>Association strength: %s.</p>\n' % (lst[8]) +    body += '<p>Edge made on %s. </p>\n' % (add_dashes_to_date(lst[7])) +    body += '<p>Methods: %s</p>\n' % (make_html_list(lst[9])) +    body += '<p>Evidence of binding: %s.</p>\n' % (lst[5] if lst[5] != '.' else 'TBA') +    body += '<a id="myLink" href="javascript:void(0);" onclick="drawScatterPlot(\'json/%s.json\', \'json/%s.json\', \'rnaseq_info_database.json\', [\'.\']);">Click for gene expression scatter plot</a>\n' % (tf, target) +    body += '<p>For more detailed analysis, <a href="gene-expression-level-scatterplot-by-XuMengqi.zip">download</a> our gene expression scatter plotting tool.  No installation is required.  Input data: <a href="json/%s.json">TF gene expression</a>  <a href="json/%s.json">Target gene expression</a>  <a href="rnaseq_info_database.json">RNA-seq annotation</a></p>\n' % (tf, target)     +    body += '<p id="chart"></p>\n' +    s += '<body>%s</body>\n' % (body) +    s += '</html>' +    f = open(fname, 'w') +    f.write(s) +    f.close() + + +def compute_time_difference_in_days(t1, t2): +    ''' t1 and t2 has this format: yyyymmdd. ''' +    if not t1.isnumeric() and length(t1) != 8: +        raise Exception('t1 format wrong in compute_time_difference_in_days.') +    if not t2.isnumeric() and length(t2) != 8: +        raise Exception('t2 format wrong in compute_time_difference_in_days.') +     +    t1 = datetime.date(int(t1[:4]), int(t1[4:6]), int(t1[6:])) +    t2 = datetime.date(int(t2[:4]), int(t2[4:6]), int(t2[6:])) +    return (t1 - t2).days + + +def make_new_edge(lst_tuple): +    lst = sorted(lst_tuple, reverse=True, key = lambda x: abs(float(x[2]))) # sort according to absolute value of score +    best_edge = list( lst[0] ) + +    # see section 'Ranking edges using frecency' in the brain documentation +    F = len(lst_tuple) + +    RN_lst = [] +    r_lst = [] +    most_recent_edge_date = '00000000' +    method_or_tissue = [] +    cids = '' +    for t in lst: +        r_lst.append( abs(float(t[2])) ) +        rids = t[4] +        if t[5] > cids: +            cids = t[5] +        RN_lst.append( get_number_of_RNAseq_ids(rids) ) +        date = t[7] +        if date > most_recent_edge_date: +            most_recent_edge_date = date +        method_or_tissue.append(t[9]) +    S = 365 * 10 +    curr_date = datetime.datetime.now().strftime('%Y%m%d') +    time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date) +    strength = sum(r_lst)/len(r_lst) * math.log(sum(RN_lst)/len(RN_lst)+1, 10) * math.log(F+1, 2) * math.exp(time_diff/S) +    best_edge[4] = '%d' % max(RN_lst) +    best_edge[5] = cids +    best_edge[8] = '%.2f' % strength +    best_edge[9] = ','.join(sorted(list(set(method_or_tissue)))) # unique methods or tissues, in string format +    return best_edge + +     + +##main + +d = {} +duniq = {} +for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, '*.*'))): +    print('[merge_edges.py]: including %s.' % (fname)) +    f = open(fname) + +    for line in f: +        line = line.strip() +        if len(line.split('\t')) == 10 and not line in duniq: +            duniq[line] = 1 +            lst = line.split('\t') +            target = lst[0] +            tf = lst[1] +            score = lst[2] +            type_of_score = lst[3] +            rids = lst[4] +            cids = lst[5] +            ll = lst[6] +            date = lst[7] +            strength = lst[8] +            method_or_tissue = lst[9] + +            key = target + tf +            t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue) +	 +            if not key in d: +                d[key] = [t] +            else: +                d[key].append(t) + +    f.close() + +     + +# make html pages +folder_path = '../Data/temp/html_edges' +if not os.path.isdir(folder_path): +    os.mkdir(folder_path) + + +print('[merge_edges.py]: Make text edge file...') +fout = open(MERGED_EDGE_FILE, 'w') +for k in d: +    lst = make_new_edge(d[k]) +    fout.write('\t'.join(lst) + '\n') +fout.close() + + +print('[merge_edges.py]: Make html edge files.  May take a while...') +for k in d: +    lst = make_new_edge(d[k]) +    pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html +    make_html_page(lst, folder_path + '/' + pagename) | 
