%s

# Purpose: When edges.txt.* contains multiple lines representing the # same edge, merge them and keep only one edge. # # Usage: python merge_edges.py # # This script is used to produce a single file edges.txt for # the brain web application. It searches in EDGE_POOL_DIR for # edge files (with 10 columns) from many sources, most likely # having duplicated edges. It removes duplication and computes # strength for each edge. # # Note: make sure fname is edges.txt. # # Rationale: to save place, I am no longer going to use a full list of # RNA-seq experiment IDs in the fifth column. Use a number, i.e., the # number of RNA-seq IDs, instead. If no IDs are available, this # number is 1 (very conservative). However, I am still going to keep # a full list of ChIP-seq experiment IDs (the sixth column). # # Created on 3 August 2019 by Hui Lan import os, operator, sys, math, datetime, glob from configure import EDGE_POOL_DIR, MERGED_EDGE_FILE, UPDATE_NETWORK_LOG_FILE import sqlite3 def write_log_file(s, fname): if not os.path.exists(fname): return None f = open(fname, 'a') curr_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') s = '[' + curr_time + ']: ' + s if not '\n' in s: s += '\n' f.write(s) f.close() def get_number_of_RNAseq_ids(s): if s == '.': return 1 if s.isdigit(): return int(s) return len(s.split()) def add_dashes_to_date(s): return s[:4] + '-' + s[4:6] + '-' + s[6:] def split_id_and_name(s): lst = s.split() result = lst[0] if lst[0] != lst[1]: result = lst[0] + ' (' + lst[1] + ')' return result def make_html_list(s): if s.strip() == '': return 'None' result = '

' return result def make_html_page(lst, fname): tf = lst[1].split()[0] # ID only, no name target = lst[0].split()[0] head = '%s\n' % ('Target is ' + lst[0] + ' and TF is ' + lst[1]) head += '\n\n\n' s = '' s += '%s\n' % (head) body = '

TF: %s

\n' % (split_id_and_name(lst[1])) body += '

Target: %s

\n' % (split_id_and_name(lst[0])) body += '

Association strength: %s

\n' % (lst[8]) body += '

Edge made on %s.

\n' % (add_dashes_to_date(lst[7])) body += '

Methods: %s

\n' % (make_html_list(lst[9])) body += '

Evidence of binding: %s.

\n' % (lst[5] if lst[5] != '.' else 'TBA') body += 'Gene expression scatter plot\n' % (tf, target) body += '

For more detailed analysis, download our gene expression scatter plotting tool. No installation is required. Input: TF gene expression Target gene expression RNA-seq annotation

\n' % (tf, target) body += '

\n' s += '%s\n' % (body) s += '' f = open(fname, 'w') f.write(s) f.close() def fill_database(lst, cursor): ''' Store all edge information in a SQLite database, which can be retrieved in the Webapp.''' tf = lst[1].split()[0] # ID only, no name tf_name = split_id_and_name(lst[1]) target = lst[0].split()[0] target_name = split_id_and_name(lst[0]) strength = lst[8] edge_date = add_dashes_to_date(lst[7]) method = make_html_list(lst[9]) evidence = lst[5] if lst[5] != '.' else 'TBA' cursor.execute('INSERT INTO edge (target_id, target_name, tf_id, tf_name, strength, date, method, evidence) VALUES (?,?,?,?,?,?,?,?)', (target, target_name, tf, tf_name, strength, edge_date, method, evidence)) def compute_time_difference_in_days(t1, t2): ''' t1 and t2 has this format: yyyymmdd. ''' if not t1.isdigit() and length(t1) != 8: raise Exception('t1 format wrong in compute_time_difference_in_days.') if not t2.isdigit() and length(t2) != 8: raise Exception('t2 format wrong in compute_time_difference_in_days.') t1 = datetime.date(int(t1[:4]), int(t1[4:6]), int(t1[6:])) t2 = datetime.date(int(t2[:4]), int(t2[4:6]), int(t2[6:])) return (t1 - t2).days def get_unique_cids(lst): ''' Return a list of unique, sorted ChIP-seq IDs. ''' cids = [] for x in lst: sublst = x.split() cids.extend(sublst) result = sorted(list(set(cids))) if len(result) > 1 and result[0] == '.': result.pop(0) return ' '.join(result) def make_new_edge(d): best_edge = list(d['best_edge']) S = 365 * 10 curr_date = datetime.datetime.now().strftime('%Y%m%d') most_recent_edge_date = d['stat']['most_recent_edge_date'] time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date) F = d['stat']['F'] strength = (d['stat']['sumr']/F) * math.log(d['stat']['sumRN']/F + 1, 10) * math.log(F+1, 2) * math.exp(time_diff/S) best_edge[4] = '%d' % d['stat']['maxRN'] best_edge[5] = get_unique_cids(d['stat']['allCID']) best_edge[7] = most_recent_edge_date best_edge[8] = '%.2f' % strength method_or_tissue = d['stat']['method_or_tissue'] best_edge[9] = ','.join(sorted(list(set(method_or_tissue)))) return best_edge ##main write_log_file('[merge_edges.py] Go through all edge files in the edge pool %s.' % (EDGE_POOL_DIR) , UPDATE_NETWORK_LOG_FILE) d = {} # d will contain all edges computed so far, where the key is TargetGeneID_TFGeneID, and the value is a list of tuples. Each tuple is a historical edge. file_count = 0 for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, 'edges*.*'))): file_count += 1 print('[merge_edges.py] Including %s. Dictionary size %d.' % (fname, len(d))) #write_log_file('[merge_edges.py] including file %s.' % (fname) , UPDATE_NETWORK_LOG_FILE) f = open(fname) for line in f: line = line.strip() if len(line.split('\t')) == 10: # a valid edge line has 10 fields, separated by a TAB lst = line.split('\t') target = lst[0] tf = lst[1] score = lst[2] type_of_score = lst[3] rids = lst[4] cids = lst[5] ll = lst[6] date = lst[7] strength = lst[8] method_or_tissue = lst[9] key = target.split()[0] + '_' + tf.split()[0] # target or tf has two fields, Gene ID and Gene Name, split()[0] means using Gene ID only. t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue) if not key in d: d[key] = {'best_edge':t, 'stat':{'sumr':abs(float(score)) , 'allCID':[cids], 'maxRN':get_number_of_RNAseq_ids(rids), 'sumRN':get_number_of_RNAseq_ids(rids), 'F':1, 'most_recent_edge_date':date, 'method_or_tissue':[method_or_tissue]}} else: # update best edge old_score = float(d[key]['best_edge'][2]) new_score = float(score) if abs(new_score) > abs(old_score): d[key]['best_edge'] = t # update stat information d[key]['stat']['sumr'] += abs(new_score) d[key]['stat']['sumRN'] += get_number_of_RNAseq_ids(rids) d[key]['stat']['F'] += 1 if get_number_of_RNAseq_ids(rids) > d[key]['stat']['maxRN']: d[key]['stat']['maxRN'] = get_number_of_RNAseq_ids(rids) if date > d[key]['stat']['most_recent_edge_date']: d[key]['stat']['most_recent_edge_date'] = date if not cids in d[key]['stat']['allCID']: d[key]['stat']['allCID'].append(cids) if not method_or_tissue in d[key]['stat']['method_or_tissue']: d[key]['stat']['method_or_tissue'].append(method_or_tissue) f.close() write_log_file('[merge_edges.py] BRAIN has collected edges from %d files.' % (file_count) , UPDATE_NETWORK_LOG_FILE) # make html pages folder_path = '../Data/temp/html_edges' if not os.path.isdir(folder_path): os.mkdir(folder_path) write_log_file('[merge_edges.py] Make text edge file %s ...' % (MERGED_EDGE_FILE), UPDATE_NETWORK_LOG_FILE) fout = open(MERGED_EDGE_FILE, 'w') for k in d: lst = make_new_edge(d[k]) fout.write('\t'.join(lst) + '\n') fout.close() db_fname = os.path.join(folder_path, 'edges.sqlite') write_log_file('[merge_edges.py] Make SQLite database file %s.' % (db_fname), UPDATE_NETWORK_LOG_FILE) if os.path.exists(db_fname): os.remove(db_fname) conn = sqlite3.connect(db_fname) c = conn.cursor() c.execute('CREATE TABLE IF NOT EXISTS edge (target_id text, target_name text, tf_id text, tf_name text, strength text, date text, method text, evidence text)') for k in d: lst = make_new_edge(d[k]) # Make an html page for each edge (taking Big disk space). This will take about 5GB disk space # for 1.3 million edges, not very disk space friendly. So I use a database-driven dynamic method # to save space. # pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html # make_html_page(lst, folder_path + '/' + pagename) # Write to a SQLite database file called edges.sqlite, which will be used for the Webapp. # edges.sqlite will be put under static/edges/ for querying. fill_database(lst, c) conn.commit() conn.close()