From 9cc0f036f571a2f3722fa104638d075c78818d9f Mon Sep 17 00:00:00 2001 From: Hui Lan Date: Sat, 15 Feb 2020 17:40:53 +0800 Subject: merge_edges.py: a more memory efficient method to compute an edge's net strength Compute an edge's strength on the fly instead of saving everything and then computing the net strength. The new function make_new_edge2 will replace make_new_edge. --- Code/merge_edges.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 5 deletions(-) diff --git a/Code/merge_edges.py b/Code/merge_edges.py index ffe338c..78bef5c 100644 --- a/Code/merge_edges.py +++ b/Code/merge_edges.py @@ -152,6 +152,34 @@ def make_new_edge(lst_tuple): return best_edge +def get_unique_cids(lst): + ''' Return a list of unique, sorted ChIP-seq IDs. ''' + cids = [] + for x in lst: + sublst = x.split() + cids.extend(sublst) + result = sorted(list(set(cids))) + if len(result) > 1 and result[0] == '.': + result.pop(0) + return ' '.join(result) + + +def make_new_edge2(d): + best_edge = list(d['best_edge']) + S = 365 * 10 + curr_date = datetime.datetime.now().strftime('%Y%m%d') + most_recent_edge_date = d['stat']['most_recent_edge_date'] + time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date) + F = d['stat']['F'] + strength = (d['stat']['sumr']/F) * math.log(d['stat']['sumRN']/F + 1, 10) * math.log(F+1, 2) * math.exp(time_diff/S) + best_edge[4] = d['stat']['maxRN'] + best_edge[5] = get_unique_cids(d['stat']['allCID']) + best_edge[7] = most_recent_edge_date + best_edge[8] = '%.2f' % strength + method_or_tissue = d['stat']['method_or_tissue'] + best_edge[9] = ','.join(sorted(list(set(method_or_tissue)))) + return best_edge + ##main @@ -182,9 +210,27 @@ for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, 'edges*.*'))): t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue) if not key in d: - d[key] = [t] - elif not t in d[key]: # make sure the tuple to be added to d[key] (a list) does not alreay exist. - d[key].append(t) + d[key] = {'best_edge':t, 'stat':{'sumr':abs(float(score)) , 'allCID':[cids], 'maxRN':get_number_of_RNAseq_ids(rids), 'sumRN':get_number_of_RNAseq_ids(rids), 'F':1, 'most_recent_edge_date':date, 'method_or_tissue':[method_or_tissue]}} + else: + # update best edge + old_score = float(d[key]['best_edge'][2]) + new_score = float(score) + if abs(new_score) > abs(old_score): + d[key]['best_edge'] = t + + # update stat information + d[key]['stat']['sumr'] += abs(new_score) + d[key]['stat']['sumRN'] += get_number_of_RNAseq_ids(rids) + d[key]['stat']['F'] += 1 + + if get_number_of_RNAseq_ids(rids) > d[key]['stat']['maxRN']: + d[key]['stat']['maxRN'] = get_number_of_RNAseq_ids(rids) + if date > d[key]['stat']['most_recent_edge_date']: + d[key]['stat']['most_recent_edge_date'] = date + if not cids in d[key]['stat']['allCID']: + d[key]['stat']['allCID'].append(cids) + if not method_or_tissue in d[key]['stat']['method_or_tissue']: + d[key]['stat']['method_or_tissue'].append(method_or_tissue) f.close() @@ -199,7 +245,7 @@ if not os.path.isdir(folder_path): write_log_file('[merge_edges.py]: Make text edge file %s ...' % (MERGED_EDGE_FILE), UPDATE_NETWORK_LOG_FILE) fout = open(MERGED_EDGE_FILE, 'w') for k in d: - lst = make_new_edge(d[k]) + lst = make_new_edge2(d[k]) fout.write('\t'.join(lst) + '\n') fout.close() @@ -213,7 +259,7 @@ if os.path.exists(db_fname): conn = sqlite3.connect(db_fname) for k in d: - lst = make_new_edge(d[k]) + lst = make_new_edge2(d[k]) # Make an html page for each edge (taking Big disk space). This will take about 5GB disk space # for 1.3 million edges, not very disk space friendly. So I use a database-driven dynamic method # to save space. -- cgit v1.2.1