diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2020-02-15 17:40:53 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2020-02-15 17:40:53 +0800 |
commit | 9cc0f036f571a2f3722fa104638d075c78818d9f (patch) | |
tree | 7c5f539ff8aeeb795fc8765effd0e74c5b9f03a9 | |
parent | a93b3e300b20e2df9997fc3f581efee7b348aae3 (diff) |
merge_edges.py: a more memory-efficient method to compute an edge's net strength
Compute an edge's strength on the fly instead of saving everything and then computing the net strength.
The new function make_new_edge2 will replace make_new_edge.
-rw-r--r-- | Code/merge_edges.py | 56 |
1 file changed, 51 insertions, 5 deletions
diff --git a/Code/merge_edges.py b/Code/merge_edges.py index ffe338c..78bef5c 100644 --- a/Code/merge_edges.py +++ b/Code/merge_edges.py @@ -152,6 +152,34 @@ def make_new_edge(lst_tuple): return best_edge +def get_unique_cids(lst): + ''' Return a list of unique, sorted ChIP-seq IDs. ''' + cids = [] + for x in lst: + sublst = x.split() + cids.extend(sublst) + result = sorted(list(set(cids))) + if len(result) > 1 and result[0] == '.': + result.pop(0) + return ' '.join(result) + + +def make_new_edge2(d): + best_edge = list(d['best_edge']) + S = 365 * 10 + curr_date = datetime.datetime.now().strftime('%Y%m%d') + most_recent_edge_date = d['stat']['most_recent_edge_date'] + time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date) + F = d['stat']['F'] + strength = (d['stat']['sumr']/F) * math.log(d['stat']['sumRN']/F + 1, 10) * math.log(F+1, 2) * math.exp(time_diff/S) + best_edge[4] = d['stat']['maxRN'] + best_edge[5] = get_unique_cids(d['stat']['allCID']) + best_edge[7] = most_recent_edge_date + best_edge[8] = '%.2f' % strength + method_or_tissue = d['stat']['method_or_tissue'] + best_edge[9] = ','.join(sorted(list(set(method_or_tissue)))) + return best_edge + ##main @@ -182,9 +210,27 @@ for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, 'edges*.*'))): t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue) if not key in d: - d[key] = [t] - elif not t in d[key]: # make sure the tuple to be added to d[key] (a list) does not alreay exist. 
- d[key].append(t) + d[key] = {'best_edge':t, 'stat':{'sumr':abs(float(score)) , 'allCID':[cids], 'maxRN':get_number_of_RNAseq_ids(rids), 'sumRN':get_number_of_RNAseq_ids(rids), 'F':1, 'most_recent_edge_date':date, 'method_or_tissue':[method_or_tissue]}} + else: + # update best edge + old_score = float(d[key]['best_edge'][2]) + new_score = float(score) + if abs(new_score) > abs(old_score): + d[key]['best_edge'] = t + + # update stat information + d[key]['stat']['sumr'] += abs(new_score) + d[key]['stat']['sumRN'] += get_number_of_RNAseq_ids(rids) + d[key]['stat']['F'] += 1 + + if get_number_of_RNAseq_ids(rids) > d[key]['stat']['maxRN']: + d[key]['stat']['maxRN'] = get_number_of_RNAseq_ids(rids) + if date > d[key]['stat']['most_recent_edge_date']: + d[key]['stat']['most_recent_edge_date'] = date + if not cids in d[key]['stat']['allCID']: + d[key]['stat']['allCID'].append(cids) + if not method_or_tissue in d[key]['stat']['method_or_tissue']: + d[key]['stat']['method_or_tissue'].append(method_or_tissue) f.close() @@ -199,7 +245,7 @@ if not os.path.isdir(folder_path): write_log_file('[merge_edges.py]: Make text edge file %s ...' % (MERGED_EDGE_FILE), UPDATE_NETWORK_LOG_FILE) fout = open(MERGED_EDGE_FILE, 'w') for k in d: - lst = make_new_edge(d[k]) + lst = make_new_edge2(d[k]) fout.write('\t'.join(lst) + '\n') fout.close() @@ -213,7 +259,7 @@ if os.path.exists(db_fname): conn = sqlite3.connect(db_fname) for k in d: - lst = make_new_edge(d[k]) + lst = make_new_edge2(d[k]) # Make an html page for each edge (taking Big disk space). This will take about 5GB disk space # for 1.3 million edges, not very disk space friendly. So I use a database-driven dynamic method # to save space. |