merge_edges.py: a more memory-efficient method to compute an edge's net strength

Compute an edge's net strength on the fly from running statistics, instead of saving every edge tuple and computing the net strength afterwards. The new function make_new_edge2 replaces make_new_edge.
author: Hui Lan <lanhui@zjnu.edu.cn> 2020-02-15 17:40:53 +0800
committer: Hui Lan <lanhui@zjnu.edu.cn> 2020-02-15 17:40:53 +0800
commit: 9cc0f036f571a2f3722fa104638d075c78818d9f (patch)
tree: 7c5f539ff8aeeb795fc8765effd0e74c5b9f03a9
parent: a93b3e300b20e2df9997fc3f581efee7b348aae3 (diff)
1 files changed, 51 insertions, 5 deletions
diff --git a/Code/merge_edges.py b/Code/merge_edges.py
index ffe338c..78bef5c 100644
--- a/Code/merge_edges.py
+++ b/Code/merge_edges.py
@@ -152,6 +152,34 @@ def make_new_edge(lst_tuple):
     return best_edge
 
 
+def get_unique_cids(lst):
+    ''' Return the unique ChIP-seq IDs found in lst as one sorted, space-separated
+    string.  Each item of lst is split on whitespace.  A '.' entry (presumably the
+    "no ID" placeholder) is dropped whenever any other ID is present. '''
+    cids = set()
+    for x in lst:
+        cids.update(x.split())
+    if len(cids) > 1:
+        cids.discard('.')  # by membership, not by sort position
+    return ' '.join(sorted(cids))
+
+
+def make_new_edge2(d):
+    ''' Return a copy of d['best_edge'] (as a list) with fields 4, 5, 7, 8 and 9
+    recomputed from the running statistics in d['stat'].  Field 4 is stored as a string. '''
+    stat = d['stat']
+    F = stat['F']  # number of edges merged under this key; initialised to 1, so never zero
+    S = 365 * 10   # divisor for the day difference below (ten 365-day years)
+    curr_date = datetime.datetime.now().strftime('%Y%m%d')
+    time_diff = compute_time_difference_in_days(stat['most_recent_edge_date'], curr_date)
+    strength = (stat['sumr']/F) * math.log(stat['sumRN']/F + 1, 10) * math.log(F+1, 2) * math.exp(time_diff/S)
+    best_edge = list(d['best_edge'])
+    best_edge[4] = str(stat['maxRN'])  # a bare number here breaks '\t'.join() in the callers
+    best_edge[5] = get_unique_cids(stat['allCID'])
+    best_edge[7], best_edge[8] = stat['most_recent_edge_date'], '%.2f' % strength
+    best_edge[9] = ','.join(sorted(set(stat['method_or_tissue'])))
+    return best_edge
+    
 
 ##main
 
@@ -182,9 +210,27 @@ for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, 'edges*.*'))):
             t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue)
 	
             if not key in d:
-                d[key] = [t]
-            elif not t in d[key]: # make sure the tuple to be added to d[key] (a list) does not alreay exist.
-                d[key].append(t)
+                d[key] = {'best_edge':t, 'stat':{'sumr':abs(float(score)) , 'allCID':[cids], 'maxRN':get_number_of_RNAseq_ids(rids), 'sumRN':get_number_of_RNAseq_ids(rids), 'F':1, 'most_recent_edge_date':date, 'method_or_tissue':[method_or_tissue]}}
+            else:
+                # update best edge
+                old_score = float(d[key]['best_edge'][2])
+                new_score = float(score)
+                if abs(new_score) > abs(old_score):
+                    d[key]['best_edge']  = t
+
+                # update stat information
+                d[key]['stat']['sumr'] += abs(new_score)
+                d[key]['stat']['sumRN'] += get_number_of_RNAseq_ids(rids)
+                d[key]['stat']['F'] += 1
+
+                if get_number_of_RNAseq_ids(rids) > d[key]['stat']['maxRN']:
+                    d[key]['stat']['maxRN'] = get_number_of_RNAseq_ids(rids)
+                if date > d[key]['stat']['most_recent_edge_date']:
+                    d[key]['stat']['most_recent_edge_date'] = date
+                if not cids in d[key]['stat']['allCID']:
+                    d[key]['stat']['allCID'].append(cids)
+                if not method_or_tissue in d[key]['stat']['method_or_tissue']:
+                    d[key]['stat']['method_or_tissue'].append(method_or_tissue)
 
     f.close()
 
@@ -199,7 +245,7 @@ if not os.path.isdir(folder_path):
 write_log_file('[merge_edges.py]: Make text edge file %s ...' % (MERGED_EDGE_FILE), UPDATE_NETWORK_LOG_FILE)
 fout = open(MERGED_EDGE_FILE, 'w')
 for k in d:
-    lst = make_new_edge(d[k])
+    lst = make_new_edge2(d[k])
     fout.write('\t'.join(lst) + '\n')
 fout.close()
 
@@ -213,7 +259,7 @@ if os.path.exists(db_fname):
 
 conn = sqlite3.connect(db_fname)
 for k in d:
-    lst = make_new_edge(d[k])
+    lst = make_new_edge2(d[k])
     # Make an html page for each edge (taking Big disk space).  This will take about 5GB disk space
     # for 1.3 million edges, not very disk space friendly.  So I use a database-driven dynamic method
     # to save space.
author	Hui Lan <lanhui@zjnu.edu.cn>	2020-02-15 17:40:53 +0800
committer	Hui Lan <lanhui@zjnu.edu.cn>	2020-02-15 17:40:53 +0800
commit	9cc0f036f571a2f3722fa104638d075c78818d9f (patch)
tree	7c5f539ff8aeeb795fc8765effd0e74c5b9f03a9
parent	a93b3e300b20e2df9997fc3f581efee7b348aae3 (diff)