summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Hui Lan <lanhui@zjnu.edu.cn> 2020-02-15 17:40:53 +0800
committer Hui Lan <lanhui@zjnu.edu.cn> 2020-02-15 17:40:53 +0800
commit 9cc0f036f571a2f3722fa104638d075c78818d9f (patch)
tree 7c5f539ff8aeeb795fc8765effd0e74c5b9f03a9
parent a93b3e300b20e2df9997fc3f581efee7b348aae3 (diff)
merge_edges.py: a more memory efficient method to compute an edge's net strength
Compute an edge's strength on the fly instead of saving everything and then computing the net strength. The new function make_new_edge2 will replace make_new_edge.
-rw-r--r-- Code/merge_edges.py 56
1 files changed, 51 insertions, 5 deletions
diff --git a/Code/merge_edges.py b/Code/merge_edges.py
index ffe338c..78bef5c 100644
--- a/Code/merge_edges.py
+++ b/Code/merge_edges.py
@@ -152,6 +152,34 @@ def make_new_edge(lst_tuple):
return best_edge
+def get_unique_cids(lst):
+    ''' Return the unique ChIP-seq IDs in lst as a single space-separated string.
+
+    lst -- a list of strings; each string holds zero or more IDs separated
+           by whitespace.
+
+    The IDs are de-duplicated and sorted.  A lone '.' is kept (so that
+    ['.'] gives '.'), but '.' is dropped as soon as any other ID is present.
+    An empty lst gives ''.
+    '''
+    cids = set()
+    for x in lst:
+        cids.update(x.split())
+    # Drop '.' explicitly instead of relying on it sorting first: an ID that
+    # starts with a character below '.' (e.g. '-') would otherwise keep it.
+    if len(cids) > 1:
+        cids.discard('.')
+    return ' '.join(sorted(cids))
+
+
+def make_new_edge2(d):
+    ''' Return the merged edge (a list) for one edge key.
+
+    d -- a dictionary with two entries:
+         d['best_edge'] is the edge tuple kept for this key;
+         d['stat'] is a dictionary with the running statistics sumr, sumRN,
+         maxRN, F, allCID, most_recent_edge_date and method_or_tissue.
+
+    The result is a list copy of d['best_edge'] whose fields 4, 5, 7, 8 and 9
+    are overwritten with maxRN, the unique ChIP-seq IDs, the most recent edge
+    date, the net strength ('%.2f') and the comma-joined unique
+    method_or_tissue values.  d itself is not modified.
+
+    strength = (sumr/F) * log10(sumRN/F + 1) * log2(F + 1) * exp(time_diff/S)
+    '''
+    stat = d['stat']
+    best_edge = list(d['best_edge'])
+    S = 365.0 * 10  # a float, so time_diff/S is a true division on any Python version
+    curr_date = datetime.datetime.now().strftime('%Y%m%d')
+    most_recent_edge_date = stat['most_recent_edge_date']
+    # NOTE(review): the sign of time_diff is decided by compute_time_difference_in_days
+    # (defined elsewhere); confirm that the exponential term weights recent edges as intended.
+    time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date)
+    F = float(stat['F'])  # float, so the per-edge averages below are not truncated
+    strength = (stat['sumr']/F) * math.log(stat['sumRN']/F + 1, 10) * math.log(F + 1, 2) * math.exp(time_diff/S)
+    # Every field must be a string because the result is written with '\t'.join().
+    best_edge[4] = str(stat['maxRN'])
+    best_edge[5] = get_unique_cids(stat['allCID'])
+    best_edge[7] = most_recent_edge_date
+    best_edge[8] = '%.2f' % strength
+    best_edge[9] = ','.join(sorted(set(stat['method_or_tissue'])))
+    return best_edge
+
##main
@@ -182,9 +210,27 @@ for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, 'edges*.*'))):
t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue)
if not key in d:
- d[key] = [t]
- elif not t in d[key]: # make sure the tuple to be added to d[key] (a list) does not alreay exist.
- d[key].append(t)
+ d[key] = {'best_edge':t, 'stat':{'sumr':abs(float(score)) , 'allCID':[cids], 'maxRN':get_number_of_RNAseq_ids(rids), 'sumRN':get_number_of_RNAseq_ids(rids), 'F':1, 'most_recent_edge_date':date, 'method_or_tissue':[method_or_tissue]}}
+ else:
+ # update best edge
+ old_score = float(d[key]['best_edge'][2])
+ new_score = float(score)
+ if abs(new_score) > abs(old_score):
+ d[key]['best_edge'] = t
+
+ # update stat information
+ d[key]['stat']['sumr'] += abs(new_score)
+ d[key]['stat']['sumRN'] += get_number_of_RNAseq_ids(rids)
+ d[key]['stat']['F'] += 1
+
+ if get_number_of_RNAseq_ids(rids) > d[key]['stat']['maxRN']:
+ d[key]['stat']['maxRN'] = get_number_of_RNAseq_ids(rids)
+ if date > d[key]['stat']['most_recent_edge_date']:
+ d[key]['stat']['most_recent_edge_date'] = date
+ if not cids in d[key]['stat']['allCID']:
+ d[key]['stat']['allCID'].append(cids)
+ if not method_or_tissue in d[key]['stat']['method_or_tissue']:
+ d[key]['stat']['method_or_tissue'].append(method_or_tissue)
f.close()
@@ -199,7 +245,7 @@ if not os.path.isdir(folder_path):
write_log_file('[merge_edges.py]: Make text edge file %s ...' % (MERGED_EDGE_FILE), UPDATE_NETWORK_LOG_FILE)
fout = open(MERGED_EDGE_FILE, 'w')
for k in d:
- lst = make_new_edge(d[k])
+ lst = make_new_edge2(d[k])
fout.write('\t'.join(lst) + '\n')
fout.close()
@@ -213,7 +259,7 @@ if os.path.exists(db_fname):
conn = sqlite3.connect(db_fname)
for k in d:
- lst = make_new_edge(d[k])
+ lst = make_new_edge2(d[k])
# Make an html page for each edge (taking Big disk space). This will take about 5GB disk space
# for 1.3 million edges, not very disk space friendly. So I use a database-driven dynamic method
# to save space.