diff options
-rw-r--r-- | Code/merge_edges.py | 56 |
1 files changed, 51 insertions, 5 deletions
diff --git a/Code/merge_edges.py b/Code/merge_edges.py
index ffe338c..78bef5c 100644
--- a/Code/merge_edges.py
+++ b/Code/merge_edges.py
@@ -152,6 +152,34 @@ def make_new_edge(lst_tuple):
     return best_edge
 
 
+def get_unique_cids(lst):
+    ''' Return the unique ChIP-seq IDs found in lst as one sorted, space-separated string. '''
+    # Each element of lst may itself hold several whitespace-separated IDs.
+    cids = set()
+    for x in lst:
+        cids.update(x.split())
+    # '.' is dropped whenever at least one other ID is present, wherever it would sort.
+    if len(cids) > 1:
+        cids.discard('.')
+    return ' '.join(sorted(cids))
+
+
+def make_new_edge2(d):
+    ''' Return a copy (list) of d['best_edge'] with fields 4, 5, 7, 8, 9 rebuilt from d['stat']; every field is a string. '''
+    best_edge = list(d['best_edge'])
+    stat = d['stat']
+    S = 365.0 * 10  # time scale in days; float keeps time_diff/S a true division under Python 2 as well
+    curr_date = datetime.datetime.now().strftime('%Y%m%d')
+    time_diff = compute_time_difference_in_days(stat['most_recent_edge_date'], curr_date)  # NOTE(review): confirm the helper's sign convention for exp(time_diff/S)
+    F = float(stat['F'])  # number of records merged under this key; float avoids floor division of sumRN/F
+    strength = (stat['sumr']/F) * math.log(stat['sumRN']/F + 1, 10) * math.log(F+1, 2) * math.exp(time_diff/S)
+    best_edge[4] = str(stat['maxRN'])  # maxRN is numeric, but callers '\t'.join() the result, which requires strings
+    best_edge[5] = get_unique_cids(stat['allCID'])
+    best_edge[7] = stat['most_recent_edge_date']
+    best_edge[8] = '%.2f' % strength
+    best_edge[9] = ','.join(sorted(set(stat['method_or_tissue'])))
+    return best_edge
+
 
 ##main
 
@@ -182,9 +210,27 @@ for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, 'edges*.*'))):
             t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue)
 
+            num_rids = get_number_of_RNAseq_ids(rids)  # computed once; feeds both maxRN and sumRN below
             if not key in d:
-                d[key] = [t]
-            elif not t in d[key]: # make sure the tuple to be added to d[key] (a list) does not alreay exist. 
-                d[key].append(t)
+                d[key] = {'best_edge':t, 'stat':{'sumr':abs(float(score)), 'allCID':[cids], 'maxRN':num_rids, 'sumRN':num_rids, 'F':1, 'most_recent_edge_date':date, 'method_or_tissue':[method_or_tissue]}}
+            else:
+                stat = d[key]['stat']
+                # update best edge: keep the tuple with the largest absolute score
+                new_score = float(score)
+                if abs(new_score) > abs(float(d[key]['best_edge'][2])):
+                    d[key]['best_edge'] = t
+
+                # update stat information
+                stat['sumr'] += abs(new_score)
+                stat['sumRN'] += num_rids
+                stat['F'] += 1
+                stat['maxRN'] = max(stat['maxRN'], num_rids)
+                # assumes dates are zero-padded YYYYMMDD strings (like curr_date in make_new_edge2), so string order is chronological
+                if date > stat['most_recent_edge_date']:
+                    stat['most_recent_edge_date'] = date
+                if cids not in stat['allCID']:
+                    stat['allCID'].append(cids)
+                if method_or_tissue not in stat['method_or_tissue']:
+                    stat['method_or_tissue'].append(method_or_tissue)
 
     f.close()
 
@@ -199,7 +245,7 @@ if not os.path.isdir(folder_path):
 write_log_file('[merge_edges.py]: Make text edge file %s ...' % (MERGED_EDGE_FILE), UPDATE_NETWORK_LOG_FILE)
 fout = open(MERGED_EDGE_FILE, 'w')
 for k in d:
-    lst = make_new_edge(d[k])
+    lst = make_new_edge2(d[k])
     fout.write('\t'.join(lst) + '\n')
 fout.close()
 
@@ -213,7 +259,7 @@ if os.path.exists(db_fname):
 
 conn = sqlite3.connect(db_fname)
 for k in d:
-    lst = make_new_edge(d[k])
+    lst = make_new_edge2(d[k])
     # Make an html page for each edge (taking Big disk space). This will take about 5GB disk space
     # for 1.3 million edges, not very disk space friendly. So I use a database-driven dynamic method
     # to save space.