# Usage: python exclude_edges.py edges.txt
#
# Purpose: Exclude the edges whose TF is in exclude_tf_list, since we want to
#          hide unpublished data.  Don't distinguish +/- edges.
#          For a TF-Target pair, keep the edge that has the largest value of
#          metric.  If the correlation is negative, use its absolute value.
#
# NOTE: the input file is rewritten in place with the filtered edges.
#
# Created by Hui on 5 Jan 2018

import os
import sys


# TFs whose edges must not appear in the output (unpublished data).
exclude_tf_list = ['AT4G26840', 'AT3G18550']


def remove_minus(s):
    """Return s with everything up to and including the first '-' removed.

    If s contains no '-', s is returned unchanged (find() yields -1, so the
    slice starts at index 0).
    """
    index = s.find('-')
    return s[index + 1:]


def neg2pos(s):
    """Return the tab-separated line s with its third field made non-negative.

    The line is stripped of surrounding whitespace first.  The third field
    (index 2) is parsed as a float; when it is below zero its minus sign is
    removed textually, so the remaining digits are kept exactly as written.

    Raises IndexError if the line has fewer than three fields and ValueError
    if the third field is not a number.
    """
    fields = s.strip().split('\t')
    if float(fields[2]) < 0:
        fields[2] = remove_minus(fields[2])
    return '\t'.join(fields)


def make_edge_dict(fname, exclude_lst):
    """Read the edge file fname and keep the best edge per TF-target pair.

    Each non-blank line is tab-separated.  Field 0 holds the target and
    field 1 the TF (only the first whitespace-delimited token of each is used
    as the id), field 2 is the correlation and field 8 is the metric.

    Lines whose TF id is in exclude_lst are dropped.  For the remaining
    lines, the one with the largest metric is kept per TF-target pair; on a
    tie the earliest line wins.

    Returns a dict keyed by '<tf_id>.<target_id>' whose values are dicts with
    keys 'metric' (float) and 'line' (the line with a non-negative
    correlation field, see neg2pos).

    Raises IndexError / ValueError on malformed non-blank lines.
    """
    excluded = set(exclude_lst)  # O(1) membership tests inside the loop
    d = {}
    with open(fname) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no edge.
                continue
            lst = line.split('\t')
            tf_id = lst[1].split()[0]
            if tf_id in excluded:
                continue
            target_id = lst[0].split()[0]
            metric = float(lst[8])
            k = tf_id + '.' + target_id
            # Strictly-greater comparison: the first of equal metrics is kept.
            if k not in d or d[k]['metric'] < metric:
                # Make the third field (correlation) positive if it is
                # negative.  Indicate influence, not activation/repression.
                d[k] = {'metric': metric, 'line': neg2pos(line)}
    return d


def main(argv=None):
    """Filter the edge file named on the command line, rewriting it in place.

    argv defaults to sys.argv.  Returns the process exit status: 0 on
    success, 1 when the edge file argument is missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write('Usage: python exclude_edges.py edges.txt\n')
        return 1

    edge_file_name = argv[1]
    # The whole file is read (and closed) before it is reopened for writing,
    # because opening with 'w' truncates it.
    d = make_edge_dict(edge_file_name, exclude_tf_list)
    with open(edge_file_name, 'w') as f:
        for k in sorted(d):
            f.write('%s\n' % d[k]['line'])
    return 0


if __name__ == '__main__':
    sys.exit(main())