# Usage: python make_parameter_dapseq3.py # Because dap-seq dose not include all TFs, so include other TFs. The idea is that TFs within a same family are very conservative in binding. # import sys, glob, os, operator from geneid2name import make_gene_name_AGI_map_dict DAPSEQ_DIR = '/home/hui/network/dapseq_merged' MAP_FILE = '/home/hui/network/dapseq_merged/tffamily.simple.txt' def get_name(s): lst = s.split('_') result = [] for x in lst: if x != 'tnt': result.append(x) return '_'.join(result) def make_dapseq_dictionary(dirname): d = {} files = glob.glob(os.path.join(dirname, '*.narrowPeak')) for f in files: lst = f.split('/') tf_name = lst[-1].split('.')[0] tf_name = get_name(tf_name) if not tf_name in d: d[tf_name] = f else: print('ERROR: transcription factor name not unique.') sys.exit() return d d = make_dapseq_dictionary(DAPSEQ_DIR) f = open(MAP_FILE) lines = f.readlines() f.close() # since MAP_FILE contain duplicate lines d_family = {} for line in lines: line = line.strip() lst = line.split() tf = lst[0].upper() tf_name = lst[1] family = lst[2] if not tf in d_family: d_family[tf] = (tf_name, family) else: if family != d_family[tf][1]: print('WARNING: %s conflict [%s %s]!' % (tf, family, d_family[tf][1])) count = 1 for k in sorted(d_family.keys()): g = k gname = d_family[k][0] key = d_family[k][1] if key in d: cid = 'C0003%09d' % (count) count += 1 print('@%s' % (cid)) print('PROTEIN_ID:%s' % (g)) print('PROTEIN_NAME:%s' % (gname)) print('DATA_NAME:%s' % (gname)) print('DATA_FORMAT:%s' % ('narrowPeak')) print('DESCRIPTION:inferred from dapseq') #print('LOCATION:%s' % (os.path.join(DAPSEQ_DIR, d[key]))) print('LOCATION:%s' % (d[key])) print('NOTE:') print('')