From 97fdefab064f63642fa3ece05b807d29b459df31 Mon Sep 17 00:00:00 2001 From: Hui Lan <lanhui@zjnu.edu.cn> Date: Wed, 4 Dec 2019 19:03:19 +0800 Subject: brain: add python and R code to local repository. --- Code/make_parameter_dapseq3.py | 75 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 Code/make_parameter_dapseq3.py (limited to 'Code/make_parameter_dapseq3.py') diff --git a/Code/make_parameter_dapseq3.py b/Code/make_parameter_dapseq3.py new file mode 100644 index 0000000..405b3dc --- /dev/null +++ b/Code/make_parameter_dapseq3.py @@ -0,0 +1,75 @@ +# Usage: python make_parameter_dapseq3.py +# Because dap-seq dose not include all TFs, so include other TFs. The idea is that TFs within a same family are very conservative in binding. +# + +import sys, glob, os, operator +from geneid2name import make_gene_name_AGI_map_dict + +DAPSEQ_DIR = '/home/hui/network/dapseq_merged' +MAP_FILE = '/home/hui/network/dapseq_merged/tffamily.simple.txt' + +def get_name(s): + lst = s.split('_') + result = [] + for x in lst: + if x != 'tnt': + result.append(x) + return '_'.join(result) + +def make_dapseq_dictionary(dirname): + + d = {} + + files = glob.glob(os.path.join(dirname, '*.narrowPeak')) + + for f in files: + lst = f.split('/') + tf_name = lst[-1].split('.')[0] + tf_name = get_name(tf_name) + if not tf_name in d: + d[tf_name] = f + else: + print('ERROR: transcription factor name not unique.') + sys.exit() + + return d + + +d = make_dapseq_dictionary(DAPSEQ_DIR) + +f = open(MAP_FILE) +lines = f.readlines() +f.close() + +# since MAP_FILE contain duplicate lines +d_family = {} +for line in lines: + line = line.strip() + lst = line.split() + tf = lst[0].upper() + tf_name = lst[1] + family = lst[2] + if not tf in d_family: + d_family[tf] = (tf_name, family) + else: + if family != d_family[tf][1]: + print('WARNING: %s conflict [%s %s]!' % (tf, family, d_family[tf][1])) + +count = 1 +for k in sorted(d_family.keys()): + g = k + gname = d_family[k][0] + key = d_family[k][1] + if key in d: + cid = 'C0003%09d' % (count) + count += 1 + print('@%s' % (cid)) + print('PROTEIN_ID:%s' % (g)) + print('PROTEIN_NAME:%s' % (gname)) + print('DATA_NAME:%s' % (gname)) + print('DATA_FORMAT:%s' % ('narrowPeak')) + print('DESCRIPTION:inferred from dapseq') + #print('LOCATION:%s' % (os.path.join(DAPSEQ_DIR, d[key]))) + print('LOCATION:%s' % (d[key])) + print('NOTE:') + print('') -- cgit v1.2.1