"""Print one parameter record per dapseq narrowPeak file found under DAPSEQ_DIR.

Each record starts with an '@C0002NNNNNNNNN' identifier line followed by
PROTEIN_ID, PROTEIN_NAME, DATA_NAME, DATA_FORMAT, DESCRIPTION, LOCATION and
NOTE lines, and is terminated by an empty line.  Records go to stdout.
"""

import glob
import operator
import os
import sys

from geneid2name import make_gene_name_AGI_map_dict

DAPSEQ_DIR = '../Data/C/Mapped/dapseq/peaks'
GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'


def make_dapseq_dictionary(dirname):
    """Map each transcription-factor directory name to its narrowPeak file.

    Files are searched with the pattern ``<dirname>/*/*/*/*.narrowPeak``.
    The key is the name of the directory two levels above the file (the
    second wildcard level of the pattern).

    Args:
        dirname: Root directory to search.

    Returns:
        dict mapping directory name -> path of the matching narrowPeak file.

    Raises:
        SystemExit: If two files resolve to the same key.  The message is
            written to stderr and the process exit status is non-zero, so
            the error neither pollutes the records printed on stdout nor
            looks like a successful run.
    """
    tf_to_path = {}
    pattern = os.path.join(dirname, '*/*/*/*.narrowPeak')
    for path in glob.glob(pattern):
        # <dirname>/<level1>/<tf_name>/<level3>/<file>.narrowPeak
        tf_name = os.path.basename(os.path.dirname(os.path.dirname(path)))
        if tf_name in tf_to_path:
            # sys.exit(str) prints to stderr and exits with status 1.
            sys.exit(
                'ERROR: transcription factor name not unique. (%s: %s, %s)'
                % (tf_name, tf_to_path[tf_name], path)
            )
        tf_to_path[tf_name] = path
    return tf_to_path


d = make_dapseq_dictionary(DAPSEQ_DIR)
agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)

# Sort by key so record numbering is stable regardless of glob order.
for count, (k, v) in enumerate(sorted(d.items(), key=operator.itemgetter(0)), 1):
    cid = 'C0002%09d' % (count)
    print('@%s' % (cid))

    # The part of the directory name before the first '_' identifies the protein.
    g = k.split('_')[0].upper()

    # NOTE(review): a leading 'AT' is presumably meant to detect AGI locus
    # identifiers; gene symbols that also begin with 'AT' would take the
    # same path -- confirm this is intended.
    looks_like_agi = g.startswith('AT')
    # True when the lookup table has an entry for g that differs from g.
    # NOTE(review): the table seems to be used in both directions
    # (id -> name and name -> id); verify against geneid2name.
    has_other_name = g in agi2name_dict and g != agi2name_dict[g]

    if looks_like_agi or not has_other_name:
        protein_id = g
    else:
        protein_id = agi2name_dict[g]

    if looks_like_agi and has_other_name:
        protein_name = agi2name_dict[g]
    else:
        protein_name = g

    print('PROTEIN_ID:%s' % (protein_id))
    print('PROTEIN_NAME:%s' % (protein_name))
    print('DATA_NAME:%s' % (k))
    print('DATA_FORMAT:%s' % ('narrowPeak'))
    print('DESCRIPTION:dapseq')
    print('LOCATION:%s' % (v))
    print('NOTE:')
    print('')