blob: 946ab6ff737718ab22bba32071f61a3de592c45b (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
|
import sys, glob, os, operator
from geneid2name import make_gene_name_AGI_map_dict
DAPSEQ_DIR = '../Data/C/Mapped/dapseq/peaks'
GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
def make_dapseq_dictionary(dirname):
    """Map each transcription factor name to its narrowPeak file.

    Files are looked up with the pattern ``<dirname>/*/*/*/*.narrowPeak``.
    The key for a file is the name of the directory two levels above it,
    i.e. the third path component counted from the end of the file path.

    Args:
        dirname: Root directory to search.

    Returns:
        A dict mapping transcription factor name to the path of its
        narrowPeak file, exactly as returned by ``glob``.  Empty when
        nothing matches.

    Raises:
        SystemExit: If two files yield the same name.  The message is
            written to standard error and the exit status is non-zero,
            so the failure neither pollutes the records printed on
            standard output nor looks like a successful run.
    """
    d = {}
    # Sorted so that which pair of files gets reported on a clash does not
    # depend on the arbitrary order in which glob lists the directory.
    files = sorted(glob.glob(os.path.join(dirname, '*/*/*/*.narrowPeak')))
    for f in files:
        # Same component the original picked with f.split('/')[-3], but
        # computed with os.path so it does not depend on the separator.
        tf_name = os.path.basename(os.path.dirname(os.path.dirname(f)))
        if tf_name in d:
            # sys.exit(<str>) prints the string to stderr and exits with 1.
            sys.exit(
                'ERROR: transcription factor name not unique. '
                '%s is used by both %s and %s' % (tf_name, d[tf_name], f)
            )
        d[tf_name] = f
    return d
# Print one record per narrowPeak file found under DAPSEQ_DIR, in ascending
# order of data name.  Each record is an '@' header line carrying a running
# serial number, a fixed set of KEY:value lines, and a trailing blank line.
d = make_dapseq_dictionary(DAPSEQ_DIR)
agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)

serial = 0
for data_name in sorted(d):
    serial += 1

    # Text before the first underscore of the data name, upper-cased.
    symbol = data_name.split('_')[0].upper()

    # NOTE(review): every symbol beginning with 'AT' is treated as an
    # identifier here, not only strings shaped like AT1G01010 -- confirm
    # that this is intended.
    has_at_prefix = symbol.startswith('AT')

    # The lookup table is only used when it offers a string that differs
    # from the symbol itself.
    has_alias = symbol in agi2name_dict and symbol != agi2name_dict[symbol]
    alias = agi2name_dict[symbol] if has_alias else symbol

    # 'AT' symbols keep themselves as the id and take the alias as the name;
    # all other symbols take the alias as the id and keep themselves as name.
    protein_id = symbol if has_at_prefix else alias
    protein_name = alias if has_at_prefix else symbol

    record = (
        '@%s' % ('C0002%09d' % (serial)),
        'PROTEIN_ID:%s' % (protein_id),
        'PROTEIN_NAME:%s' % (protein_name),
        'DATA_NAME:%s' % (data_name),
        'DATA_FORMAT:%s' % ('narrowPeak'),
        'DESCRIPTION:dapseq',
        'LOCATION:%s' % (d[data_name]),
        'NOTE:',
        '',
    )
    for line in record:
        print(line)
|