1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
# Usage: python count_AGI_in_cDNA.py
#
# Purpose: print the number of unique AGIs in Arabidopsis_thaliana.TAIR10.cdna.all.fa.
# An AGI (Arabidopsis Genome Initiative locus identifier) looks like AT4G28720.
# Then check that all thermomorphogenesis genes are in Arabidopsis_thaliana.TAIR10.cdna.all.fa,
# printing every gene that is missing.
#
# Created on 5 December 2019 by Hui Lan (lanhui@zjnu.edu.cn)
cdna = '/home/lanhui/brain/Salmon/Arabidopsis_thaliana.TAIR10.cdna.all.fa'

# Collect every unique AGI that appears in a header line starting with '>AT',
# then print how many were found.
d = {}  # hold all unique AGIs as keys (the value 1 is just a placeholder)
# The with-statement closes the file even if reading raises an exception.
with open(cdna) as f:
    for line in f:
        line = line.strip()
        if line.startswith('>AT'):
            lst = line.split()
            gene_id = lst[0][1:]  # first whitespace-separated token, without '>'
            key = gene_id.split('.')[0]  # discard '.1'
            d[key] = 1
print(len(d))
# AGIs of the thermomorphogenesis genes to look up, five per row.
thermomorphogenesis_genes = [
    'AT4G28720', 'AT2G25930', 'AT2G40080', 'AT3G46640', 'AT5G11260',
    'AT2G43010', 'AT3G59060', 'AT4G10180', 'AT2G32950', 'AT3G13550',
    'AT4G05420', 'AT4G21100', 'AT2G46340', 'AT4G11110', 'AT3G15354',
    'AT1G53090', 'AT1G02340', 'AT4G08920', 'AT4G39950', 'AT2G22330',
    'AT2G42870', 'AT5G39860', 'AT1G70560', 'AT3G62980', 'AT4G03190',
    'AT3G26810', 'AT1G12820', 'AT4G24390', 'AT5G49980', 'AT5G01830',
    'AT5G18010', 'AT5G18020', 'AT5G18050', 'AT5G18060', 'AT5G18080',
    'AT1G29440', 'AT1G29510', 'AT4G18710', 'AT1G75080', 'AT1G30330',
    'AT1G19850', 'AT3G33520', 'AT4G16280', 'AT2G43060', 'AT2G18300',
    'AT4G16780', 'AT1G01060', 'AT1G22770', 'AT4G25420', 'AT1G15550',
    'AT1G78440', 'AT5G43700', 'AT4G32280', 'AT2G38120', 'AT1G15580',
]
# If a thermomorphogenesis gene is not a key in d, then print it.
for g in thermomorphogenesis_genes:
    if g not in d:
        print(g)
|