# Source path: root/Code/count_AGI_in_cDNA.py
# Blob: b3c459d17cdd66b19c8fa865c8606d67c787dde6
# Usage: python count_AGI_in_cDNA.py
#
# Purpose: print the number of unique AGIs in Arabidopsis_thaliana.TAIR10.cdna.all.fa.  An AGI looks like AT4G28720.
#          Check that all thermomorphogenesis genes are in Arabidopsis_thaliana.TAIR10.cdna.all.fa.
# 
# Created on 5 December 2019 by Hui Lan (lanhui@zjnu.edu.cn)

cdna = '/home/lanhui/brain/Salmon/Arabidopsis_thaliana.TAIR10.cdna.all.fa'

# Collect every unique AGI seen in the header lines of the cDNA file.
# Only the keys of d matter; the value 1 is a placeholder.
d = {}

# A context manager guarantees the file is closed even if reading raises.
with open(cdna) as f:
    for line in f:
        line = line.strip()
        if line.startswith('>AT'):
            # First whitespace-delimited token of the header, minus the '>'.
            gene_id = line.split()[0][1:]
            # Drop the suffix after the first '.' (e.g. '.1') to get the AGI.
            key = gene_id.split('.')[0]
            d[key] = 1

# Number of unique AGIs found.
print(len(d))

# AGIs of the thermomorphogenesis genes to look up, one per line.
# Splitting on whitespace yields a list of the IDs in the order written.
thermomorphogenesis_genes = """
    AT4G28720
    AT2G25930
    AT2G40080
    AT3G46640
    AT5G11260
    AT2G43010
    AT3G59060
    AT4G10180
    AT2G32950
    AT3G13550
    AT4G05420
    AT4G21100
    AT2G46340
    AT4G11110
    AT3G15354
    AT1G53090
    AT1G02340
    AT4G08920
    AT4G39950
    AT2G22330
    AT2G42870
    AT5G39860
    AT1G70560
    AT3G62980
    AT4G03190
    AT3G26810
    AT1G12820
    AT4G24390
    AT5G49980
    AT5G01830
    AT5G18010
    AT5G18020
    AT5G18050
    AT5G18060
    AT5G18080
    AT1G29440
    AT1G29510
    AT4G18710
    AT1G75080
    AT1G30330
    AT1G19850
    AT3G33520
    AT4G16280
    AT2G43060
    AT2G18300
    AT4G16780
    AT1G01060
    AT1G22770
    AT4G25420
    AT1G15550
    AT1G78440
    AT5G43700
    AT4G32280
    AT2G38120
    AT1G15580
""".split()

# Print every thermomorphogenesis gene whose AGI is absent from d.
for agi in thermomorphogenesis_genes:
    if agi in d:
        continue  # present in the cDNA file; nothing to report
    print(agi)