# Usage: python count_AGI_in_cDNA.py
#
# Purpose: print the number of unique AGIs in Arabidopsis_thaliana.TAIR10.cdna.all.fa.  An AGI looks like AT4G28720.
#          Check that all thermomorphogenesis genes are in Arabidopsis_thaliana.TAIR10.cdna.all.fa.
# 
# Created on 5 December 2019 by Hui Lan (lanhui@zjnu.edu.cn)

# Path of the TAIR10 cDNA FASTA file whose header lines are scanned.
cdna = '/home/lanhui/brain/Salmon/Arabidopsis_thaliana.TAIR10.cdna.all.fa'

# d holds every unique AGI as a key; the value (1) is only a placeholder.
d = {}

# The with-statement guarantees the file is closed even if reading or
# parsing a line raises, which the explicit open()/close() pair did not.
with open(cdna) as f:
    for line in f:
        line = line.strip()
        # Only header lines whose identifier starts with 'AT' are counted.
        if line.startswith('>AT'):
            lst = line.split()
            gene_id = lst[0][1:]  # first field of the header, without '>'
            key = gene_id.split('.')[0]  # discard the suffix such as '.1'
            d[key] = 1

# Number of unique AGIs found in the file.
print(len(d))

# AGIs of the thermomorphogenesis genes to look up in the cDNA file.
# The order is kept as originally listed; five identifiers per row.
thermomorphogenesis_genes = [
    'AT4G28720', 'AT2G25930', 'AT2G40080', 'AT3G46640', 'AT5G11260',
    'AT2G43010', 'AT3G59060', 'AT4G10180', 'AT2G32950', 'AT3G13550',
    'AT4G05420', 'AT4G21100', 'AT2G46340', 'AT4G11110', 'AT3G15354',
    'AT1G53090', 'AT1G02340', 'AT4G08920', 'AT4G39950', 'AT2G22330',
    'AT2G42870', 'AT5G39860', 'AT1G70560', 'AT3G62980', 'AT4G03190',
    'AT3G26810', 'AT1G12820', 'AT4G24390', 'AT5G49980', 'AT5G01830',
    'AT5G18010', 'AT5G18020', 'AT5G18050', 'AT5G18060', 'AT5G18080',
    'AT1G29440', 'AT1G29510', 'AT4G18710', 'AT1G75080', 'AT1G30330',
    'AT1G19850', 'AT3G33520', 'AT4G16280', 'AT2G43060', 'AT2G18300',
    'AT4G16780', 'AT1G01060', 'AT1G22770', 'AT4G25420', 'AT1G15550',
    'AT1G78440', 'AT5G43700', 'AT4G32280', 'AT2G38120', 'AT1G15580',
]

# Print every thermomorphogenesis gene that is not a key of d, i.e. whose
# AGI was not seen in the cDNA file.  Nothing is printed for genes present.
for g in thermomorphogenesis_genes:
    if g in d:
        continue
    print(g)