# Usage: python count_AGI_in_cDNA.py
#
# Purpose: print the number of unique AGIs in Arabidopsis_thaliana.TAIR10.cdna.all.fa.
# An AGI looks like AT4G28720.
# Check that all thermomorphogenesis genes are in Arabidopsis_thaliana.TAIR10.cdna.all.fa.
#
# Created on 5 December 2019 by Hui Lan (lanhui@zjnu.edu.cn)

# Path of the cDNA FASTA file that is scanned for AGIs.
cdna = '/home/lanhui/brain/Salmon/Arabidopsis_thaliana.TAIR10.cdna.all.fa'

# AGIs of the thermomorphogenesis genes that are expected to appear in the cDNA file.
thermomorphogenesis_genes = [
    'AT4G28720', 'AT2G25930', 'AT2G40080', 'AT3G46640', 'AT5G11260',
    'AT2G43010', 'AT3G59060', 'AT4G10180', 'AT2G32950', 'AT3G13550',
    'AT4G05420', 'AT4G21100', 'AT2G46340', 'AT4G11110', 'AT3G15354',
    'AT1G53090', 'AT1G02340', 'AT4G08920', 'AT4G39950', 'AT2G22330',
    'AT2G42870', 'AT5G39860', 'AT1G70560', 'AT3G62980', 'AT4G03190',
    'AT3G26810', 'AT1G12820', 'AT4G24390', 'AT5G49980', 'AT5G01830',
    'AT5G18010', 'AT5G18020', 'AT5G18050', 'AT5G18060', 'AT5G18080',
    'AT1G29440', 'AT1G29510', 'AT4G18710', 'AT1G75080', 'AT1G30330',
    'AT1G19850', 'AT3G33520', 'AT4G16280', 'AT2G43060', 'AT2G18300',
    'AT4G16780', 'AT1G01060', 'AT1G22770', 'AT4G25420', 'AT1G15550',
    'AT1G78440', 'AT5G43700', 'AT4G32280', 'AT2G38120', 'AT1G15580',
]


def unique_agis(lines):
    """Return the set of unique AGIs found in FASTA header lines.

    Only lines that start with '>AT' (after stripping surrounding
    whitespace) are considered.  The AGI is the first whitespace-separated
    token of such a line with the leading '>' removed and everything from
    the first '.' onwards discarded, e.g. '>AT4G28720.1 cdna ...' yields
    'AT4G28720'.

    lines -- any iterable of strings, e.g. an open text file.
    """
    agis = set()
    for line in lines:
        line = line.strip()
        if line.startswith('>AT'):
            gene_id = line.split()[0][1:]       # remove '>'
            agis.add(gene_id.split('.')[0])     # discard '.1'
    return agis


def missing_genes(genes, known):
    """Return the genes (in their original order) that are not in known.

    genes -- iterable of AGI strings to look up.
    known -- container of AGIs supporting the 'in' operator.
    """
    return [g for g in genes if g not in known]


def main():
    """Print the number of unique AGIs in the cDNA file, then every
    thermomorphogenesis gene that is absent from it, one per line."""
    # The context manager closes the file even if parsing raises.
    with open(cdna) as f:
        agis = unique_agis(f)

    print(len(agis))

    # If a thermomorphogenesis gene is not in the cDNA file, then print it.
    for g in missing_genes(thermomorphogenesis_genes, agis):
        print(g)


if __name__ == '__main__':
    main()