diff options
Diffstat (limited to 'Code')
-rw-r--r-- | Code/count_AGI_in_cDNA.py | 83 |
1 files changed, 83 insertions, 0 deletions
# Usage: python count_AGI_in_cDNA.py [path/to/cdna.fa]
#
# Purpose: print the number of unique AGIs in Arabidopsis_thaliana.TAIR10.cdna.all.fa.  An AGI looks like AT4G28720.
#          Check that all thermomorphogenesis genes are in Arabidopsis_thaliana.TAIR10.cdna.all.fa.
#
# The optional command-line argument overrides the default FASTA location
# below; without it the script behaves exactly as before.
#
# Created on 5 December 2019 by Hui Lan (lanhui@zjnu.edu.cn)

# Default location of the cDNA FASTA file, used when no path is given.
cdna = '/home/lanhui/brain/Salmon/Arabidopsis_thaliana.TAIR10.cdna.all.fa'

thermomorphogenesis_genes = [
    'AT4G28720',
    'AT2G25930',
    'AT2G40080',
    'AT3G46640',
    'AT5G11260',
    'AT2G43010',
    'AT3G59060',
    'AT4G10180',
    'AT2G32950',
    'AT3G13550',
    'AT4G05420',
    'AT4G21100',
    'AT2G46340',
    'AT4G11110',
    'AT3G15354',
    'AT1G53090',
    'AT1G02340',
    'AT4G08920',
    'AT4G39950',
    'AT2G22330',
    'AT2G42870',
    'AT5G39860',
    'AT1G70560',
    'AT3G62980',
    'AT4G03190',
    'AT3G26810',
    'AT1G12820',
    'AT4G24390',
    'AT5G49980',
    'AT5G01830',
    'AT5G18010',
    'AT5G18020',
    'AT5G18050',
    'AT5G18060',
    'AT5G18080',
    'AT1G29440',
    'AT1G29510',
    'AT4G18710',
    'AT1G75080',
    'AT1G30330',
    'AT1G19850',
    'AT3G33520',
    'AT4G16280',
    'AT2G43060',
    'AT2G18300',
    'AT4G16780',
    'AT1G01060',
    'AT1G22770',
    'AT4G25420',
    'AT1G15550',
    'AT1G78440',
    'AT5G43700',
    'AT4G32280',
    'AT2G38120',
    'AT1G15580',
]


def read_unique_agis(path):
    """Return the set of unique AGIs named on the '>AT' header lines of *path*.

    Each line is stripped of surrounding whitespace; lines that then start
    with '>AT' are treated as headers.  The AGI is the first
    whitespace-separated token of the header with the leading '>' removed
    and everything from the first '.' onward discarded, so
    '>AT4G28720.1 cdna ...' yields 'AT4G28720'.

    Raises OSError if the file cannot be opened.
    """
    agis = set()
    # 'with' guarantees the handle is closed even if reading fails midway.
    with open(path) as fasta:
        for line in fasta:
            line = line.strip()
            if not line.startswith('>AT'):
                continue
            gene_id = line.split()[0][1:]  # first token, without '>'
            agis.add(gene_id.split('.')[0])  # discard '.1' style suffix
    return agis


def find_missing_genes(genes, known_agis):
    """Return the entries of *genes* absent from *known_agis*, in input order."""
    return [g for g in genes if g not in known_agis]


def main(argv=None):
    """Print the unique AGI count, then each thermomorphogenesis gene not found.

    *argv* is the list of command-line arguments (without the program name);
    when None, sys.argv[1:] is used.  The first argument, if present, is the
    FASTA path; otherwise the default `cdna` path is read.
    """
    import sys  # local import: the script has no top-of-file import block

    args = sys.argv[1:] if argv is None else argv
    path = args[0] if args else cdna

    agis = read_unique_agis(path)
    print(len(agis))

    # If a thermomorphogenesis gene is not in the FASTA file, then print it.
    for gene in find_missing_genes(thermomorphogenesis_genes, agis):
        print(gene)


if __name__ == '__main__':
    main()