summaryrefslogtreecommitdiff
path: root/Code/count_AGI_in_cDNA.py
diff options
context:
space:
mode:
Diffstat (limited to 'Code/count_AGI_in_cDNA.py')
-rw-r--r--Code/count_AGI_in_cDNA.py83
1 files changed, 83 insertions, 0 deletions
diff --git a/Code/count_AGI_in_cDNA.py b/Code/count_AGI_in_cDNA.py
new file mode 100644
index 0000000..b3c459d
--- /dev/null
+++ b/Code/count_AGI_in_cDNA.py
@@ -0,0 +1,83 @@
+# Usage: python count_AGI_in_cDNA.py
+#
+# Purpose: print the number of unique AGIs in Arabidopsis_thaliana.TAIR10.cdna.all.fa. An AGI looks like AT4G28720.
+# Also check that all thermomorphogenesis genes are in Arabidopsis_thaliana.TAIR10.cdna.all.fa, and print any that are missing.
+#
+# Created on 5 December 2019 by Hui Lan (lanhui@zjnu.edu.cn)
+
+cdna = '/home/lanhui/brain/Salmon/Arabidopsis_thaliana.TAIR10.cdna.all.fa'
+d = {} # hold all unique AGIs as keys
+# Use a context manager so the file is closed even if parsing a header line raises.
+with open(cdna) as f:
+    for line in f:
+        line = line.strip()
+        if line.startswith('>AT'):
+            lst = line.split()
+            gene_id = lst[0][1:] # remove '>'
+            key = gene_id.split('.')[0] # discard '.1'
+            d[key] = 1
+
+print(len(d))
+
+thermomorphogenesis_genes = [  # AGIs expected to be present in the cDNA file
+ 'AT4G28720',
+ 'AT2G25930',
+ 'AT2G40080',
+ 'AT3G46640',
+ 'AT5G11260',
+ 'AT2G43010',
+ 'AT3G59060',
+ 'AT4G10180',
+ 'AT2G32950',
+ 'AT3G13550',
+ 'AT4G05420',
+ 'AT4G21100',
+ 'AT2G46340',
+ 'AT4G11110',
+ 'AT3G15354',
+ 'AT1G53090',
+ 'AT1G02340',
+ 'AT4G08920',
+ 'AT4G39950',
+ 'AT2G22330',
+ 'AT2G42870',
+ 'AT5G39860',
+ 'AT1G70560',
+ 'AT3G62980',
+ 'AT4G03190',
+ 'AT3G26810',
+ 'AT1G12820',
+ 'AT4G24390',
+ 'AT5G49980',
+ 'AT5G01830',
+ 'AT5G18010',
+ 'AT5G18020',
+ 'AT5G18050',
+ 'AT5G18060',
+ 'AT5G18080',
+ 'AT1G29440',
+ 'AT1G29510',
+ 'AT4G18710',
+ 'AT1G75080',
+ 'AT1G30330',
+ 'AT1G19850',
+ 'AT3G33520',
+ 'AT4G16280',
+ 'AT2G43060',
+ 'AT2G18300',
+ 'AT4G16780',
+ 'AT1G01060',
+ 'AT1G22770',
+ 'AT4G25420',
+ 'AT1G15550',
+ 'AT1G78440',
+ 'AT5G43700',
+ 'AT4G32280',
+ 'AT2G38120',
+ 'AT1G15580',
+]
+
+# Print every thermomorphogenesis gene that is not a key in d.
+for g in thermomorphogenesis_genes:
+    if g not in d:
+        print(g)