summaryrefslogtreecommitdiff
path: root/Code/prepare_gene_file.py
diff options
context:
space:
mode:
authorHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
committerHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
commit97fdefab064f63642fa3ece05b807d29b459df31 (patch)
treea058530023224f3e35b1783996f3530c80c04bc5 /Code/prepare_gene_file.py
brain: add python and R code to local repository.
Diffstat (limited to 'Code/prepare_gene_file.py')
-rw-r--r--Code/prepare_gene_file.py79
1 files changed, 79 insertions, 0 deletions
diff --git a/Code/prepare_gene_file.py b/Code/prepare_gene_file.py
new file mode 100644
index 0000000..febcbef
--- /dev/null
+++ b/Code/prepare_gene_file.py
@@ -0,0 +1,79 @@
+# Usage: python prepare_gene_file.py all-ath-gene-position.txt > gene_file.txt
+# all-ath-gene-position contains all gene IDs in the genome, and their positions, orientation, etc, in BED format.
+# See ../Data/information/all-ath-gene-position.txt
+# Purpose: get gene name and gene annotation
+# 2 JAN 2017 hui SLCU
+
+import sys
+import os
+
+###################################################################################
+GENE_DESCRIPTION = '../Data/information/gene_description_20140101.txt'
+AGI_TO_GENE_NAMES = '../Data/information/AGI-to-gene-names.txt'
+###################################################################################
+
+
+def get_description(x, d):
+ result = ''
+ if x in d:
+ result = '\t' + d[x]
+ else:
+ result = '\tNot Found'
+ return result
+
+
+def make_AGI_to_gene_name_dict(fname):
+ d = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ agi = lst[0]
+ name = lst[1]
+ if not agi in d:
+ d[agi] = name
+ else:
+ d[agi] += ';' + name
+ f.close()
+ return d
+
+
+## main
+
+# make a dictionary of gene description
+f0 = open(GENE_DESCRIPTION)
+d = {}
+for line in f0:
+ line = line.strip()
+ lst = line.split('\t')
+ id = lst[0]
+ id = id[0:9] # AGI id, omit .1, .2, .3, etc
+ s = '\t'.join(lst[1:])
+ if not id in d:
+ d[id] = s
+ else:
+ d[id] += '\t' + s
+
+f0.close()
+
+agi2genename_dict = make_AGI_to_gene_name_dict(AGI_TO_GENE_NAMES)
+
+locus_file = sys.argv[1] # location of genes
+f = open(locus_file) # see all-ath-gene-position.txt
+for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ x = lst[3]
+ c = lst[0]
+ ss = lst[1]
+ ee = lst[2]
+ strand = lst[5]
+ result = [x]
+ if x in agi2genename_dict and not x == agi2genename_dict[x]:
+ result.append(agi2genename_dict[x])
+ else:
+ result.append(' ') # if no gene name, use a space
+ result.extend([c, ss, ee, strand, get_description(x, d)])
+ print('\t'.join(result))
+
+f.close()