diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
commit | 97fdefab064f63642fa3ece05b807d29b459df31 (patch) | |
tree | a058530023224f3e35b1783996f3530c80c04bc5 /Code/prepare_gene_file.py |
brain: add python and R code to local repository.
Diffstat (limited to 'Code/prepare_gene_file.py')
-rw-r--r-- | Code/prepare_gene_file.py | 79 |
1 files changed, 79 insertions, 0 deletions
diff --git a/Code/prepare_gene_file.py b/Code/prepare_gene_file.py new file mode 100644 index 0000000..febcbef --- /dev/null +++ b/Code/prepare_gene_file.py @@ -0,0 +1,79 @@ +# Usage: python prepare_gene_file.py all-ath-gene-position.txt > gene_file.txt +# all-ath-gene-position contains all gene IDs in the genome, and their positions, orientation, etc, in BED format. +# See ../Data/information/all-ath-gene-position.txt +# Purpose: get gene name and gene annotation +# 2 JAN 2017 hui SLCU + +import sys +import os + +################################################################################### +GENE_DESCRIPTION = '../Data/information/gene_description_20140101.txt' +AGI_TO_GENE_NAMES = '../Data/information/AGI-to-gene-names.txt' +################################################################################### + + +def get_description(x, d): + result = '' + if x in d: + result = '\t' + d[x] + else: + result = '\tNot Found' + return result + + +def make_AGI_to_gene_name_dict(fname): + d = {} + f = open(fname) + for line in f: + line = line.strip() + lst = line.split() + agi = lst[0] + name = lst[1] + if not agi in d: + d[agi] = name + else: + d[agi] += ';' + name + f.close() + return d + + +## main + +# make a dictionary of gene description +f0 = open(GENE_DESCRIPTION) +d = {} +for line in f0: + line = line.strip() + lst = line.split('\t') + id = lst[0] + id = id[0:9] # AGI id, omit .1, .2, .3, etc + s = '\t'.join(lst[1:]) + if not id in d: + d[id] = s + else: + d[id] += '\t' + s + +f0.close() + +agi2genename_dict = make_AGI_to_gene_name_dict(AGI_TO_GENE_NAMES) + +locus_file = sys.argv[1] # location of genes +f = open(locus_file) # see all-ath-gene-position.txt +for line in f: + line = line.strip() + lst = line.split('\t') + x = lst[3] + c = lst[0] + ss = lst[1] + ee = lst[2] + strand = lst[5] + result = [x] + if x in agi2genename_dict and not x == agi2genename_dict[x]: + result.append(agi2genename_dict[x]) + else: + result.append(' ') # if no gene name, use a space + result.extend([c, ss, ee, strand, get_description(x, d)]) + print('\t'.join(result)) + +f.close() |