# Usage: python prepare_gene_file.py all-ath-gene-position.txt > gene_file.txt # all-ath-gene-position contains all gene IDs in the genome, and their positions, orientation, etc, in BED format. # See ../Data/information/all-ath-gene-position.txt # Purpose: get gene name and gene annotation # 2 JAN 2017 hui SLCU import sys import os ################################################################################### GENE_DESCRIPTION = '../Data/information/gene_description_20140101.txt' AGI_TO_GENE_NAMES = '../Data/information/AGI-to-gene-names.txt' ################################################################################### def get_description(x, d): result = '' if x in d: result = '\t' + d[x] else: result = '\tNot Found' return result def make_AGI_to_gene_name_dict(fname): d = {} f = open(fname) for line in f: line = line.strip() lst = line.split() agi = lst[0] name = lst[1] if not agi in d: d[agi] = name else: d[agi] += ';' + name f.close() return d ## main # make a dictionary of gene description f0 = open(GENE_DESCRIPTION) d = {} for line in f0: line = line.strip() lst = line.split('\t') id = lst[0] id = id[0:9] # AGI id, omit .1, .2, .3, etc s = '\t'.join(lst[1:]) if not id in d: d[id] = s else: d[id] += '\t' + s f0.close() agi2genename_dict = make_AGI_to_gene_name_dict(AGI_TO_GENE_NAMES) locus_file = sys.argv[1] # location of genes f = open(locus_file) # see all-ath-gene-position.txt for line in f: line = line.strip() lst = line.split('\t') x = lst[3] c = lst[0] ss = lst[1] ee = lst[2] strand = lst[5] result = [x] if x in agi2genename_dict and not x == agi2genename_dict[x]: result.append(agi2genename_dict[x]) else: result.append(' ') # if no gene name, use a space result.extend([c, ss, ee, strand, get_description(x, d)]) print('\t'.join(result)) f.close()