From 97fdefab064f63642fa3ece05b807d29b459df31 Mon Sep 17 00:00:00 2001 From: Hui Lan Date: Wed, 4 Dec 2019 19:03:19 +0800 Subject: brain: add python and R code to local repository. --- Code/make_parameter_bw.py | 127 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 Code/make_parameter_bw.py (limited to 'Code/make_parameter_bw.py') diff --git a/Code/make_parameter_bw.py b/Code/make_parameter_bw.py new file mode 100644 index 0000000..bfb7074 --- /dev/null +++ b/Code/make_parameter_bw.py @@ -0,0 +1,127 @@ +# Usage: python make_parameter_bw.py +# Edit the variable BW_PATHS, which is a file containing (almost all) paths to in-house bw files. Edit PARENT_DIR, which +# will be used to make full path to the bw files. Amend name_map_dict. +# Purpose +# ------- +# Make a parameter file for buildCmatrix.py. For example: +# +# @C0001100007141 +# PROTEIN_ID: +# PROTEIN_NAME:HTR13 +# DATA_NAME:2_71C_10_IP-HTR13-17c +# DATA_FORMAT:bw +# DESCRIPTION:in house ChIP data +# LOCATION:/media/pw_synology3/PW_HiSeq_data/ChIP-seq/Mapped_data/2_71C/20150603_H3_2nd_rep_mapped_bw/ChIP-seq_h31_h33_col_rep1_20150607_max_fragment_500_bw/2_71C_10_IP-HTR13-17c_raw_trimmo_paired_truseq3-PE-2_2_10_5_1_bowtie2_TAIR10_ensembl_nomixed_sorted_rmdup_picard_genomenorm.bw +# NOTE: +# +# 29 NOV 2016, hui, home + +import sys, glob, os, operator +from geneid2name import make_gene_name_AGI_map_dict + +BW_PATHS = 'bwfiles_unique.txt' # Ideally, the input file should be sorted by library number, then sample number. +PARENT_DIR = '/media/pw_synology3/PW_HiSeq_data/ChIP-seq/Mapped_data' + +GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt' + +def get_library_id(s): + index = s.find('C') # in-house chip library id ends with a letter C + if index > 0: + return s[:index] + else: + return s + +def get_bw_file_name(lst): + for x in lst: + if '.bw' in x: + return x + return '' + +def get_sample_name(s): + index = s.find('_raw') # a bw file name contains _raw, get the part before _raw if _raw exists + if index > 0: + return s[:index] + else: + return s + +def get_sample_number(s): + index = s.find('_S') # sample number is proceeded by _S + if index > 0: + if s[index+2].isdigit() and s[index+3].isdigit(): # two digit sample number + return s[index+2:index+4] + else: # one digit sample number + return '0' + s[index+2] + return '25' + +def convert_name(s, d): + ''' convert s to gene id ''' + if s.upper() in d: + return d[s.upper()] + else: + return ' ' + +def get_gene_id_and_name(s, d): + ''' Return a dictionary value, protein name and protein id given a sample file name.''' + for k in sorted(d.keys(),reverse=True): + if k.isdigit() and k.lower() in s.lower(): # k is a number, e.g., 833 + return (d[k], convert_name(d[k], agi2name_dict)) + if k.lower() in s.lower(): + return (k, convert_name(k, agi2name_dict)) + return ('name_unknown', 'id_unknown') + +### + +# key is a name of sample, value is either empty if it is a protein name, or the protein name if the key is a database number, e.g., 833. If key is protein name, the program tries to find its gene id (AT...). If key is a database number, the program gets its protein name and then tries to find its gene id. If not found, PROTEIN_ID or PROTEIN_NAME will be assigned _unknown. So search _unknown to manually annotate PROTEIN_ID or PROTEIN_NAME. +name_map_dict = {'HTA11':'', 'H3':'', 'HTR13':'', 'HSP70':'', 'KIN10':'', 'EPM1':'', 'ELF3':'', 'phyB':'', '833':'KIN10', 'hos':'', 'HOS':'', 'HOS1':'', 'LUX':'', 'YHB':'','HSF1':'','3129':'HSP90','838':'phyB', '1506':'HSF1', 'SUMO':'', 'TIR1':'', 'PIF4':'', 'PIF':'','H2A':'', 'H2AZ':'', 'MNase':'', '544':'ELF4', '834':'PIF4', '745':'ELF3', 'EC1_S1':'', 'EC1_S1':'ELF3', 'EC2_S2':'ELF3', '1166':'LUX', '1167':'LUX', '3239':'REV', '1281':'MPK6', '1278':'SEP3', '1279':'FD', '1280':'FD-like', '1283':'HSP70', '1284':'DELLA', '1762':'FT', 'LFY':'', 'TFL1':''} + +agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME) +f = open(BW_PATHS) + +d_cid = {} +for line in f: + line = line.strip() + lst = line.split('/') + bwfile = get_bw_file_name(lst) + sample_name = get_sample_name(bwfile) + sample_no = get_sample_number(bwfile) + library_id = get_library_id(lst[0]) + name, gid = get_gene_id_and_name(sample_name, name_map_dict) + path = os.path.join(PARENT_DIR, line) + + if '2_' in library_id: # some in-house library id starts with 2_, indicating that it is a replicate. + index = library_id.find('_') + library_id = library_id[index+1:] + cid = '@C00011000%s%s' % (library_id.zfill(3), sample_no) # the highest digit of the last 9 digits is 1 to indicate that it is a replicate. library number, 3 digits. sample number, 2 digits. + else: + cid = '@C00010000%s%s' % (library_id.zfill(3), sample_no) + + # handle duplicate library id + if not cid in d_cid: + d_cid[cid] = 1 + else: # produce a new library id + d_cid[cid] += 1 + count = 26 + head = cid[-2:] # last two digits + while True: + new_cid = cid[:-2] + '%02d' % (count) + if count >= 100: + print('Error: count must be less than 100.') + sys.exit() + count += 1 + if not new_cid in d_cid: + break + cid = new_cid + d_cid[cid] = 1 + + # print contents for the parameter file + print(cid) + print('PROTEIN_ID:%s' % (gid)) + print('PROTEIN_NAME:%s' % (name)) + print('DATA_NAME:%s' % (sample_name)) + print('DATA_FORMAT:%s' % ('bw')) + print('DESCRIPTION:in house ChIP data') + print('LOCATION:%s' % (path)) + print('NOTE:') + print('') + +f.close() -- cgit v1.2.1