From 97fdefab064f63642fa3ece05b807d29b459df31 Mon Sep 17 00:00:00 2001 From: Hui Lan Date: Wed, 4 Dec 2019 19:03:19 +0800 Subject: brain: add python and R code to local repository. --- Code/buildCmatrix.py | 234 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 Code/buildCmatrix.py (limited to 'Code/buildCmatrix.py') diff --git a/Code/buildCmatrix.py b/Code/buildCmatrix.py new file mode 100644 index 0000000..5cbd30c --- /dev/null +++ b/Code/buildCmatrix.py @@ -0,0 +1,234 @@ +# Usage: python buildCmatrix.py parameter_for_buildCmatrix.txt > binding.txt +# +# Purpose: combine all binding columns into a matrix, binding.txt. +# Each ChIP-seq experiment has the following binding column file in +# the following format +# +# gene_id C0003000001450 +# AT1G01010 0 +# AT1G01020 0 +# AT1G01030 0 +# AT1G01040 0 +# AT1G01046 0 +# AT1G01050 0 +# AT1G01060 0 +# ... +# +# These column files are stored under DESTINATION in parameter_for_buildCmatrix.txt +# Created on 3 JAN 2017 hui SLCU +# Last modified 5 APR 2017 SLCU hui +# Last modified 6 Aug 2019 Hui Lan [now accept a second command-line argument 'include-all'] + +import sys, os +from datetime import datetime + +#################################### +GLB_PARAM_SYMBOL = '%%' +LCL_PARAM_SYMBOL = '%' +DATA_SYMBOL = '@' # followed by data name, or condition name, should be unique +CHIP_SYMBOL = 'ChIP' # followed by ChIP peak file path +CHIP_GENE_SYMBOL = 'ChIP_GENE' # followed by ChIP protein name (in upper case, gene id or gene name) +RNA_SYMBOL = 'RNA' # followed by RNA data, a TPM table, first column is gene id, second column is TPM value +BIGWIG_SYMBOL = 'BIGWIG' # future work +DESCRI_SYMBOL = 'DESCRIPTION' # followed by data description +#################################### + + +def get_key_value(s): + lst = s.split('=', 1) + k, v = lst[0], lst[1] + return (k.strip(), v.strip()) + + +def get_value(s, delimit): + lst = s.split(delimit, 1) + return lst[1].strip() + + +def make_global_param_dict(fname): + f = open(fname) + d = {'GENE_FILE':'', 'TARGET_RANGE':'3000', 'FC':'2.0', 'PVALUE':'0.0001', 'QVALUE':'0.01', 'DESTINATION':'', 'REBUILD_LIST':[] } # change + for line in f: + line = line.strip() + if line.startswith(GLB_PARAM_SYMBOL): + s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:] + lst = s.split('\t') # separate items by TAB + for x in lst: + if x != '': + k, v = get_key_value(x) + d[k] = v + if k == 'REBUILD_LIST' and v.lower() != 'all' and v != '': + d[k] = v.split() # make a list and rewrite d[k] + elif k == 'REBUILD_LIST': + d[k] = [] + f.close() + return d + + +def make_data_dict(fname): + ''' Scan parameter_for_buildCmatrix.txt and get its information into a dictionary, where key is ChIP-seq experiment ID, and value is a dictionary containing information for that experiment''' + + # d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100 + # f = open(fname) + # lines = f.readlines() + # f.close() + # for line in lines: + # line = line.strip() + # if line == '' or line.startswith('#'): + # continue + # if line.startswith(DATA_SYMBOL): + # s = line[line.rfind(DATA_SYMBOL[-1])+1:] + # s = s.strip() # s is ChIP-seq ID + # if s in d: + # print('ID %s duplicate' % (s)) + # sys.exit() + # d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''} + # d['ID_LIST'].append(s) + # if line.startswith('DESCRIPTION:'): + # d[s]['DESCRIPTION'] = get_value(line, ':') + # elif line.startswith('PROTEN_NAME:'): + # d[s]['PROTEN_NAME'] = get_value(line, ':') + # elif line.startswith('PROTEIN_ID:'): + # d[s]['PROTEIN_ID'] = get_value(line, ':') + # elif line.startswith('DATA_NAME:'): + # d[s]['DATA_NAME'] = get_value(line, ':') + # elif line.startswith('DATA_FORMAT:'): + # d[s]['DATA_FORMAT'] = get_value(line, ':') + # elif line.startswith('LOCATION:'): + # d[s]['LOCATION'] = get_value(line, ':') + # elif line.startswith('NOTE:'): + # d[s]['NOTE'] = get_value(line, ':') + # elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL): + # make_local_parameter(d[s]['PARAM'], line) + + # return d + + # Essentially the same as make_data_dict from get_binding.py + d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100 + f = open(fname) + lines = f.readlines() + f.close() + for line in lines: + line = line.strip() + if line == '' or line.startswith('#'): + continue + if line.startswith(DATA_SYMBOL): + s = line[line.rfind(DATA_SYMBOL[-1])+1:] + s = s.strip() + if s in d: # ID is duplicate. Check paramter_for_buildCmatrix.txt + sys.exit() + d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''} + if line.startswith('DESCRIPTION:'): + d[s]['DESCRIPTION'] = get_value(line, ':') + elif line.startswith('PROTEN_NAME:'): + d[s]['PROTEN_NAME'] = get_value(line, ':') + elif line.startswith('PROTEIN_ID:'): + d[s]['PROTEIN_ID'] = get_value(line, ':') + elif line.startswith('DATA_NAME:'): + d[s]['DATA_NAME'] = get_value(line, ':') + elif line.startswith('DATA_FORMAT:'): + d[s]['DATA_FORMAT'] = get_value(line, ':') + elif line.startswith('LOCATION:'): + d[s]['LOCATION'] = get_value(line, ':') + if os.path.exists(d[s]['LOCATION']): # include this ID only when its file exists. It could happen the location no longer has that file. + d['ID_LIST'].append(s) + elif line.startswith('NOTE:'): + d[s]['NOTE'] = get_value(line, ':') + elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL): + make_local_parameter(d[s]['PARAM'], line) + + d['ID_LIST'] = sorted(d['ID_LIST']) + return d + + +def make_chip_data(fname): + ''' convert each binding column file into a dictionary ''' + d = {} + if not os.path.exists(fname): + print('buildCmatrix: Cannot find file %s' % (fname)) + sys.exit() + + f = open(fname) + lines = f.readlines() + f.close() + + for line in lines[1:]: # ignore first line, such as 'gene_id C0001100007100' + line = line.strip() + lst = line.split() + g = lst[0] + v = lst[1] + d[g] = v + + return d + + +def get_update_date(s): + index = s.find('update:') + if index < 0: + return '00000000' + result = s[s.rfind('update:')+7:].strip() + if result.isdigit() and len(result) == 6: + return result + else: + return '00000000' + +def make_table(gene_file, data_dict, glb_param_dict): + ''' + Each line in gene file contains TAB-separated fields: gene_id, gene_name, chr, start, end, strand, description (optional) + ''' + + if glb_param_dict['REBUILD_LIST'] == []: # if not specified, use all + id_lst_all = data_dict['ID_LIST'] + else: + id_lst_all = glb_param_dict['REBUILD_LIST'] + + # When we build binding.txt, we don' include ChIP data marked with 'obsolete' in its NOTE field. + id_lst = [] + for i in id_lst_all: + note = data_dict[i]['NOTE'].lower() + curr_date = datetime.now().strftime('%Y%m%d') + include_this_id = not 'obsolete' in note \ + and int(curr_date) - int(get_update_date(data_dict[i]['NOTE'])) < 7 # a ChIP-seq older than 7 days will be ignored. + if include_this_id or FORCE_INCLUDE_ALL: # don't include ChIP-seq marked with 'obsolete' + id_lst.append(i) + + if id_lst == []: + print('buildCmatrix: ChIP-seq ID list is empty.') + sys.exit() + + # head line of binding.txt + id_str = 'gene_id' + for myid in id_lst: + id_str += '\t' + myid + print(id_str) + + chip_data_dict = {} + for myid in id_lst: + chip_file = os.path.join(glb_param_dict['DESTINATION'], myid + '.txt') # each binding column file has name such as C0001100007100.txt + chip_data = make_chip_data(chip_file) + chip_data_dict[myid] = chip_data + + f = open(gene_file) + for line in f: + line = line.strip() + lst = line.split('\t') + gene_id = lst[0] + s0 = gene_id + for myid in id_lst: + if gene_id in chip_data_dict[myid]: + s0 += '\t' + chip_data_dict[myid][gene_id] + else: + s0 += '\t' + '-1' + print(s0) + f.close() + + + +## main +param_file = sys.argv[1] +FORCE_INCLUDE_ALL = False +if len(sys.argv) > 2: + FORCE_INCLUDE_ALL = sys.argv[2].lower() == 'include-all' +global_param_dict = make_global_param_dict(param_file) +data_dict = make_data_dict(param_file) +make_table(global_param_dict['GENE_FILE'], data_dict, global_param_dict) -- cgit v1.2.1