# Usage: python buildCmatrix.py parameter_for_buildCmatrix.txt > binding.txt # # Purpose: combine all binding columns into a matrix, binding.txt. # Each ChIP-seq experiment has the following binding column file in # the following format # # gene_id C0003000001450 # AT1G01010 0 # AT1G01020 0 # AT1G01030 0 # AT1G01040 0 # AT1G01046 0 # AT1G01050 0 # AT1G01060 0 # ... # # These column files are stored under DESTINATION in parameter_for_buildCmatrix.txt # Created on 3 JAN 2017 hui SLCU # Last modified 5 APR 2017 SLCU hui # Last modified 6 Aug 2019 Hui Lan [now accept a second command-line argument 'include-all'] import sys, os from datetime import datetime #################################### GLB_PARAM_SYMBOL = '%%' LCL_PARAM_SYMBOL = '%' DATA_SYMBOL = '@' # followed by data name, or condition name, should be unique CHIP_SYMBOL = 'ChIP' # followed by ChIP peak file path CHIP_GENE_SYMBOL = 'ChIP_GENE' # followed by ChIP protein name (in upper case, gene id or gene name) RNA_SYMBOL = 'RNA' # followed by RNA data, a TPM table, first column is gene id, second column is TPM value BIGWIG_SYMBOL = 'BIGWIG' # future work DESCRI_SYMBOL = 'DESCRIPTION' # followed by data description #################################### def get_key_value(s): lst = s.split('=', 1) k, v = lst[0], lst[1] return (k.strip(), v.strip()) def get_value(s, delimit): lst = s.split(delimit, 1) return lst[1].strip() def make_global_param_dict(fname): f = open(fname) d = {'GENE_FILE':'', 'TARGET_RANGE':'3000', 'FC':'2.0', 'PVALUE':'0.0001', 'QVALUE':'0.01', 'DESTINATION':'', 'REBUILD_LIST':[] } # change for line in f: line = line.strip() if line.startswith(GLB_PARAM_SYMBOL): s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:] lst = s.split('\t') # separate items by TAB for x in lst: if x != '': k, v = get_key_value(x) d[k] = v if k == 'REBUILD_LIST' and v.lower() != 'all' and v != '': d[k] = v.split() # make a list and rewrite d[k] elif k == 'REBUILD_LIST': d[k] = [] f.close() return d def make_data_dict(fname): ''' Scan parameter_for_buildCmatrix.txt and get its information into a dictionary, where key is ChIP-seq experiment ID, and value is a dictionary containing information for that experiment''' # d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100 # f = open(fname) # lines = f.readlines() # f.close() # for line in lines: # line = line.strip() # if line == '' or line.startswith('#'): # continue # if line.startswith(DATA_SYMBOL): # s = line[line.rfind(DATA_SYMBOL[-1])+1:] # s = s.strip() # s is ChIP-seq ID # if s in d: # print('ID %s duplicate' % (s)) # sys.exit() # d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''} # d['ID_LIST'].append(s) # if line.startswith('DESCRIPTION:'): # d[s]['DESCRIPTION'] = get_value(line, ':') # elif line.startswith('PROTEN_NAME:'): # d[s]['PROTEN_NAME'] = get_value(line, ':') # elif line.startswith('PROTEIN_ID:'): # d[s]['PROTEIN_ID'] = get_value(line, ':') # elif line.startswith('DATA_NAME:'): # d[s]['DATA_NAME'] = get_value(line, ':') # elif line.startswith('DATA_FORMAT:'): # d[s]['DATA_FORMAT'] = get_value(line, ':') # elif line.startswith('LOCATION:'): # d[s]['LOCATION'] = get_value(line, ':') # elif line.startswith('NOTE:'): # d[s]['NOTE'] = get_value(line, ':') # elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL): # make_local_parameter(d[s]['PARAM'], line) # return d # Essentially the same as make_data_dict from get_binding.py d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100 f = open(fname) lines = f.readlines() f.close() for line in lines: line = line.strip() if line == '' or line.startswith('#'): continue if line.startswith(DATA_SYMBOL): s = line[line.rfind(DATA_SYMBOL[-1])+1:] s = s.strip() if s in d: # ID is duplicate. Check paramter_for_buildCmatrix.txt sys.exit() d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''} if line.startswith('DESCRIPTION:'): d[s]['DESCRIPTION'] = get_value(line, ':') elif line.startswith('PROTEN_NAME:'): d[s]['PROTEN_NAME'] = get_value(line, ':') elif line.startswith('PROTEIN_ID:'): d[s]['PROTEIN_ID'] = get_value(line, ':') elif line.startswith('DATA_NAME:'): d[s]['DATA_NAME'] = get_value(line, ':') elif line.startswith('DATA_FORMAT:'): d[s]['DATA_FORMAT'] = get_value(line, ':') elif line.startswith('LOCATION:'): d[s]['LOCATION'] = get_value(line, ':') if os.path.exists(d[s]['LOCATION']): # include this ID only when its file exists. It could happen the location no longer has that file. d['ID_LIST'].append(s) elif line.startswith('NOTE:'): d[s]['NOTE'] = get_value(line, ':') elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL): make_local_parameter(d[s]['PARAM'], line) d['ID_LIST'] = sorted(d['ID_LIST']) return d def make_chip_data(fname): ''' convert each binding column file into a dictionary ''' d = {} if not os.path.exists(fname): print('buildCmatrix: Cannot find file %s' % (fname)) sys.exit() f = open(fname) lines = f.readlines() f.close() for line in lines[1:]: # ignore first line, such as 'gene_id C0001100007100' line = line.strip() lst = line.split() g = lst[0] v = lst[1] d[g] = v return d def get_update_date(s): index = s.find('update:') if index < 0: return '00000000' result = s[s.rfind('update:')+7:].strip() if result.isdigit() and len(result) == 6: return result else: return '00000000' def make_table(gene_file, data_dict, glb_param_dict): ''' Each line in gene file contains TAB-separated fields: gene_id, gene_name, chr, start, end, strand, description (optional) ''' if glb_param_dict['REBUILD_LIST'] == []: # if not specified, use all id_lst_all = data_dict['ID_LIST'] else: id_lst_all = glb_param_dict['REBUILD_LIST'] # When we build binding.txt, we don' include ChIP data marked with 'obsolete' in its NOTE field. id_lst = [] for i in id_lst_all: note = data_dict[i]['NOTE'].lower() curr_date = datetime.now().strftime('%Y%m%d') include_this_id = not 'obsolete' in note \ and int(curr_date) - int(get_update_date(data_dict[i]['NOTE'])) < 7 # a ChIP-seq older than 7 days will be ignored. if include_this_id or FORCE_INCLUDE_ALL: # don't include ChIP-seq marked with 'obsolete' id_lst.append(i) if id_lst == []: print('buildCmatrix: ChIP-seq ID list is empty.') sys.exit() # head line of binding.txt id_str = 'gene_id' for myid in id_lst: id_str += '\t' + myid print(id_str) chip_data_dict = {} for myid in id_lst: chip_file = os.path.join(glb_param_dict['DESTINATION'], myid + '.txt') # each binding column file has name such as C0001100007100.txt chip_data = make_chip_data(chip_file) chip_data_dict[myid] = chip_data f = open(gene_file) for line in f: line = line.strip() lst = line.split('\t') gene_id = lst[0] s0 = gene_id for myid in id_lst: if gene_id in chip_data_dict[myid]: s0 += '\t' + chip_data_dict[myid][gene_id] else: s0 += '\t' + '-1' print(s0) f.close() ## main param_file = sys.argv[1] FORCE_INCLUDE_ALL = False if len(sys.argv) > 2: FORCE_INCLUDE_ALL = sys.argv[2].lower() == 'include-all' global_param_dict = make_global_param_dict(param_file) data_dict = make_data_dict(param_file) make_table(global_param_dict['GENE_FILE'], data_dict, global_param_dict)