diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2019-12-04 19:03:19 +0800 |
commit | 97fdefab064f63642fa3ece05b807d29b459df31 (patch) | |
tree | a058530023224f3e35b1783996f3530c80c04bc5 /Code/buildCmatrix.py |
brain: add python and R code to local repository.
Diffstat (limited to 'Code/buildCmatrix.py')
-rw-r--r-- | Code/buildCmatrix.py | 234 |
1 files changed, 234 insertions, 0 deletions
diff --git a/Code/buildCmatrix.py b/Code/buildCmatrix.py new file mode 100644 index 0000000..5cbd30c --- /dev/null +++ b/Code/buildCmatrix.py @@ -0,0 +1,234 @@ +# Usage: python buildCmatrix.py parameter_for_buildCmatrix.txt > binding.txt
+#
+# Purpose: combine all binding columns into a matrix, binding.txt.
+# Each ChIP-seq experiment has the following binding column file in
+# the following format
+#
+# gene_id C0003000001450
+# AT1G01010 0
+# AT1G01020 0
+# AT1G01030 0
+# AT1G01040 0
+# AT1G01046 0
+# AT1G01050 0
+# AT1G01060 0
+# ...
+#
+# These column files are stored under DESTINATION in parameter_for_buildCmatrix.txt
+# Created on 3 JAN 2017 hui SLCU
+# Last modified 5 APR 2017 SLCU hui
+# Last modified 6 Aug 2019 Hui Lan <lanhui@zjnu.edu.cn> [now accept a second command-line argument 'include-all']
+
+import sys, os
+from datetime import datetime
+
+####################################
+GLB_PARAM_SYMBOL = '%%'
+LCL_PARAM_SYMBOL = '%'
+DATA_SYMBOL = '@' # followed by data name, or condition name, should be unique
+CHIP_SYMBOL = 'ChIP' # followed by ChIP peak file path
+CHIP_GENE_SYMBOL = 'ChIP_GENE' # followed by ChIP protein name (in upper case, gene id or gene name)
+RNA_SYMBOL = 'RNA' # followed by RNA data, a TPM table, first column is gene id, second column is TPM value
+BIGWIG_SYMBOL = 'BIGWIG' # future work
+DESCRI_SYMBOL = 'DESCRIPTION' # followed by data description
+####################################
+
+
+def get_key_value(s):
+ lst = s.split('=', 1)
+ k, v = lst[0], lst[1]
+ return (k.strip(), v.strip())
+
+
+def get_value(s, delimit):
+ lst = s.split(delimit, 1)
+ return lst[1].strip()
+
+
+def make_global_param_dict(fname):
+ f = open(fname)
+ d = {'GENE_FILE':'', 'TARGET_RANGE':'3000', 'FC':'2.0', 'PVALUE':'0.0001', 'QVALUE':'0.01', 'DESTINATION':'', 'REBUILD_LIST':[] } # change
+ for line in f:
+ line = line.strip()
+ if line.startswith(GLB_PARAM_SYMBOL):
+ s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
+ lst = s.split('\t') # separate items by TAB
+ for x in lst:
+ if x != '':
+ k, v = get_key_value(x)
+ d[k] = v
+ if k == 'REBUILD_LIST' and v.lower() != 'all' and v != '':
+ d[k] = v.split() # make a list and rewrite d[k]
+ elif k == 'REBUILD_LIST':
+ d[k] = []
+ f.close()
+ return d
+
+
+def make_data_dict(fname):
+ ''' Scan parameter_for_buildCmatrix.txt and get its information into a dictionary, where key is ChIP-seq experiment ID, and value is a dictionary containing information for that experiment'''
+
+ # d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100
+ # f = open(fname)
+ # lines = f.readlines()
+ # f.close()
+ # for line in lines:
+ # line = line.strip()
+ # if line == '' or line.startswith('#'):
+ # continue
+ # if line.startswith(DATA_SYMBOL):
+ # s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ # s = s.strip() # s is ChIP-seq ID
+ # if s in d:
+ # print('ID %s duplicate' % (s))
+ # sys.exit()
+ # d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ # d['ID_LIST'].append(s)
+ # if line.startswith('DESCRIPTION:'):
+ # d[s]['DESCRIPTION'] = get_value(line, ':')
+ # elif line.startswith('PROTEN_NAME:'):
+ # d[s]['PROTEN_NAME'] = get_value(line, ':')
+ # elif line.startswith('PROTEIN_ID:'):
+ # d[s]['PROTEIN_ID'] = get_value(line, ':')
+ # elif line.startswith('DATA_NAME:'):
+ # d[s]['DATA_NAME'] = get_value(line, ':')
+ # elif line.startswith('DATA_FORMAT:'):
+ # d[s]['DATA_FORMAT'] = get_value(line, ':')
+ # elif line.startswith('LOCATION:'):
+ # d[s]['LOCATION'] = get_value(line, ':')
+ # elif line.startswith('NOTE:'):
+ # d[s]['NOTE'] = get_value(line, ':')
+ # elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
+ # make_local_parameter(d[s]['PARAM'], line)
+
+ # return d
+
+ # Essentially the same as make_data_dict from get_binding.py
+ d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#'):
+ continue
+ if line.startswith(DATA_SYMBOL):
+ s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ s = s.strip()
+ if s in d: # ID is duplicate. Check paramter_for_buildCmatrix.txt
+ sys.exit()
+ d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+ elif line.startswith('PROTEN_NAME:'):
+ d[s]['PROTEN_NAME'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_ID:'):
+ d[s]['PROTEIN_ID'] = get_value(line, ':')
+ elif line.startswith('DATA_NAME:'):
+ d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+ if os.path.exists(d[s]['LOCATION']): # include this ID only when its file exists. It could happen the location no longer has that file.
+ d['ID_LIST'].append(s)
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+ elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
+ make_local_parameter(d[s]['PARAM'], line)
+
+ d['ID_LIST'] = sorted(d['ID_LIST'])
+ return d
+
+
+def make_chip_data(fname):
+ ''' convert each binding column file into a dictionary '''
+ d = {}
+ if not os.path.exists(fname):
+ print('buildCmatrix: Cannot find file %s' % (fname))
+ sys.exit()
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+
+ for line in lines[1:]: # ignore first line, such as 'gene_id C0001100007100'
+ line = line.strip()
+ lst = line.split()
+ g = lst[0]
+ v = lst[1]
+ d[g] = v
+
+ return d
+
+
+def get_update_date(s):
+ index = s.find('update:')
+ if index < 0:
+ return '00000000'
+ result = s[s.rfind('update:')+7:].strip()
+ if result.isdigit() and len(result) == 6:
+ return result
+ else:
+ return '00000000'
+
+def make_table(gene_file, data_dict, glb_param_dict):
+ '''
+ Each line in gene file contains TAB-separated fields: gene_id, gene_name, chr, start, end, strand, description (optional)
+ '''
+
+ if glb_param_dict['REBUILD_LIST'] == []: # if not specified, use all
+ id_lst_all = data_dict['ID_LIST']
+ else:
+ id_lst_all = glb_param_dict['REBUILD_LIST']
+
+ # When we build binding.txt, we don' include ChIP data marked with 'obsolete' in its NOTE field.
+ id_lst = []
+ for i in id_lst_all:
+ note = data_dict[i]['NOTE'].lower()
+ curr_date = datetime.now().strftime('%Y%m%d')
+ include_this_id = not 'obsolete' in note \
+ and int(curr_date) - int(get_update_date(data_dict[i]['NOTE'])) < 7 # a ChIP-seq older than 7 days will be ignored.
+ if include_this_id or FORCE_INCLUDE_ALL: # don't include ChIP-seq marked with 'obsolete'
+ id_lst.append(i)
+
+ if id_lst == []:
+ print('buildCmatrix: ChIP-seq ID list is empty.')
+ sys.exit()
+
+ # head line of binding.txt
+ id_str = 'gene_id'
+ for myid in id_lst:
+ id_str += '\t' + myid
+ print(id_str)
+
+ chip_data_dict = {}
+ for myid in id_lst:
+ chip_file = os.path.join(glb_param_dict['DESTINATION'], myid + '.txt') # each binding column file has name such as C0001100007100.txt
+ chip_data = make_chip_data(chip_file)
+ chip_data_dict[myid] = chip_data
+
+ f = open(gene_file)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ gene_id = lst[0]
+ s0 = gene_id
+ for myid in id_lst:
+ if gene_id in chip_data_dict[myid]:
+ s0 += '\t' + chip_data_dict[myid][gene_id]
+ else:
+ s0 += '\t' + '-1'
+ print(s0)
+ f.close()
+
+
+
+## main
+param_file = sys.argv[1]
+FORCE_INCLUDE_ALL = False
+if len(sys.argv) > 2:
+ FORCE_INCLUDE_ALL = sys.argv[2].lower() == 'include-all'
+global_param_dict = make_global_param_dict(param_file)
+data_dict = make_data_dict(param_file)
+make_table(global_param_dict['GENE_FILE'], data_dict, global_param_dict)
|