brain: add python and R code to local repository.

author: Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
committer: Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
commit: 97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree: a058530023224f3e35b1783996f3530c80c04bc5 /Code/buildCmatrix.py
1 files changed, 234 insertions, 0 deletions
diff --git a/Code/buildCmatrix.py b/Code/buildCmatrix.py
new file mode 100644
index 0000000..5cbd30c
--- /dev/null
+++ b/Code/buildCmatrix.py
@@ -0,0 +1,234 @@
+# Usage: python buildCmatrix.py parameter_for_buildCmatrix.txt > binding.txt
+#
+# Purpose: combine all binding columns into a matrix, binding.txt.
+# Each ChIP-seq experiment has the following binding column file in
+# the following format
+#
+#   gene_id C0003000001450
+#   AT1G01010       0
+#   AT1G01020       0
+#   AT1G01030       0
+#   AT1G01040       0
+#   AT1G01046       0
+#   AT1G01050       0
+#   AT1G01060       0
+#   ...
+#
+# These column files are stored under DESTINATION in parameter_for_buildCmatrix.txt
+# Created on 3 JAN 2017  hui  SLCU
+# Last modified 5 APR 2017 SLCU hui
+# Last modified 6 Aug 2019 Hui Lan <lanhui@zjnu.edu.cn>  [now accept a second command-line argument 'include-all']
+
+import sys, os
+from datetime import datetime
+
+####################################
+GLB_PARAM_SYMBOL    = '%%'
+LCL_PARAM_SYMBOL    = '%'
+DATA_SYMBOL         = '@'               # followed by data name, or condition name, should be unique
+CHIP_SYMBOL         = 'ChIP'            # followed by ChIP peak file path
+CHIP_GENE_SYMBOL    = 'ChIP_GENE'       # followed by ChIP protein name (in upper case, gene id or gene name)
+RNA_SYMBOL          = 'RNA'             # followed by RNA data, a TPM table, first column is gene id, second column is TPM value
+BIGWIG_SYMBOL       = 'BIGWIG'          # future work
+DESCRI_SYMBOL       = 'DESCRIPTION'     # followed by data description
+####################################
+
+
+def get_key_value(s):
+    lst = s.split('=', 1)
+    k, v = lst[0], lst[1]
+    return (k.strip(), v.strip())
+
+
+def get_value(s, delimit):
+    lst = s.split(delimit, 1)
+    return lst[1].strip()
+
+
+def make_global_param_dict(fname):
+    f = open(fname)
+    d = {'GENE_FILE':'', 'TARGET_RANGE':'3000', 'FC':'2.0', 'PVALUE':'0.0001', 'QVALUE':'0.01', 'DESTINATION':'', 'REBUILD_LIST':[] } # change
+    for line in f:
+        line = line.strip()
+        if line.startswith(GLB_PARAM_SYMBOL):
+            s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
+            lst = s.split('\t')  # separate items by TAB
+            for x in lst:
+                if x != '':
+                    k, v = get_key_value(x)
+                    d[k] = v
+                    if k == 'REBUILD_LIST' and v.lower() != 'all' and v != '':
+                        d[k] = v.split() # make a list and rewrite d[k]
+                    elif k == 'REBUILD_LIST':
+                        d[k] = []
+    f.close()
+    return d
+
+
+def make_data_dict(fname):
+    ''' Scan parameter_for_buildCmatrix.txt and get its information into a dictionary, where key is ChIP-seq experiment ID, and value is a dictionary containing information for that experiment'''
+
+    # d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100
+    # f = open(fname)
+    # lines = f.readlines()
+    # f.close()
+    # for line in lines:
+    #     line = line.strip()
+    #     if line == '' or line.startswith('#'):
+    #         continue
+    #     if line.startswith(DATA_SYMBOL):
+    #         s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+    #         s = s.strip() # s is ChIP-seq ID
+    #         if s in d:
+    #             print('ID %s duplicate' % (s))
+    #             sys.exit()
+    #         d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+    #         d['ID_LIST'].append(s)
+    #     if line.startswith('DESCRIPTION:'):
+    #         d[s]['DESCRIPTION'] = get_value(line, ':')
+    #     elif line.startswith('PROTEN_NAME:'):
+    #         d[s]['PROTEN_NAME'] = get_value(line, ':')
+    #     elif line.startswith('PROTEIN_ID:'):
+    #         d[s]['PROTEIN_ID'] = get_value(line, ':')
+    #     elif line.startswith('DATA_NAME:'):
+    #         d[s]['DATA_NAME'] = get_value(line, ':')                        
+    #     elif line.startswith('DATA_FORMAT:'):
+    #         d[s]['DATA_FORMAT'] = get_value(line, ':')
+    #     elif line.startswith('LOCATION:'):
+    #         d[s]['LOCATION'] = get_value(line, ':')
+    #     elif line.startswith('NOTE:'):
+    #         d[s]['NOTE'] = get_value(line, ':')
+    #     elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
+    #         make_local_parameter(d[s]['PARAM'], line)
+
+    # return d
+
+    # Essentially the same as make_data_dict from get_binding.py
+    d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+    for line in lines:
+        line = line.strip()
+        if line == '' or line.startswith('#'):
+            continue
+        if line.startswith(DATA_SYMBOL):
+            s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+            s = s.strip()
+            if s in d: # ID is duplicate. Check paramter_for_buildCmatrix.txt
+                sys.exit()
+            d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+        if line.startswith('DESCRIPTION:'):
+            d[s]['DESCRIPTION'] = get_value(line, ':')
+        elif line.startswith('PROTEN_NAME:'):
+            d[s]['PROTEN_NAME'] = get_value(line, ':')
+        elif line.startswith('PROTEIN_ID:'):
+            d[s]['PROTEIN_ID'] = get_value(line, ':')
+        elif line.startswith('DATA_NAME:'):
+            d[s]['DATA_NAME'] = get_value(line, ':')                        
+        elif line.startswith('DATA_FORMAT:'):
+            d[s]['DATA_FORMAT'] = get_value(line, ':')
+        elif line.startswith('LOCATION:'):
+            d[s]['LOCATION'] = get_value(line, ':')
+            if os.path.exists(d[s]['LOCATION']): # include this ID only when its file exists.  It could happen the location no longer has that file.
+                d['ID_LIST'].append(s)
+        elif line.startswith('NOTE:'):
+            d[s]['NOTE'] = get_value(line, ':')
+        elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
+            make_local_parameter(d[s]['PARAM'], line)
+
+    d['ID_LIST'] = sorted(d['ID_LIST'])
+    return d
+
+
+def make_chip_data(fname):
+    ''' convert each binding column file into a dictionary '''
+    d = {}
+    if not os.path.exists(fname):
+        print('buildCmatrix: Cannot find file %s' % (fname))
+        sys.exit()
+
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+
+    for line in lines[1:]: # ignore first line, such as 'gene_id    C0001100007100'
+        line = line.strip()
+        lst = line.split()
+        g = lst[0]
+        v = lst[1]
+        d[g] = v
+        
+    return d
+
+
+def get_update_date(s):
+    index = s.find('update:')
+    if index < 0:
+        return '00000000'
+    result = s[s.rfind('update:')+7:].strip()
+    if result.isdigit() and len(result) == 6:
+        return result
+    else:
+        return '00000000'
+
+def make_table(gene_file, data_dict, glb_param_dict):
+    ''' 
+    Each line in gene file contains TAB-separated fields: gene_id, gene_name, chr, start, end, strand, description (optional)
+    '''
+
+    if glb_param_dict['REBUILD_LIST'] == []: # if not specified, use all
+        id_lst_all = data_dict['ID_LIST']
+    else:
+        id_lst_all = glb_param_dict['REBUILD_LIST']
+
+    # When we build binding.txt, we don' include ChIP data marked with 'obsolete' in its NOTE field.
+    id_lst = []
+    for i in id_lst_all:
+        note = data_dict[i]['NOTE'].lower()
+        curr_date =  datetime.now().strftime('%Y%m%d')
+        include_this_id = not 'obsolete' in note \
+                          and int(curr_date) - int(get_update_date(data_dict[i]['NOTE'])) < 7 # a ChIP-seq older than 7 days will be ignored.
+        if include_this_id or FORCE_INCLUDE_ALL: # don't include ChIP-seq marked with 'obsolete'
+            id_lst.append(i)
+            
+    if id_lst == []:
+        print('buildCmatrix: ChIP-seq ID list is empty.')
+        sys.exit()
+
+    # head line of binding.txt
+    id_str = 'gene_id'  
+    for myid in id_lst:
+        id_str += '\t' + myid
+    print(id_str)
+
+    chip_data_dict = {}
+    for myid in id_lst:
+        chip_file = os.path.join(glb_param_dict['DESTINATION'], myid + '.txt') # each binding column file has name such as C0001100007100.txt
+        chip_data = make_chip_data(chip_file)
+        chip_data_dict[myid] = chip_data    
+
+    f = open(gene_file)
+    for line in f:
+        line = line.strip()
+        lst = line.split('\t')
+        gene_id = lst[0]
+        s0 = gene_id
+        for myid in id_lst:
+            if gene_id in chip_data_dict[myid]:
+                s0 += '\t' + chip_data_dict[myid][gene_id]
+            else:
+                s0 += '\t' + '-1'
+        print(s0)
+    f.close()
+
+
+
+## main
+param_file = sys.argv[1]
+FORCE_INCLUDE_ALL = False
+if len(sys.argv) > 2:
+    FORCE_INCLUDE_ALL = sys.argv[2].lower() == 'include-all'
+global_param_dict = make_global_param_dict(param_file)
+data_dict = make_data_dict(param_file)
+make_table(global_param_dict['GENE_FILE'], data_dict, global_param_dict)
author	Hui Lan <lanhui@zjnu.edu.cn>	2019-12-04 19:03:19 +0800
committer	Hui Lan <lanhui@zjnu.edu.cn>	2019-12-04 19:03:19 +0800
commit	97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree	a058530023224f3e35b1783996f3530c80c04bc5 /Code/buildCmatrix.py