# Usage: python buildCmatrix.py parameter_for_buildCmatrix.txt > binding.txt
#
# Purpose: combine all binding columns into a matrix, binding.txt.
# Each ChIP-seq experiment has the following binding column file in
# the following format
#
#   gene_id C0003000001450
#   AT1G01010       0
#   AT1G01020       0
#   AT1G01030       0
#   AT1G01040       0
#   AT1G01046       0
#   AT1G01050       0
#   AT1G01060       0
#   ...
#
# These column files are stored under DESTINATION in parameter_for_buildCmatrix.txt
# Created on 3 JAN 2017  hui  SLCU
# Last modified 5 APR 2017 SLCU hui
# Last modified 6 Aug 2019 Hui Lan <lanhui@zjnu.edu.cn>  [now accept a second command-line argument 'include-all']

import sys, os
from datetime import datetime

####################################
GLB_PARAM_SYMBOL    = '%%'
LCL_PARAM_SYMBOL    = '%'
DATA_SYMBOL         = '@'               # followed by data name, or condition name, should be unique
CHIP_SYMBOL         = 'ChIP'            # followed by ChIP peak file path
CHIP_GENE_SYMBOL    = 'ChIP_GENE'       # followed by ChIP protein name (in upper case, gene id or gene name)
RNA_SYMBOL          = 'RNA'             # followed by RNA data, a TPM table, first column is gene id, second column is TPM value
BIGWIG_SYMBOL       = 'BIGWIG'          # future work
DESCRI_SYMBOL       = 'DESCRIPTION'     # followed by data description
####################################


def get_key_value(s):
    lst = s.split('=', 1)
    k, v = lst[0], lst[1]
    return (k.strip(), v.strip())


def get_value(s, delimit):
    lst = s.split(delimit, 1)
    return lst[1].strip()


def make_global_param_dict(fname):
    f = open(fname)
    d = {'GENE_FILE':'', 'TARGET_RANGE':'3000', 'FC':'2.0', 'PVALUE':'0.0001', 'QVALUE':'0.01', 'DESTINATION':'', 'REBUILD_LIST':[] } # change
    for line in f:
        line = line.strip()
        if line.startswith(GLB_PARAM_SYMBOL):
            s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
            lst = s.split('\t')  # separate items by TAB
            for x in lst:
                if x != '':
                    k, v = get_key_value(x)
                    d[k] = v
                    if k == 'REBUILD_LIST' and v.lower() != 'all' and v != '':
                        d[k] = v.split() # make a list and rewrite d[k]
                    elif k == 'REBUILD_LIST':
                        d[k] = []
    f.close()
    return d


def make_data_dict(fname):
    ''' Scan parameter_for_buildCmatrix.txt and get its information into a dictionary, where key is ChIP-seq experiment ID, and value is a dictionary containing information for that experiment'''

    # d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100
    # f = open(fname)
    # lines = f.readlines()
    # f.close()
    # for line in lines:
    #     line = line.strip()
    #     if line == '' or line.startswith('#'):
    #         continue
    #     if line.startswith(DATA_SYMBOL):
    #         s = line[line.rfind(DATA_SYMBOL[-1])+1:]
    #         s = s.strip() # s is ChIP-seq ID
    #         if s in d:
    #             print('ID %s duplicate' % (s))
    #             sys.exit()
    #         d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
    #         d['ID_LIST'].append(s)
    #     if line.startswith('DESCRIPTION:'):
    #         d[s]['DESCRIPTION'] = get_value(line, ':')
    #     elif line.startswith('PROTEN_NAME:'):
    #         d[s]['PROTEN_NAME'] = get_value(line, ':')
    #     elif line.startswith('PROTEIN_ID:'):
    #         d[s]['PROTEIN_ID'] = get_value(line, ':')
    #     elif line.startswith('DATA_NAME:'):
    #         d[s]['DATA_NAME'] = get_value(line, ':')                        
    #     elif line.startswith('DATA_FORMAT:'):
    #         d[s]['DATA_FORMAT'] = get_value(line, ':')
    #     elif line.startswith('LOCATION:'):
    #         d[s]['LOCATION'] = get_value(line, ':')
    #     elif line.startswith('NOTE:'):
    #         d[s]['NOTE'] = get_value(line, ':')
    #     elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
    #         make_local_parameter(d[s]['PARAM'], line)

    # return d

    # Essentially the same as make_data_dict from get_binding.py
    d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100
    f = open(fname)
    lines = f.readlines()
    f.close()
    for line in lines:
        line = line.strip()
        if line == '' or line.startswith('#'):
            continue
        if line.startswith(DATA_SYMBOL):
            s = line[line.rfind(DATA_SYMBOL[-1])+1:]
            s = s.strip()
            if s in d: # ID is duplicate. Check paramter_for_buildCmatrix.txt
                sys.exit()
            d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
        if line.startswith('DESCRIPTION:'):
            d[s]['DESCRIPTION'] = get_value(line, ':')
        elif line.startswith('PROTEN_NAME:'):
            d[s]['PROTEN_NAME'] = get_value(line, ':')
        elif line.startswith('PROTEIN_ID:'):
            d[s]['PROTEIN_ID'] = get_value(line, ':')
        elif line.startswith('DATA_NAME:'):
            d[s]['DATA_NAME'] = get_value(line, ':')                        
        elif line.startswith('DATA_FORMAT:'):
            d[s]['DATA_FORMAT'] = get_value(line, ':')
        elif line.startswith('LOCATION:'):
            d[s]['LOCATION'] = get_value(line, ':')
            if os.path.exists(d[s]['LOCATION']): # include this ID only when its file exists.  It could happen the location no longer has that file.
                d['ID_LIST'].append(s)
        elif line.startswith('NOTE:'):
            d[s]['NOTE'] = get_value(line, ':')
        elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
            make_local_parameter(d[s]['PARAM'], line)

    d['ID_LIST'] = sorted(d['ID_LIST'])
    return d


def make_chip_data(fname):
    ''' convert each binding column file into a dictionary '''
    d = {}
    if not os.path.exists(fname):
        print('buildCmatrix: Cannot find file %s' % (fname))
        sys.exit()

    f = open(fname)
    lines = f.readlines()
    f.close()

    for line in lines[1:]: # ignore first line, such as 'gene_id    C0001100007100'
        line = line.strip()
        lst = line.split()
        g = lst[0]
        v = lst[1]
        d[g] = v
        
    return d


def get_update_date(s):
    index = s.find('update:')
    if index < 0:
        return '00000000'
    result = s[s.rfind('update:')+7:].strip()
    if result.isdigit() and len(result) == 6:
        return result
    else:
        return '00000000'

def make_table(gene_file, data_dict, glb_param_dict):
    ''' 
    Each line in gene file contains TAB-separated fields: gene_id, gene_name, chr, start, end, strand, description (optional)
    '''

    if glb_param_dict['REBUILD_LIST'] == []: # if not specified, use all
        id_lst_all = data_dict['ID_LIST']
    else:
        id_lst_all = glb_param_dict['REBUILD_LIST']

    # When we build binding.txt, we don' include ChIP data marked with 'obsolete' in its NOTE field.
    id_lst = []
    for i in id_lst_all:
        note = data_dict[i]['NOTE'].lower()
        curr_date =  datetime.now().strftime('%Y%m%d')
        include_this_id = not 'obsolete' in note \
                          and int(curr_date) - int(get_update_date(data_dict[i]['NOTE'])) < 7 # a ChIP-seq older than 7 days will be ignored.
        if include_this_id or FORCE_INCLUDE_ALL: # don't include ChIP-seq marked with 'obsolete'
            id_lst.append(i)
            
    if id_lst == []:
        print('buildCmatrix: ChIP-seq ID list is empty.')
        sys.exit()

    # head line of binding.txt
    id_str = 'gene_id'  
    for myid in id_lst:
        id_str += '\t' + myid
    print(id_str)

    chip_data_dict = {}
    for myid in id_lst:
        chip_file = os.path.join(glb_param_dict['DESTINATION'], myid + '.txt') # each binding column file has name such as C0001100007100.txt
        chip_data = make_chip_data(chip_file)
        chip_data_dict[myid] = chip_data    

    f = open(gene_file)
    for line in f:
        line = line.strip()
        lst = line.split('\t')
        gene_id = lst[0]
        s0 = gene_id
        for myid in id_lst:
            if gene_id in chip_data_dict[myid]:
                s0 += '\t' + chip_data_dict[myid][gene_id]
            else:
                s0 += '\t' + '-1'
        print(s0)
    f.close()



## main
param_file = sys.argv[1]
FORCE_INCLUDE_ALL = False
if len(sys.argv) > 2:
    FORCE_INCLUDE_ALL = sys.argv[2].lower() == 'include-all'
global_param_dict = make_global_param_dict(param_file)
data_dict = make_data_dict(param_file)
make_table(global_param_dict['GENE_FILE'], data_dict, global_param_dict)