# Usage: python get_binding.py parameter_for_buildCmatrix.txt
#
#        Manually change CHR_INFO in parameter_for_buildCmatrix.txt if
#        your organism is not Arabidopsis.  Set REBUILD_LIST in
#        parameter_for_buildCmatrix.txt if you only want to make
#        binding column for a few ChIP-seq IDs (useful when adding new
#        ChIP-seq data but not wanting to re-generate existing binding
#        files).
#
#
# Purpose: make individual binding column files, one for each ChIP-seq
#          ID.  These files will be combined by buildCmatrix.py into a
#          big binding matrix, binding.txt.
#
# A typical column file looks like:
#
#   gene_id C0003000001450
#   AT1G01010       0
#   AT1G01020       0
#   AT1G01030       0
#   AT1G01040       0
#   AT1G01046       0
#   AT1G01050       0
#   AT1G01060       0
#   ...
#
# Last modified 5 APR 2017 slcu hui
# Last modified 4 AUG 2019 zjnu hui


import sys, os, operator, bisect
import numpy as np
import pyBigWig
from datetime import datetime


####################################
GLB_PARAM_SYMBOL    = '%%'
LCL_PARAM_SYMBOL    = '%'
DATA_SYMBOL         = '@'
####################################

def get_key_value(s):
    lst = s.split('=')
    k, v = lst[0], lst[1]
    return (k.strip(), v.strip())


def get_value(s, delimit):
    lst = s.split(delimit, 1)
    return lst[1].strip()


def make_chromosome_dict(fname):
    f = open(fname)
    lines = f.readlines()
    f.close()
    d = {}
    for line in lines:
        line = line.strip()
        if line != '' and not line.startswith('#'):
            lst = line.split('\t')
            k = lst[0]
            v = lst[1]
            d[k] = int(v)
    return d
            

def make_global_param_dict(fname):
    f = open(fname)
    d = {'GENE_FILE':'', 'TARGET_RANGE':'3000', 'FC':'2.0', 'PVALUE':'0.0001', 'QVALUE':'0.01', 'CHR_INFO':{}, 'DESTINATION':'', 'REBUILD_LIST':[] } # change
    for line in f:
        line = line.strip()
        if line.startswith(GLB_PARAM_SYMBOL):
            s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
            lst = s.split('\t')  # separate items by TAB
            for x in lst:
                if x != '':
                    k, v = get_key_value(x)
                    if k == 'REBUILD_LIST' and v.lower() != 'all':
                        d[k] = v.split() # make a list and rewrite d[k]
                    elif k == 'CHR_INFO' and os.path.exists(v):
                        d[k] = make_chromosome_dict(v)
                    else:
                        d[k] = v
                        
    if len(d['CHR_INFO']) == 0:
        print('get_binding.py ERROR: must specify chromosome information CHR_INFO in paramter_for_buildCmatrix.txt.')
        sys.exit()
        
    f.close()
    return d


# for each dataset, make a column gene_id, sample_id
def make_chip_data(fname):
    ''' fname - a narrowPeak file
    given a file, return a dictionary. key is chromosome number. value is a list (start_pos, end_pos, strength). 
    '''

    d = {}
    if not os.path.exists(fname):
        print('get_binding: cannot find file %s' % (fname))
        sys.exit()

    f = open(fname)
    lines = f.readlines()
    f.close()
    if not check_valid_peak_line(lines[0]): # very basic check
        print('get_binding: %s is not a valid BED file as in the first line the chromosome is not a number or starts with chr. ignored.' % (fname))
        return d
    for line in lines:
        line = line.strip()
        lst = line.split()
        c = lst[0]

        rindex = c.lower().rfind('r') # handle chr3 case, get 3 only. If it is Pt or Mt, use it.
        if  rindex != -1:
            c = c[rindex+1:].strip()
            
        ss = int(lst[1])
        ee = int(lst[2])
        strength = lst[4] # the 5th column is used as strength
        if not c in d:
            d[c] = [(ss, ee, strength)]
        else:
            d[c].append((ss, ee, strength))

    for k in d:
        d[k] = sorted(d[k], key=operator.itemgetter(0, 1)) # sort by start position, then by end position
    return d


def make_data_dict(fname):
    ''' fname is paramter_for_buildCmatrix.txt '''

    d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100
    f = open(fname)
    lines = f.readlines()
    f.close()
    for line in lines:
        line = line.strip()
        if line == '' or line.startswith('#'):
            continue
        if line.startswith(DATA_SYMBOL):
            s = line[line.rfind(DATA_SYMBOL[-1])+1:]
            s = s.strip()
            if s in d:
                print('get_binding: ID %s is duplicate. Check paramter_for_buildCmatrix.txt.' % (s))
                sys.exit()
            d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
        if line.startswith('DESCRIPTION:'):
            d[s]['DESCRIPTION'] = get_value(line, ':')
        elif line.startswith('PROTEN_NAME:'):
            d[s]['PROTEN_NAME'] = get_value(line, ':')
        elif line.startswith('PROTEIN_ID:'):
            d[s]['PROTEIN_ID'] = get_value(line, ':')
        elif line.startswith('DATA_NAME:'):
            d[s]['DATA_NAME'] = get_value(line, ':')                        
        elif line.startswith('DATA_FORMAT:'):
            d[s]['DATA_FORMAT'] = get_value(line, ':')
        elif line.startswith('LOCATION:'):
            d[s]['LOCATION'] = get_value(line, ':')
            if os.path.exists(d[s]['LOCATION']):
                d['ID_LIST'].append(s)
            else:
                print('get_binding: I could not find file\n%s\nfor ChIP-seq ID %s.  Ignore this ID.' % (d[s]['LOCATION'], s))
        elif line.startswith('NOTE:'):
            d[s]['NOTE'] = get_value(line, ':')
        elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
            make_local_parameter(d[s]['PARAM'], line)

    return d


def get_interval(c, s, e, o, flank, chr_info_dict):
    '''
    o --- +  positive strand, don't include gene body
          ++ positive strand, include gene body
          -  negative strand, don't include gene body
          -- negative strand, include gene body
          *  include both sides and gene body
    '''

    if o == '+':
        return (np.max([0, s-flank]), s)
    elif o == '++': # also include gene body
        return (np.max([0, s-flank]), e)
    elif o == '-':
        return (e, np.min([e+flank, chr_info_dict[c]]))
    elif o == '--':
        return (s, np.min([e+flank, chr_info_dict[c]]))
    elif o == '*': # both sides and gene body
        return ( np.max([0, s-flank]), np.min([e+flank, chr_info_dict[c]]) )
    return (-1, -1) # should not be here


def check_valid_peak_line(s):
    s = s.strip()
    lst = s.split()
    if lst[0].isdigit() or lst[0].lower().startswith('chr'):
        return True
    return False


def get_interval_intersection(chromosome, start_pos, end_pos, chip_dict):
    ''' check with a given interval has intersection with peaks from a ChIP-seq '''
    if len(chip_dict) == 0 or not chromosome in chip_dict:
        return []

    result = []
    lst = chip_dict[chromosome] # get a list of intervals in that chromosome
    n = len(lst)
    slst, elst, strength_lst = zip(*lst) # make three sub-lists
    index1 = max(0, bisect.bisect(elst, start_pos)-2) # get start position
    index2 = min(bisect.bisect(slst, end_pos)+2, n-1) # get end position
    sublst = lst[index1:index2] # these intervals potentially have intersections with given interval
    #print('\t\t\tDEBUG sublst length: %d (index1 %d, index2 %d)' % (len(sublst), index1, index2))
    for t in sublst:
        ss = t[0]
        ee = t[1]
        strength = t[2]
        if start_pos <= ee and end_pos >= ss:
            result.append(int(strength))
            #print('chromosome=%s start_pos=%d end_pos=%d c=%s ss=%d ee=%d' % (chromosome, start_pos, end_pos, c, ss, ee))
    return result


def make_o(c, glb_param_dict):
    ''' make orientation '''
    ig = glb_param_dict['INCLUDE_GENE_BODY'].upper() == 'YES' # include gene body
    both_side = glb_param_dict['BOTH_SIDE'].upper() == 'YES'
    if both_side:
        return '*'
    if ig:
        return 2*c # also include gene body
    return c
    

def calc_intensity(t, bw, norm_by_gene_length=0):
    '''
    t - a tuple, chromosome, start and end position
    '''
    chromosome = t[0]
    start = t[1]
    end = t[2]
    if end < start:
        print('get_binding.py calc_intensity: start position must be less than end position')
        sys.exit()

    I = bw.intervals(chromosome, start, end) # a list of intervals
    
    sum_area = 0.0
    if I: # is I is not empty
        for t in I:
            s0 = max(start, t[0])
            e0 = min(end, t[1])
            h = t[2]
            #print('=== %d %d %g' % (s0, e0, h))
            sum_area += h * (e0 - s0)

    if norm_by_gene_length != 0:
        return 1000.0 * sum_area / (end - start)
    else:
        return 1.0 * sum_area

    
def get_update_date(s):
    index = s.find('update:')
    if index < 0:
        return None
    result = s[s.rfind('update:')+7:].strip()
    if result.isdigit() and len(result) == 6:
        return result
    else:
        return '00000000'


def make_table(gene_file, data_dict, glb_param_dict):
    ''' 
    Each line in gene_file contains TAB-separated fields: gene_id, gene_name, chr, start, end, strand, description (optional).
    '''

    if glb_param_dict['REBUILD_LIST'] == []: # when not specified, use all
        id_lst = data_dict['ID_LIST']
    else:
        id_lst = glb_param_dict['REBUILD_LIST']
        for c in id_lst:
            if not c in data_dict['ID_LIST']:
                print('get_binding: %s has no corresponding ChIP-seq data (narrowPeak or bw)' % (c))
                sys.exit()
    
    chip_data_dict = {}
    for myid in id_lst:
        chip_file = data_dict[myid]['LOCATION']

        if not os.path.exists(chip_file):
            print('get_binding: file %s dose not exists.' % (chip_file))
            sys.exit()
        
        if data_dict[myid]['DATA_FORMAT'].upper() == 'NARROWPEAK':
            chip_data = make_chip_data(chip_file)
            
        elif data_dict[myid]['DATA_FORMAT'].upper() == 'BW':
            chip_data = pyBigWig.open(chip_file)

        else:
            print('get_binding: data format %s not supported!' % (data_dict[myid]['DATA_FORMAT']))
            sys.exit()
            
        chip_data_dict[myid] = chip_data    
    

    f = open(gene_file)
    lines = f.readlines() # lines contain all lines in gene_file.txt
    f.close()

    dest = glb_param_dict['DESTINATION'] # individual binding column files will be put here
    if not os.path.isdir(dest):
        os.makedirs(dest)
    
    for myid in id_lst:  # for each ChIP-seq ID, make a file in DESTINATION with file name such as C0001100007100.txt
        #print('Processing %s' % (myid))
        fname = os.path.join(dest, myid + '.txt')
        if os.path.exists(fname): # this file has been created before.  Remake the binding file if and only if the update date in the NOTE: field is greater than the file's modification date.
            file_modification_time = datetime.fromtimestamp(os.path.getmtime(fname)).strftime('%Y%m%d')
            file_update_time = get_update_date(data_dict[myid]['NOTE'])
            if file_update_time == None:
                continue
            if file_update_time <= file_modification_time:
                continue
            
        f = open(fname, 'w')
        content = 'gene_id\t%s\n' % (myid)

        for line in lines:
            line = line.strip()
            lst = line.split('\t')
            gene_id   = lst[0]
            gene_name = lst[1]
            c         = lst[2] # chromosome, a letter, e.g., '1', '2', etc.
            s         = int(lst[3])
            e         = int(lst[4])
            o         = make_o(lst[5], glb_param_dict) # gene oritentation, + or -

            flank = int(glb_param_dict['TARGET_RANGE'])
            interval = get_interval(c, s, e, o, flank, glb_param_dict['CHR_INFO']) # only consider upstream

            s0 = gene_id

            if data_dict[myid]['DATA_FORMAT'].upper() == 'NARROWPEAK':
                intersection_list = get_interval_intersection(c, interval[0], interval[1], chip_data_dict[myid])
                if intersection_list != []:
                    s0 += '\t'  + str(np.max(intersection_list)) # get the maximum value in the interval [change]
                else:
                    s0 += '\t' + '0'

            if data_dict[myid]['DATA_FORMAT'].upper() == 'BW':
                t = (c, interval[0], interval[1])
                try:
                    z = calc_intensity(t, chip_data_dict[myid], 1)
                except RuntimeError:
                    z = -1.0
                s0 += '\t'  + '%4.2f' % (z)

            content += s0 + '\n'
        
        f.write(content)
        f.close()
        
    # close bw files
    for myid in id_lst:
        if data_dict[myid]['DATA_FORMAT'].upper() == 'BW':
            chip_data_dict[myid].close()


## main
param_file = sys.argv[1] # should be parameter_for_buildCmatrix.txt
global_param_dict = make_global_param_dict(param_file)
data_dict = make_data_dict(param_file)
print('get_binding: I will produce binding column files for the following %d ChIP-seq IDs:\n%s.'
      % (len(data_dict['ID_LIST']), ','.join(data_dict['ID_LIST'])))
make_table(global_param_dict['GENE_FILE'], data_dict, global_param_dict)