# Usage: python create_edges0.py parameter_for_net.txt
#
# Make it faster by spawning subprocesses.
#
# Quickly create edges using all samples in TPM.txt.  TF and targets
# are from target_tf.txt.  Results will be written to
# EDGE_POOL_DIR/edges.txt.simple.correlation.all.conditions.date
# target_tf.txt is produced by make_target_tf.py.
# 
#
# 26 JAN 2017, hui, slcu
# Last modified 5 APR 2017, hui, slcu

import sys, os, operator, itertools
from datetime import datetime
from geneid2name import make_gene_name_AGI_map_dict, get_gene_name
from param4net import make_global_param_dict
from configure import EDGE_POOL_DIR

TARGET_FILE   = '../Data/temp/all_targets.txt'
TF_FILE       = '../Data/temp/all_tfs.txt'
RESULT_FILE   = '../Data/temp/corr_all.txt'
R_SCRIPT_FILE = '../Data/temp/compute_simple_correlation.r'

HISTORY_DIR       = EDGE_POOL_DIR   # edges.txt.* files are here


def get_value(s, delimit):
    lst = s.split(delimit)
    return lst[1].strip()


def get_gene_list(fname):
    result = []
    f = open(fname)
    for line in f:
        line = line.strip()
        lst = line.split()
        result.append(lst[0])
    f.close()
    return result


def make_tf_dict(fname):
    d = {}
    f = open(fname)
    for line in f:
        line = line.strip()
        lst = line.split('\t')
        target = lst[0]
        tf     = lst[1]
        cond   = lst[2].split()
        if not target in d:
            d[target] = {tf:cond}
        else:
            d[target][tf] = cond
    f.close()
    return d


def get_targets_and_tfs(fname):
    f = open(fname)
    target_lst = []
    tf_lst = []
    for line in f:
        line = line.strip()
        lst = line.split('\t')
        target = lst[0]
        tf     = lst[1]
        target_lst.append(target)
        tf_lst.append(tf)
    f.close()
    return sorted(list(set(target_lst))), sorted(list(set(tf_lst)))


def write_lst_to_file(lst, fname):
    f = open(fname, 'w')
    for x in lst:
        f.write(x + '\n')
    f.close()


def make_r_script(fname, result_file, data_file, target_file, tf_file, r_tau=0.75):

    head = 'OUTPUT_FILE   <- \'%s\'\n  DATA_FILE     <- \'%s\'\n    TARGET_FILE   <- \'%s\'\n   TF_FILE       <- \'%s\'\n tau           <- %0.2f\n' % (result_file, data_file, target_file, tf_file, r_tau)

    body = '''
       targets       <- read.table(TARGET_FILE, header=FALSE)
       tfs           <- read.table(TF_FILE, header=FALSE)
       X             <- read.table(DATA_FILE, header=TRUE, check.names=FALSE)
       targets       <- as.vector(targets$V1)
       tfs           <- as.vector(tfs$V1)
       all_genes     <- rownames(X)
       
       X           <- as.matrix(X)
       sd.1        <- apply(X, 1, sd) # sd of each row
       s0          <- apply(X, 1, function(c) sum(c==0)) # number of zeros in each row
       sd.tau      <- (quantile(sd.1,na.rm=TRUE)[1] +  quantile(sd.1,na.rm=TRUE)[2]) / 2.0 # min SD
       good        <- sd.1 > max(sd.tau, 0.05)
       tf_good     <- which( good & (all_genes %in% tfs) == T )
       target_good <- which( good & (all_genes %in% targets) == T )
       
       # compute correlation coefficient
       X <- log(X + 1)
       X[X<0.01] <- NA
       if (length(tf_good) < 2) {
           c <- cor(t(X[target_good,]), t(X[c(tf_good, tf_good),]), use='pairwise.complete.obs')
       } else {
           c <- cor(t(X[target_good,]), t(X[tf_good,]), use='pairwise.complete.obs')
       }
       index <- !is.na(c) & abs(c) >= tau &  abs(c) <= 0.99
       row_names <- rownames(c)
       col_names <- colnames(c)
       result <- data.frame(row = row_names[row(c)[index]], col = col_names[col(c)[index]], r = c[index])
       
       # write results
       write.table(result, OUTPUT_FILE, col.names=F, row.names=F, sep='\\t', quote=F)
    '''

    f = open(fname, 'w')
    content = head + body
    lst = [x.strip() for x in content.split('\n')]
    f.write('\n'.join(lst))
    f.close()


def edit_headline(fname):
    ''' Remove gene_id from first line.  For easier R matrix reading. '''
    new_fname = fname + '.copy'
    f = open(fname)
    lines = f.readlines()
    f.close()
    f = open(new_fname, 'w')
    head = lines[0].strip()
    head_lst = head.split('\t')[1:]
    num_rnaseq = len(head.split('\t')) - 1
    f.write('\t'.join(head_lst) + '\n')
    for line in lines[1:]:
        f.write(line)
    f.close()
    return new_fname, num_rnaseq


def establish_edges(corr_fname, target_tf_fname, result_fname, agi2name_dict, num_rnaseq, glb_param_dict):
    big_tf_dict = make_tf_dict(target_tf_fname)
    f = open(corr_fname)
    lines = f.readlines()
    f.close()

    result = ''
    for line in lines:
        line = line.strip()
        lst = line.split('\t')
        target = lst[0]
        tf     = lst[1]
        score  = '%4.2f' % (float(lst[2]))
        if target in big_tf_dict and tf in big_tf_dict[target]:
            target_str = target + ' ' + get_gene_name(target, agi2name_dict)
            tf_str     = tf     + ' ' + get_gene_name(tf,     agi2name_dict)
            score_str  = score
            cond_str   = ' '.join(big_tf_dict[target][tf])
            curr_date =  datetime.now().strftime('%Y%m%d')
            method_or_tissue = 'all' if glb_param_dict['EXPRESSION_MATRIX_DESCRIPTION'].strip() == '' else glb_param_dict['EXPRESSION_MATRIX_DESCRIPTION']
            s = '\t'.join([target_str, tf_str, score_str, 'all', str(num_rnaseq), cond_str, '.', curr_date, score_str, method_or_tissue])
            result += s + '\n'

    f = open(result_fname, 'w')
    f.write(result)
    f.close()


def target_tf_file_compare_same(fname1, fname2):
    if not os.path.exists(fname1):
        return False
    if not os.path.exists(fname2):
        return False
    f1 = open(fname1)
    s1 = f1.read()
    f1.close()
    f2 = open(fname2)
    s2 = f2.read()
    f2.close()
    return s1 == s2
    

## main
param_file = sys.argv[1] # a single prameter file
glb_param_dict = make_global_param_dict(param_file)
agi2name_dict = make_gene_name_AGI_map_dict(glb_param_dict['GENE_ID_AND_GENE_NAME'])

target_tf_fname = '../Data/information/target_tf.txt'
if not os.path.exists(target_tf_fname):
    print('create_edges0: file %s does not exist.  Produce this file use make_target_tf.py.' % (target_tf_fname))
    sys.exit()
    
all_targets, all_tfs = get_targets_and_tfs(target_tf_fname)
write_lst_to_file(all_targets, TARGET_FILE)
write_lst_to_file(all_tfs, TF_FILE)

data_file, num_rnaseq = edit_headline(glb_param_dict['EXPRESSION_MATRIX'])

make_r_script(R_SCRIPT_FILE, RESULT_FILE, data_file, TARGET_FILE, TF_FILE, 0.60)

cmd = 'Rscript %s' % (R_SCRIPT_FILE)
os.system(cmd) 

if not os.path.isdir(HISTORY_DIR):
    os.makedirs(HISTORY_DIR)

curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
result_fname = os.path.join(HISTORY_DIR, 'edges.txt.simple.correlation.all.conditions.' + curr_time)
establish_edges(RESULT_FILE, target_tf_fname, result_fname, agi2name_dict, num_rnaseq, glb_param_dict)  # change

cmd = 'rm -f %s %s %s %s %s' % (data_file, TARGET_FILE, TF_FILE, R_SCRIPT_FILE, RESULT_FILE)
os.system(cmd)
print('Done. Check %s.' % (result_fname))