# Usage: python buildRmatrix.py paramter_for_buildRmatrix.txt
#        Watch out NA values in TPM.txt, these genes don't have any gene expression information.
#
# Purpose: make a TPM table, where each row is a gene, and each column is an experiment.  The column name is RNA-seq experiment ID.
#
# 23 Dec 2016, hui, slcu
# Last modified  5 Apr 2017, hui, slcu
# Last modified 25 Oct 2019, hui, zjnu [Comments; add a variable WARN_NA to turn on/off print NA warnings.]
# Last modified 10 Oct 2020, hui, zjnu [note that if there are more than 1000 RNA-seq samples, this script requires at least 7GB memory to run.]

import os, sys, glob
from configure import TPM_FILE
from utils import write_first_column, append_column_fast

WARN_NA = False

####################################
GLB_PARAM_SYMBOL    = '%%'
LCL_PARAM_SYMBOL    = '%'
DATA_SYMBOL         = '@'
####################################

def common_part(s):
    ''' s is expected to have this form: AT1G01020.1, remove .1 '''
    s = s.strip()
    index = s.find('.')
    if index < 0: # not found, -1
        return s
    return s[0:index]


def make_expression_dict(fname, myid):
    '''
    fname -- salmon file 
    myid -- RNA-seq experiment ID

    The retured value is a dictionary which looks like

    {
      'ID': RNA-seq experiment ID
      'isoform': 
         {
           'AT1G12345': [],
           'AT2G12345': [],
           ...
         }
    }

    Each gene ID (e.g., AT1G12345) has a number of isoforms which gives different expression levels.
    '''
    
    ID_COL  = 0 # Salmon's quant.sf file, first column is gene ID
    TPM_COL = 3 # Salmon's quant.sf file, fourth column is TPM

    if not os.path.exists(fname):
        print('ERROR [buildRmatrix.py]: file %s not exists.' % (fname))
        sys.exit()
        
    d = {'ID':myid, 'isoform':{}}

    f = open(fname)
    lines = f.readlines()
    f.close()    
    for line in lines[1:]: # ignore head line, Name Length EffectiveLength TPM NumReads
        line = line.strip()
        lst = line.split()
        gene_id = lst[ID_COL]
        tpm = float(lst[TPM_COL])
        common = common_part(gene_id) # gene id without .1, .2, etc.
        if not common in d['isoform']:
            d['isoform'][common] = [tpm]
        else:
            d['isoform'][common].append(tpm)

    # make the dictionary smaller by using a string instead of a double-precision float number, so it requires less memory.  Cut from 7.44G to 6.5G for 1003 TPM files.
    for g in d['isoform']:
        d['isoform'][g] = '%s' % get_max_expressed_isoform_save_space(g, d)

    return d


def get_max_expressed_isoform_save_space(g, d):
    ''' Evolved from get_max_expressed_isoform(g, d) '''
    if not g in d['isoform']:
        return '-9'
    lst = d['isoform'][g]
    return '%4.2f' % max(lst)


def make_gene_expression_list(gene_lst, d):
    '''
    gene_lst: a list of genes
    dict: each dictionary contains gene expression inforamtion.
    '''
    result = []
    bad_count = 0   # number of NA gene expression levels.  We wish this number to be far smaller than total_count.

    for g in gene_lst:
        v = 'NA'
        if g in d['isoform']:
            v = d['isoform'][g]
        else:
            bad_count += 1
            if WARN_NA:
                print('WARNING [buildRmatrix.py]: %s not in %s.' % (g, d['ID']))
        result.append(v)

    total_count = len(gene_lst)
    if 1.0 * bad_count / total_count > 0.0 and WARN_NA: 
        print('WARNING [buildRmatrix.py]: %s contains NA values! \n%d out of %d gene expression levels (%4.1f percent) are NAs.\n%d gene IDs are in your gene list but not in the results output by Salmon.' % (d['ID'], bad_count, total_count, 100.0* bad_count/total_count, bad_count))
    return result


def get_gene_list(fname):
    f = open(fname)
    lst = []
    for line in f:
        line = line.strip()
        if line != '':
            l = line.split()[0]
            lst.append(l)
    f.close()
    return lst


def get_key_value(s):
    lst = s.split('=')
    k, v = lst[0], lst[1]
    return (k, v)


def get_value(s, delimit):
    index = s.find(delimit)
    if index < 0:
        sys.exit()
    return s[index+1:].strip()


def make_data_dict(fname):
    '''
    fname - parameter_for_buildRmatrix.txt

    Return a dictionary which looks like
    
    {
      'RNASEQ_ID_LIST': [],
      'SRR1':
        {
          'LOCATION': path to the salmon quant file, e.g., /home/lanhui/brain/Data/R/Mapped/public/SRR953400_quant.txt
        }
    }

    '''
    d = {'RNASEQ_ID_LIST':[]} # RNASEQ_ID_LIST is a list of RNA-seq experiment IDs
    f = open(fname)
    lines = f.readlines()
    f.close()
    for line in lines:
        line = line.strip()
        if line == '' or line.startswith('#'):
            continue
        if line.startswith(DATA_SYMBOL):
            s = line[line.rfind(DATA_SYMBOL[-1])+1:]
            s = s.strip()
            if s in d:
                print('Warning [buildRmatrix.py]: ID %s is duplicated.' % (s))
                sys.exit()
            d[s] = {'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
            d['RNASEQ_ID_LIST'].append(s)
        if line.startswith('DESCRIPTION:'):
            d[s]['DESCRIPTION'] = get_value(line, ':')
        elif line.startswith('DATA_FORMAT:'):
            d[s]['DATA_NAME'] = get_value(line, ':')            
        elif line.startswith('DATA_FORMAT:'):
            d[s]['DATA_FORMAT'] = get_value(line, ':')
        elif line.startswith('LOCATION:'):
            d[s]['LOCATION'] = get_value(line, ':')
        elif line.startswith('NOTE:'):
            d[s]['NOTE'] = get_value(line, ':')
        elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
            make_local_parameter(d[s]['PARAM'], line)

    return d


def make_global_param_dict(fname):
    f = open(fname)
    d = {'GENE_LIST':''} # change
    for line in f:
        line = line.strip()
        if line.startswith(GLB_PARAM_SYMBOL):
            s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
            lst = s.split('\t')  # separate items by TAB
            for x in lst:
                if x != '':
                    k, v = get_key_value(x)
                    d[k] = v
    f.close()
    return d



## main
param_file = sys.argv[1]
global_param_dict = make_global_param_dict(param_file)
gene_lst = get_gene_list(global_param_dict['GENE_LIST'])
data_dict = make_data_dict(param_file)
TPM_FILE = os.path.abspath(TPM_FILE)
write_first_column(['gene_id'] + gene_lst, TPM_FILE)
num_rnaseq = len(data_dict['RNASEQ_ID_LIST'])
print('[buildRmatrix.py] Total RNA-seq data %d' % num_rnaseq)
total = 0
for experiment_id in data_dict['RNASEQ_ID_LIST']:
    total += 1
    print('\rNow appending %s (%4.2f%% finished)' % (experiment_id, 100*total/num_rnaseq), end='')
    d = make_expression_dict(data_dict[experiment_id]['LOCATION'], experiment_id)
    gene_expression_lst = make_gene_expression_list(gene_lst, d)
    result = append_column_fast(TPM_FILE, [experiment_id] + gene_expression_lst)