# Usage: python make_parameter_rnaseq.py [id-list.txt] > parameter_for_buildRmatrix.txt
#        Edit QUANT_PATH, set NON_ZERO_RATIO.
#
# Purpose: automatically generate parameter_for_buildRmatrix.txt
#
# Update: 26 Feb 2017, slcu, hui
# Update: 19 Sep 2019, slcu, hui [add  read_ena_data_info_json]

import sys, os, glob, json
import fnmatch, re
from datetime import datetime
from log import write_log_file
from configure import RNA_SEQ_INFO_FILE, UPDATE_NETWORK_LOG_FILE

NON_ZERO_RATIO = 0.2 # omit *_quant.txt files with too many zeros.
QUANT_PATH = ['../Data/R/Mapped/public', '../Data/R/Mapped/inhouse', '../Data/R/Mapped/other']  # places where all _quant.txt reside. _quant.txt in sub-directories will also be used.

def get_quant_file_list(fname):
    f = open(fname)
    result = []
    for line in f:
        line = line.strip()
        lst = line.split()
        result.append(lst[0])
    f.close()
    return result

def extract_id(s, src):
    if src == 'inhouse' or src == 'other':
        dirname = os.path.dirname(s)
        lst = dirname.split('/')
        parent_dir = lst[-1]
        libno = filter(str.isdigit, parent_dir) # extrac library number
        libno = libno.zfill(3)
        sample_no = os.path.basename(s) # extract sample number
        first_match = re.findall('_S\d+_', sample_no)[0]
        sample_no = filter(str.isdigit, first_match).zfill(2)
        return '0000' + libno + sample_no
    if src == 'sra':
        index = s.find('_quant')
        if index > 0:
            return s[:index]
        else:
            return 'NA'
    return 'NA'

def zfill2(s, n):
    return s + 'X' * (n-len(s))
    
def make_id(success_flag, lab_id, myid): # should also work for non-SRA id
    if lab_id == 'SRR' or lab_id == 'ERR' or lab_id == 'DRR':
        result = 'R' + success_flag + lab_id + zfill2(myid, 9)
    else: # inhouse or other
        result = 'R' + success_flag + lab_id + myid
    return result
    
def glob_files_include_path(directory, pattern):
    ''' return all file names (with paths) given directory and pattern '''
    result = []
    for root, dirnames, filenames in os.walk(directory):
        for filename in fnmatch.filter(filenames, pattern):
            result.append(os.path.join(root, filename))
    return result

def glob_files_include_paths(directory_lst, pattern):
    ''' return all file names (with paths) given directory and pattern '''
    result = []
    for directory in directory_lst:
        result.extend(glob_files_include_path(os.path.abspath(directory), pattern))
    return result

def non_zero_ratio(fname):
    non_zero_count = 0
    total_count = 0
    f = open(fname)
    lines = f.readlines()
    f.close()
    for line in lines[1:]:
        line = line.strip()
        lst = line.split()
        if len(lst) < 4: # this should not occur. Report error if occurred.
            return -1
        tpm = lst[3]
        if not tpm == '0' and not 'nan' in tpm:
            non_zero_count += 1
        total_count += 1
    return 1.0 * non_zero_count / total_count

def get_rna_word_count(s):
    ''' If s looks like description of an RNA-seq experiment, return 1 otherwise return 0. '''
    count = 0
    s = s.lower()
    if 'rna-seq' in s or 'rnaseq' in s or 'transcriptome' in s or 'transcript' in s or 'mrna' in s:
        count = 1
    return count

def read_ena_data_info_json(fname):
    d = {}
    with open(fname) as json_data:
        json_dict = json.load(json_data)
        for run_id in json_dict:
           d[run_id] = 1
    return d


### main
if not os.path.exists(RNA_SEQ_INFO_FILE):
    print('make_parameter_rnaseq.py: you must provide %s. See parse_ena_xml.py on how to make it.' % (RNA_SEQ_INFO_FILE))
    sys.exit()
    
rna_data_info_dict = read_ena_data_info_json(RNA_SEQ_INFO_FILE)


print('%%GENE_LIST=../Data/information/gene-list.txt\n%%HOLDON=NO\n') # head

if len(sys.argv) > 1: # only get these samples specified in id-list.txt
    quant_files = get_quant_file_list(sys.argv[1])
else:
    quant_files = glob_files_include_paths(QUANT_PATH, '*_quant.txt')


include_count = 0
total = 0
already_added_dict = {}
for fn in sorted(quant_files):
    total += 1

    nzr = non_zero_ratio(fn)
    if nzr > NON_ZERO_RATIO: # files with too many zeros are ignored
        fn2 = os.path.basename(fn)
        data_name = 'None'
        if 'inhouse' in fn:
            myid = extract_id(fn, 'inhouse')
            myid2 = make_id('0','001', myid) # lab id is 001, phil's lab
            data_name = 'LAB001' + '_LIB' + myid[-5:]
            desc = 'in-house'
            include_me = True
        elif 'public' in fn:
            myid = extract_id(fn2, 'sra')                
            myid2 = make_id('0',myid[0:3], myid[3:])
            data_name = myid
            desc = 'SRA'
            include_me = True if myid in rna_data_info_dict and rna_data_info_dict[myid] >= 0 else False # IMPORTANT
        elif 'other' in fn:
            myid = extract_id(fn, 'other')
            # get lab id
            dirname = os.path.dirname(fn)
            lst = dirname.split('/')
            lab_id = lst[-2]
            myid2 = make_id('0', lab_id, myid) # lab id is 001, phil's lab
            data_name = 'LAB' + lab_id + '_LIB' + myid[-5:]
            desc = 'Other lab'
            include_me = True
        if include_me and not myid2 in already_added_dict:
            print('@%s' % (myid2))
            print('DATA_NAME:%s' % (data_name))
            print('DATA_FORMAT:%s'% ('txt'))
            print('DESCRIPTION:%s' % (desc))
            print('LOCATION:%s' % (fn))
            print('NOTE: non zero ratio is %4.2f' % (nzr))
            print('')
            include_count += 1
            already_added_dict[myid2] = 'yes'
    elif nzr < 0:
        write_log_file('[make_parameter_rnaseq.py] Warning: incomplete line in file %s' % (fn), UPDATE_NETWORK_LOG_FILE)        
    else:
        #print('%s has too many zeroes. ignore.' % (fn))
        pass

print('#Done.  Processed %d files.  Included %d files.' % (total, include_count))