# Usage: python make_upload_chip_parameter.py
#
# Purpose: make a part of parameter_for_buildCmatrix.txt given the uploaded files in UPLOAD_DIR.
#          Each unique uploaded file will be assigned an ID.
#          The assigned ID starts with C0000, followed by 9 digits.
#          The following cases are handled: (i) the same bed file uploaded several times. the latest submission will be used.
#
# TBD: append to PARAMETER_FOR_BUILDCMATRIX
# Created 20 July 2017, slcu, hui

import glob
import os
import shutil
import sys
from datetime import datetime

PARAMETER_FOR_BUILDCMATRIX = '../Data/upload/parameter_for_buildCmatrix.txt'  # [change]
UPLOAD_DIR = '../Data/upload/chipseq'

INCLUDE_STAMP = 'BRAIN_HAS_INCLUDED_ME'

def good_file(fname):
    ''' Return True if the first decisive STATUS header of fname reports success.

    Only comment lines (starting with '#') that contain 'STATUS:' are examined.
    A line mentioning SUCC or UNKNOWN accepts the file; a line mentioning FAIL
    rejects it. A STATUS line with none of these words is skipped, and a file
    without any decisive STATUS line is rejected.
    '''
    with open(fname) as handle:
        for raw in handle:
            text = raw.strip()
            if not (text.startswith('#') and 'STATUS:' in text):
                continue
            # SUCC/UNKNOWN are tested before FAIL, so they win when both appear.
            if 'SUCC' in text or 'UNKNOWN' in text:
                return True
            if 'FAIL' in text:
                return False
    return False

def already_included(fname):
    ''' Return True if fname has already been processed.

    A processed file carries a comment line containing INCLUDE_STAMP among its
    first five lines; later lines are not inspected.
    '''
    with open(fname) as handle:
        head = handle.readlines()[:5]
    stripped_head = (raw.strip() for raw in head)
    return any(text.startswith('#') and INCLUDE_STAMP in text
               for text in stripped_head)
        

def _payload_lines(path):
    ''' Return the stripped, non-empty, non-comment lines of path, in order. '''
    with open(path) as handle:
        stripped = (raw.strip() for raw in handle)
        return [text for text in stripped if text and not text.startswith('#')]


def same_content(f1, f2):
    ''' Test if two files, f1 and f2, have the same content.

    Lines starting with '#' and blank lines are ignored, and leading/trailing
    whitespace on each line is not significant. Returns False when either file
    (or both) does not exist.

    The comparison is done line by line rather than on one concatenated string,
    so files whose lines merely concatenate to the same text (e.g. 'ab','c'
    versus 'a','bc') are correctly reported as different.
    '''
    if not (os.path.exists(f1) and os.path.exists(f2)):
        return False
    return _payload_lines(f1) == _payload_lines(f2)

def repeat(fname, d):
    ''' Look for an entry of d whose file has the same content as fname.

    Each value of d must provide a 'LOCATION' path. Returns the key of the
    first matching entry (in d's iteration order), or '' when nothing matches.
    '''
    clashes = (key for key in d if same_content(fname, d[key]['LOCATION']))
    return next(clashes, '')

def update_dict(d, k, fname):
    ''' Set entry k of d (in place) to the info dictionary built from fname. '''
    info = make_chip_info_dict(fname)
    d[k] = info

# def update_it(upload_dir, upload_dict):
#     id_lst = sorted(upload_dict.keys())
#     if id_lst != []:
#         last_id = id_lst[-1]
#         last_id_number = int(last_id[5:])
#     else:
#         last_id_number = 0
#     for fname in sorted(glob.glob(os.path.join(UPLOAD_DIR, '20*.*'))): # all uploaded BED files start with time stamp 20.....
#         if good_file(fname) and not already_included(fname):
#             #print(upload_dict)
#             k = repeat(fname, upload_dict)
#             if k == '':
#                 k = '%d' % (last_id_number + 1)
#                 k = 'C0000' + k.zfill(9)
#                 upload_dict[k] = make_chip_info_dict(fname) 
#             else:
#                 update_dict(upload_dict, k, fname)
#             mark_it_as_included(fname)


def make_chip_info_dict(fname):
    ''' Return a dictionary given a user submitted file.

    Every header line of the form '#KEY:VALUE' (any number of leading '#' and
    blanks before KEY) is stored as d[KEY] = VALUE, where VALUE is everything
    after the first ':' on the line. Comment lines without a ':' carry no
    key/value pair and are skipped.

    DATA_NAME, LOCATION and NOTE are always set by this function (base name,
    absolute path, and 'update:<YYYYMMDD>' of today) and override any header
    line using those keys.
    '''
    d = {'PROTEIN_ID':'', 'PROTEIN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'narrowPeak', 'DESCRIPTION':'user upload', 'LOCATION':'', 'NOTE':''}
    with open(fname) as f:
        for raw in f:
            line = raw.strip()
            if not line.startswith('#'):
                continue
            # Drop only the LEADING comment markers; a '#' inside the value
            # (e.g. 'replicate #3') must not be mistaken for the start of the key.
            key, sep, value = line.lstrip('# \t').partition(':')
            if not sep:
                continue
            d[key.strip()] = value

    d['DATA_NAME'] = os.path.basename(fname)
    d['LOCATION'] = os.path.abspath(fname)
    d['NOTE'] = 'update:%s' % datetime.now().strftime('%Y%m%d')
    return d

def mark_it_as_included(fname):
    ''' Prepend a head line including INCLUDE_STAMP and the current time.

    The file is rewritten in place: the new first line is
    '# <INCLUDE_STAMP> <YYYY-MM-DD HH:MM>' followed by the previous content.
    '''
    with open(fname) as src:
        previous = src.read()
    with open(fname, 'w') as dst:
        stamp_time = datetime.now().strftime('%Y-%m-%d %H:%M')
        head_line = '# %s %s\n' % (INCLUDE_STAMP, stamp_time)
        dst.write(head_line + previous)

def make_string(d):
    ''' Render d as parameter-file text, one '@<id>' record per key.

    Records are emitted in sorted key order and each one ends with a blank
    line. DATA_FORMAT is always written as narrowPeak, whatever the record
    itself holds. Returns '' for an empty dictionary.
    '''
    record_template = (
        '@%s\n'
        'PROTEIN_ID:%s\n'
        'PROTEIN_NAME:%s\n'
        'DATA_NAME:%s\n'
        'DATA_FORMAT:narrowPeak\n'
        'DESCRIPTION:%s\n'
        'LOCATION:%s\n'
        'NOTE:%s\n\n'
    )
    fields = ('PROTEIN_ID', 'PROTEIN_NAME', 'DATA_NAME', 'DESCRIPTION', 'LOCATION', 'NOTE')
    chunks = []
    for chip_id in sorted(d):
        record = d[chip_id]
        values = [record[name] for name in fields]
        chunks.append(record_template % tuple([chip_id] + values))
    return ''.join(chunks)

def md(fname):
    ''' Return a dictionary containing the parameter information.

    The file is read as a sequence of records: a line '@<id>' opens a record
    and each following non-blank line 'KEY:VALUE' adds d[<id>][KEY] = VALUE,
    where VALUE is everything after the first ':' (a line without ':' is
    stored with the whole line as both key and value). Blank lines are ignored.

    Returns {} when fname does not exist. Field lines that appear before the
    first '@' header belong to no record and are ignored instead of raising.
    '''
    d = {}
    if not os.path.exists(fname):
        return d
    current = None  # record being filled; None until the first '@' header
    with open(fname) as f:
        for raw in f:
            line = raw.strip()
            if line == '':
                continue
            if line.startswith('@'):
                current = {}
                d[line[1:]] = current
            elif current is not None:
                k2 = line.split(':', 1)[0].strip()
                current[k2] = line[(line.find(':')+1):]
    return d

def is_empty(fname):
    ''' Return True if fname exists and holds nothing but whitespace.

    A missing file is reported as not empty (False).
    '''
    if not os.path.exists(fname):
        return False
    with open(fname) as handle:
        text = handle.read()
    return not text.strip()

def get_largest_upload_chip_id(fname):
    ''' Return the largest numeric upload ID found in the parameter file fname.

    Upload records are the lines starting with '@C0000'; the number is parsed
    from everything after the leading '@C'. Returns 0 when the file holds no
    such line, and also when the file does not exist yet (first run), so that
    numbering then starts at 1.

    Raises ValueError if an '@C0000...' line is not purely numeric after '@C'.
    '''
    if not os.path.exists(fname):
        return 0
    largest = 0
    with open(fname) as f:
        for raw in f:
            line = raw.strip()
            if line.startswith('@C0000'):
                largest = max(largest, int(line[2:]))
    return largest
    
def make_upload_dict(param_fname, included_path):
    ''' Build the records for the newly uploaded files found in UPLOAD_DIR.

    param_fname:   parameter file; new IDs continue after the largest upload
                   ID it already contains.
    included_path: directory into which every scanned file is moved.

    Every file matching '20*.*' in UPLOAD_DIR is scanned in ascending name
    order. A file gets a new ID ('C0000' + 9 digits) when it is a good file,
    has not been included before, and has the same content as no file already
    registered during this call. Good files are stamped as included, and every
    scanned file, good or not, is then moved into included_path.

    Returns a dictionary mapping each new ID to its info dictionary. LOCATION
    is the path of the file AFTER the move, so that it refers to an existing
    file and later duplicate checks can still read it.
    '''
    d = {}
    i = get_largest_upload_chip_id(param_fname) + 1 # starting id
    # Ascending name order: with time-stamped names, older uploads come first,
    # so the earliest of several identical uploads is the one that is kept.
    # NOTE(review): the file header says the latest submission should win -- confirm
    # which order is intended before switching to reverse=True.
    for fn in sorted(glob.glob(os.path.join(UPLOAD_DIR, '20*.*'))):
        is_good = good_file(fn)
        k = None
        if is_good and not already_included(fn) and repeat(fn, d) == '':
            k = 'C0000' + ('%d' % (i)).zfill(9)
            d[k] = make_chip_info_dict(fn)
            i += 1
        if is_good:
            mark_it_as_included(fn)

        # Move without going through a shell: upload names may contain spaces
        # or shell metacharacters. Naming the target file explicitly overwrites
        # an older file of the same name, as 'mv' did.
        dest = os.path.join(included_path, os.path.basename(fn))
        try:
            shutil.move(fn, dest)
        except (IOError, OSError, shutil.Error) as err:
            # Best effort, as before: report and keep going with the other files.
            sys.stderr.write('Warning: cannot move %s to %s: %s\n' % (fn, dest, err))
        else:
            if k is not None:
                d[k]['LOCATION'] = os.path.abspath(dest)

    return d
                
def append_to_file(fname, s):
    ''' Append s to fname, surrounded by one newline before and one after.

    The file is created if it does not exist.
    '''
    with open(fname, 'a') as out:
        out.write(''.join(('\n', s, '\n')))

def make_directory(my_dir):
    ''' Create my_dir (with any missing parents) unless the path already exists. '''
    if os.path.exists(my_dir):
        return
    os.makedirs(my_dir)

def make_copy(fname):
    ''' Save a time-stamped backup of fname next to it.

    The backup is named '<fname>.copy.<YYYYMMDD_HHMMSS>' and receives the text
    content of fname. Nothing happens when fname does not exist.
    '''
    if not os.path.exists(fname):
        return
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_name = fname + '.copy.%s' % (stamp)
    with open(fname) as src:
        content = src.read()
    with open(backup_name, 'w') as dst:
        dst.write(content)
        
## main
# Scanned upload files are moved into the 'included' sub-directory of UPLOAD_DIR.
included_path = os.path.join(UPLOAD_DIR, 'included')
make_directory(included_path)

# Collect the records of the new uploads and render them as parameter-file text.
upload_dict = make_upload_dict(PARAMETER_FOR_BUILDCMATRIX, included_path)
s = make_string(upload_dict)

if s:
    # Something new to register: back up PARAMETER_FOR_BUILDCMATRIX before appending to it.
    make_copy(PARAMETER_FOR_BUILDCMATRIX)
    append_to_file(PARAMETER_FOR_BUILDCMATRIX, s)