#! /usr/bin/python3
# Usage: python3 update_network.py
#        Put this script under directory Code/.
#        IMPORTANT: Run this script under directory Code/. 
#        Execute the above command regularly, or
#        Cron job this command to make it run everyday at 5am:
#
#        1.  crontab -e.
#        2.  Add this line: 01 05 * * * cd /home/hui/network/v03/Code && python3 update_network.py
#
# IMPORTANT: Make sure to execute this script (update_network.py) under the directory Code.
# 
# Purpose: periodically (e.g., per week) run this script to see if the network needs update.  If yes, update it.
#
# Set HOLDON=NO in parameter_for_buildCmatrix.txt,
# parameter_for_buildRmatrix.txt and parameter_for_net.txt to make
# changes in these file effective.
#
# parameter_for_buildRmatrix.txt will be updated automatically (I
# hope).  However, we need to update parameter_for_buildCmatrix.txt
# manually.
#
# Revision history:
#
# Last modified: 26 Feb 2017
# Last modified: 17 Mar 2017
# Last modified: 04 Apr 2017
# Last modified: 05 Apr 2017
# Last modified: 10 Apr 2017
# Last modified: 19 Apr 2017
# Last modified: 20 Apr 2017 [added create_edges0B.py which calls correlation_per_tissue.R]
# Last modified: 21 Jun 2017 [added correlation_per_group.R and wedge.R]
# Last modified: 30 Jun 2017 [added get_sample_size so that we have sample size for correlations of type all, added in ll_dict ]
# Last modified: 23 Jan 2018 [edited a few print-out messages]
# Last modified: 25 Jan 2018 [updated function compute_metric(), set S=365.0 and modified return statement]
# Last modified: 24 Aug 2018 [updated function from get_sample_size(d, sorted_keys, day) to get_sample_size(d, sorted_keys, day, rcond_string)]
# Last modified: 03 Feb 2019
# Last modified: 08 Aug 2019, hui
# Last modified: 10 Aug 2019, hui <lanhui@zjnu.edu.cn>
# Last modified: 23 Aug 2019, hui <lanhui@zjnu.edu.cn> [correlation_mixtools(num_component)]
# Last modified: 10 Sep 2019, hui <lanhui@zjnu.edu.cn> [correlation_mixtools, check the previous R session has finished before starting a new one.]

import os, sys
import numpy as np
import glob
import time
import subprocess
from datetime import datetime
from param4net import make_global_param_dict, get_key_value
from configure import HISTORY_DIR, HISTORY_DIR2, FILE_TIMESTAMP, SAMPLE_SIZE_FILE, TEMP_DIR, \
    PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_BUILDRMATRIX, \
    PARAMETER_FOR_NET, PARAMETER_FOR_NET_TRAVADB_STRESS, PARAMETER_FOR_NET_TRAVADB_MAP, PARAMETER_FOR_NET_MILD_DROUGHT, PARAMETER_FOR_NET_WIGGELAB_DIURNAL, \
    BINDING_FILE, TPM_FILE, \
    PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL, MIN_RNA_SEQ_INCREASE, UPDATE_NETWORK_LOG_FILE, NEW_OR_UPDATED_CHIP_FILE, \
    RNA_SEQ_INFO_DATABASE, RNA_SEQ_INFO_DATABASE_JSON, GENE_ID_FIRST_TWO_LETTERS, MEMORY_STRENGTH, \
    MAPPED_RDATA_DIR, MAPPED_CDATA_DIR, \
    EDGE_POOL_DIR, MERGED_EDGE_FILE, \
    TARGET_TF_FILE



## Helper functions

def get_value(s, delimit):
    ''' Return the part of s after the first delimit, stripped of surrounding whitespace.

        s       - a string such as 'LOCATION: /path/to/file'
        delimit - the separator; only its first occurrence is used, so the
                  value itself may contain the separator.

        Raises IndexError if delimit does not occur in s.
    '''
    return s.split(delimit, 1)[1].strip()


def validate_webapp_dir(para_for_net):
    ''' Create ../Webapp/static/json/genes.json if it is missing.

        para_for_net - path of the network parameter file; its
                       GENE_ID_AND_GENE_NAME entry is passed to text2json.py.

        The paths are relative, so this function must be executed under the directory Code.
    '''
    glb_param_dict = make_global_param_dict(para_for_net)
    if os.path.exists('../Webapp/static/json/genes.json'):
        return  # nothing to do, genes.json is already there

    print('[update_network.py]: cannot find genes.json, make one ...')
    gene_table = glb_param_dict['GENE_ID_AND_GENE_NAME']
    os.system('python3 text2json.py %s > ../Webapp/static/json/genes.json' % (gene_table))

    
def make_paths(s):
    ''' Create directory s (and any missing parent directories).

        Does nothing if s is already a directory.  exist_ok=True is used
        instead of a separate isdir() check so that a directory created by
        another process between the check and the creation does not make
        this call fail.

        Raises FileExistsError if s exists but is not a directory.
    '''
    os.makedirs(s, exist_ok=True)


def make_important_dirs():
    ''' Create the directory layout (under ../Data and ../Webapp) if it does not exist yet.

        All paths are relative to the current working directory.
    '''
    required_dirs = (
        '../Data/history/edges/many_targets',
        '../Data/history/edges/one_target',
        '../Data/log',
        '../Data/information',
        '../Data/temp',
        '../Data/upload',
        '../Data/parameter',
        '../Data/R/Mapped',
        '../Data/R/Mapped/public',
        '../Data/R/Mapped/inhouse',
        '../Data/R/Mapped/other',
        '../Data/R/Raw',
        '../Data/C/Mapped',
        '../Data/C/Raw',
        '../Data/history/edges',
        '../Data/history/edge_pool',
        '../Data/history/bind',
        '../Data/history/expr',
        '../Webapp/static/json',
        '../Webapp/static/edges',
        '../Webapp/templates',
    )
    for directory in required_dirs:
        make_paths(directory)

    
def num_line(fname):
    ''' Return number of lines in file fname, or 0 if fname does not exist. '''
    if not os.path.exists(fname):
        return 0
    # Count while streaming, so a large file is never held in memory, and
    # let the with-statement close the file even if reading fails.
    with open(fname) as f:
        return sum(1 for _ in f)


def num_ids(fname):
    ''' Return number of IDs in fname.

        The IDs are the tab-separated fields of the first line, not counting
        the first field.  Only the first line is read, so the cost does not
        depend on the size of the file.  An empty file has 0 IDs.
    '''
    with open(fname) as f:
        header = f.readline()
    if not header:
        return 0
    return len(header.split('\t')) - 1


def write_log_file(s, fname):
    ''' Append message s, prefixed with the current time, to log file fname and echo it to stdout.

        A newline is appended unless the message already contains one.
    '''
    curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
    s = '[' + curr_time + ']: ' + s
    if '\n' not in s:
        s += '\n'
    # The with-statement closes the log file even if the write fails.
    with open(fname, 'a') as f:
        f.write(s)
    print('Log: %s' % (s.strip()))


def write_sample_size_file(sample_size_file, curr_date, tpm_sample_size):
    ''' Append one 'date<TAB>sample size' record to sample_size_file.

        Mode 'a' creates the file when it does not exist, so no separate
        existence check is needed.
    '''
    with open(sample_size_file, 'a') as f:
        f.write('%s\t%s\n' % (curr_date, tpm_sample_size))


def age_of_file_in_days(fname):
    ''' Return the time elapsed since fname was last modified, in days (a float). '''
    seconds_per_day = 3600*24.0
    elapsed_seconds = time.time() - os.stat(fname).st_mtime
    return elapsed_seconds/seconds_per_day


def age_of_file_in_seconds(fname):
    ''' Return the time elapsed since fname was last modified, in seconds (a float). '''
    last_modified = os.stat(fname).st_mtime
    return time.time() - last_modified


def hold_on(fname):
    ''' Return True if one of the first 100 lines of fname starts with '%%HOLDON=YES'.

        Leading whitespace on a line is ignored.  Reading stops after 100
        lines, so the rest of the file is never loaded.
    '''
    with open(fname) as f:
        for line_index, line in enumerate(f):
            if line_index >= 100: # check the first 100 lines for HOLDON
                break
            if line.strip().startswith('%%HOLDON=YES'):
                return True
    return False
    

def all_files_present(lst):
    ''' Return the paths in lst that do not exist (an empty list means all files are present).

        lst - a list of file names to check

        A warning is written to the log for each missing path that contains 'edges.txt'.
    '''
    absent = []
    for candidate in lst:
        if os.path.exists(candidate):
            continue
        if 'edges.txt' in candidate:
            write_log_file('[update_network.py] WARNING: must have %s to update network.  Call create_edges*.py to create edge files.' % (candidate), UPDATE_NETWORK_LOG_FILE)
        absent.append(candidate)
    return absent


def record_file_time(lst, fname):
    '''
        Write the last-modification time of each file in lst to fname, overwriting fname.

        lst - a list of files
        fname - a recorder file

        Each output line is 'basename<TAB>mtime', where mtime is in whole
        seconds since the epoch; a file that does not exist is recorded with 0.
    '''
    with open(fname, 'w') as f:
        rows = []
        for x in lst:
            mtime = int(os.stat(x).st_mtime) if os.path.exists(x) else 0
            rows.append('%s\t%d\n' % (os.path.basename(x), mtime))
        f.write(''.join(rows))
        

def read_file_timestamp(ftimestamp):
    ''' Read a timestamp file and return a dictionary mapping file name to recorded time (int).

        ftimestamp - a file whose lines hold a file name and an integer
                     time separated by whitespace.

        Blank lines are skipped.
    '''
    d = {}
    with open(ftimestamp) as f:
        for line in f:
            lst = line.split()
            if not lst: # blank line, e.g. a trailing empty line left by an editor
                continue
            d[lst[0]] = int(lst[1])
    return d


def file_updated(fname, d):
    ''' Return True if fname was modified after the time recorded for it in d.

        fname - path of the file to check
        d     - a dictionary mapping file base name to a recorded modification time (int)

        Raises KeyError if the base name of fname is not in d.
    '''
    current_mtime = int(os.stat(fname).st_mtime)
    return current_mtime > d[os.path.basename(fname)]


def get_updated_files(lst, d):
    ''' Return the base names of the files in lst that are newer than their time recorded in d.

        lst - a list of file paths
        d   - a dictionary mapping file base name to a recorded modification time
    '''
    return [os.path.basename(path) for path in lst if file_updated(path, d)]


def get_sample_size(d, sorted_keys, day, rcond_string):
    ''' Return a sample size.

        d            - a dictionary mapping a key (compared against day) to a sample size
        sorted_keys  - the keys of d in ascending order
        day          - the key to look up
        rcond_string - if it consists of digits only, it is the sample size itself

        If rcond_string is not a number, return the value of the first key
        in sorted_keys that is >= day, or the value of the last key when
        no such key exists.  If d is empty, return the default 1200.
    '''
    if rcond_string.isdigit():
        return int(rcond_string)

    if not d:
        return 1200 # a default number of sample size, CHANGE

    chosen = next((k for k in sorted_keys if k >= day), None)
    if chosen is None:
        chosen = sorted_keys[-1] # last key, latest date
    return d[chosen]


def number_rnaseq_id(tpm_file):
    ''' Return the number of whitespace-separated fields in the first line of tpm_file, minus one.

        Only the first line is read, so the cost does not depend on the
        size of the table.  An empty file gives 0.
    '''
    with open(tpm_file) as f:
        first_line = f.readline().strip()
    if not first_line:
        return 0
    return len(first_line.split()) - 1

    
def number_rnaseq_diff(para_file, tpm_file):
    ''' Count the lines starting with @ in para_file, count the columns in tpm_file (via number_rnaseq_id), and return their difference. '''
    # The with-statement closes para_file even if reading fails.
    with open(para_file) as f:
        a = sum(1 for line in f if line.strip().startswith('@'))

    b = number_rnaseq_id(tpm_file)

    return a - b


def validate_gene_file(fname):
    ''' Check that every line of fname has at least 6 tab-separated fields.

        On the first line with fewer fields, print a message and
        terminate the program with sys.exit().
    '''
    # Stream the file; the with-statement closes it even when we exit early.
    with open(fname) as f:
        for line in f: # check all lines
            line = line.strip()
            lst = line.split('\t')
            if len(lst) < 6:
                print('[update_network.py]:Not enought fields: %s.  Only %d are given. Each line must have gene_id, gene_name, chr, start, end, strand, description (optional).  See prepare_gene_file.py in the documentation on how to prepare this file.' % (line, len(lst)))
                sys.exit()

            
def validate_parameter_for_buildcmatrix(fname):
    ''' Rudimentary check of the parameter file for buildCmatrix.

        fname - path of the parameter file

        The program is terminated with sys.exit() if
          - fname does not exist,
          - the file given by GENE_FILE or CHR_INFO does not exist, or the
            gene file is rejected by validate_gene_file,
          - DESTINATION is not a directory,
          - TARGET_RANGE is not an integer greater than 0,
          - GENE_FILE, DESTINATION or CHR_INFO is not specified,
          - there is no LOCATION: line.
        A LOCATION that does not exist only produces a warning.
    '''
    # first the file must exist
    if not os.path.exists(fname):
        print('[update_network.py]:CANNOT FIND %s.' % (fname))
        sys.exit()
    with open(fname) as f:
        lines = f.readlines()
    d = {}
    location_count = 0
    for line in lines:
        line = line.strip()
        if line.startswith('%%'):
            k, v = get_key_value(line[2:])
            d[k] = v
            if k == 'GENE_FILE' or k == 'CHR_INFO':
                if not os.path.exists(v):
                    print('[update_network.py]:%s not exists.' % (v))
                    sys.exit()
                if k == 'GENE_FILE':
                    validate_gene_file(v)
            if k == 'DESTINATION':
                if not os.path.isdir(v):
                    print('[update_network.py]:%s not exists.' % (v))
                    sys.exit()
            if k == 'TARGET_RANGE':
                # v comes from the parameter file; convert it before use, so
                # that a bad value gives a message instead of a traceback.
                try:
                    target_range = int(v)
                except (TypeError, ValueError):
                    print('[update_network.py]:Target range (%s) must be an integer greater than 0.' % (v))
                    sys.exit()
                if target_range <= 0:
                    print('[update_network.py]:Target range (%d) must be greater than 0.' % (target_range))
                    sys.exit()
        if line.startswith('LOCATION:'):
            v = get_value(line, ':')
            location_count += 1
            if not os.path.exists(v):
                print('[Warning] update_network.py: Location %s does not exists.' % (v))
                #sys.exit()

    for required_key in ('GENE_FILE', 'DESTINATION', 'CHR_INFO'):
        if required_key not in d:
            print('[update_network.py]:Must specify %s.' % (required_key))
            sys.exit()
    if location_count == 0:
        print('[update_network.py]:Must contain at least one ChIP-seq.')
        sys.exit()
        

def validate_parameter_for_buildrmatrix(fname):
    ''' Rudimentary check of the parameter file for buildRmatrix.

        fname - path of the parameter file

        The program is terminated with sys.exit() if fname does not exist,
        if the file given by GENE_LIST does not exist, if GENE_LIST is not
        specified, or if there is no LOCATION: line.  A LOCATION that does
        not exist only produces a message.
    '''
    # first the file must exist
    if not os.path.exists(fname):
        print('[update_network.py]:CANNOT FIND %s.' % (fname))
        sys.exit()
    # The with-statement closes the file even if reading fails.
    with open(fname) as f:
        lines = f.readlines()
    d = {}
    location_count = 0
    for line in lines:
        line = line.strip()
        if line.startswith('%%'):
            k, v = get_key_value(line[2:])
            d[k] = v
            if k == 'GENE_LIST' and not os.path.exists(v):
                print('[update_network.py]:%s not exists.' % (v))
                sys.exit()
        elif line.startswith('LOCATION:'):
            v = get_value(line, ':')
            location_count += 1
            if not os.path.exists(v):
                print('[update_network.py]:Location %s does not exists.' % (v))
                #sys.exit()

    if 'GENE_LIST' not in d:
        print('[update_network.py]:Must specify GENE_LIST.')
        sys.exit()
    if location_count == 0:
        print('[update_network.py]:Must contain at least one RNA-seq.')
        sys.exit()


def validate_parameter_for_net(fname):
    ''' Rudimentary check of the parameter file for building the network.

        fname - path of the parameter file

        The program is terminated with sys.exit() if fname does not exist,
        or if one of GENE_LIST, GENE_ID_AND_GENE_NAME, BINDING_INFO and
        EXPRESSION_INFO is not specified or names a file that does not
        exist.  For BINDING_MATRIX and EXPRESSION_MATRIX a missing entry
        or a missing file only produces a message telling how to create
        the matrix.
    '''
    # first the file must exist
    if not os.path.exists(fname):
        print('[update_network.py]:CANNOT FIND %s.' % (fname))
        sys.exit()

    # Keys whose file must exist; otherwise we cannot proceed.
    mandatory_keys = ('GENE_LIST', 'GENE_ID_AND_GENE_NAME', 'BINDING_INFO', 'EXPRESSION_INFO')
    # Keys whose file can be created later; map key to the hint to print.
    rebuild_hints = {
        'BINDING_MATRIX': '[update_network.py]:Use python3 buildCmatrix.py paramter_for_buildCmatrix.txt > binding.txt to create binding.txt.',
        'EXPRESSION_MATRIX': '[update_network.py]:Use python3 buildRmatrix.py paramter_for_buildRmatrix.txt to create TPM.txt.',
    }

    with open(fname) as f:
        lines = f.readlines()
    d = {}
    for line in lines:
        line = line.strip()
        if not line.startswith('%%'):
            continue
        k, v = get_key_value(line[2:])
        d[k] = v
        if k in mandatory_keys and not os.path.exists(v):
            print('[update_network.py]:%s not exists.' % (v))
            sys.exit()
        if k in rebuild_hints and not os.path.exists(v):
            print('[update_network.py]:%s not exists.' % (v))
            print(rebuild_hints[k])

    # Name the key that is actually missing in each message.
    for k in mandatory_keys:
        if k not in d:
            print('[update_network.py]:Must specify %s.' % (k))
            sys.exit()
    for k in ('BINDING_MATRIX', 'EXPRESSION_MATRIX'):
        if k not in d:
            print('[update_network.py]:Must specify %s.' % (k))
            print(rebuild_hints[k])
        


def need_update_parameter_file(param_file, dirs):
    ''' Make sure param_file is consistent with dirs (a list of directories to check against).

        Return a list of messages, one for each file under dirs that
          - has 'narrowPeak' or '_quant' in its absolute path,
          - is not listed in a LOCATION: line of param_file, and
          - was modified after param_file.
        An empty list means param_file does not need to be updated.
    '''
    result = []

    files_in_parameter = set()
    with open(param_file) as f:
        for line in f:
            line = line.strip()
            if line.startswith('LOCATION:'):
                # Split at the first ':' only (the path may contain ':') and
                # strip the value; a leading space would make abspath return a wrong path.
                location = line.split(':', 1)[1].strip()
                files_in_parameter.add(os.path.abspath(location))
    param_modification_time = os.path.getmtime(param_file)

    for directory in dirs:
        for root, dirnames, filenames in os.walk(os.path.abspath(directory)):
            for filename in filenames:
                k = os.path.join(root, filename)
                if 'narrowPeak' in k or '_quant' in k:
                    if k not in files_in_parameter and os.path.getmtime(k) > param_modification_time:
                        result.append('%s is not in %s' % (k, param_file))

    return result



def validate_binding_file(fname):
    ''' Return False if fname contains the message 'buildCmatrix: ChIP-seq ID list is empty.', True otherwise. '''
    # The with-statement closes the file on every path, including the early return.
    with open(fname) as f:
        for line in f:
            if 'buildCmatrix: ChIP-seq ID list is empty.' in line.strip():
                return False
    return True


def lines_with_10_fields(s):
    ''' Return the lines of s that have exactly 10 tab-separated fields.

        Each line is stripped of leading and trailing whitespace (tabs
        included) before its fields are counted; the stripped line is returned.
    '''
    stripped_lines = (raw.strip() for raw in s.split('\n'))
    # 10 fields means exactly 9 tab separators
    return [row for row in stripped_lines if row.count('\t') == 9]


def concatenate_edge_files(fname_lst, fname_out):
    ''' Write the 10-field lines of all files in fname_lst to fname_out (overwriting it).

        fname_lst - a list of edge files to read
        fname_out - the output file

        Lines without exactly 10 tab-separated fields are dropped.  What
        was found in each input file is written to the log.
    '''
    # The with-statements close the files even if reading or writing fails.
    with open(fname_out, 'w') as fout:
        for fname in fname_lst:
            with open(fname) as f:
                s = f.read()
            # Make sure each edge has 10 fields before writing.
            lines = lines_with_10_fields(s)
            if lines:
                write_log_file('[update_network.py] In function concatenate_edge_files. File %s has %d rows with 10 columns.' % (fname, len(lines)), UPDATE_NETWORK_LOG_FILE)
                fout.write('\n'.join(lines) + '\n')
            else:
                write_log_file('[update_network.py] In function concatenate_edge_files. Check file %s.  It has no rows with 10 fields.' % (fname), UPDATE_NETWORK_LOG_FILE)
        

def delete_edge_files(fname_lst, min_age_seconds=12*60*60):
    ''' Delete the files in fname_lst that have not been modified recently.

        fname_lst       - a list of files to delete
        min_age_seconds - a file is deleted only if it was last modified
                          more than this many seconds ago (default: 12 hours)

        A file younger than that may still be being written, so it is
        kept and a message is written to the log instead.
    '''
    for fname in fname_lst:
        # Before we delete, we should make sure it is not being written. Make sure it is old enough. Otherwise, don't delete.
        if age_of_file_in_seconds(fname) > min_age_seconds:
            os.remove(fname)
        else:
            write_log_file('[update_network.py] In function delete_edge_files. Check file %s.  It is probably still being written.  So I don\'t delete it.' % (fname), UPDATE_NETWORK_LOG_FILE)


def create_edges0():
    ''' Run create_edges0.py with PARAMETER_FOR_NET if that parameter file exists.

        For the optional parameter files (TRAVADB_STRESS, TRAVADB_MAP,
        MILD_DROUGHT, WIGGELAB_DIURNAL) the command is only prepared;
        running it is currently turned off.
    '''
    command_template = 'python3 create_edges0.py %s'

    if os.path.exists(PARAMETER_FOR_NET):
        write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET), UPDATE_NETWORK_LOG_FILE)
        os.system(command_template % (PARAMETER_FOR_NET))

    # The following commands are optional. For example, a user who runs it locally does not have to provide these TPM tables.
    optional_parameter_files = (
        PARAMETER_FOR_NET_TRAVADB_STRESS,
        PARAMETER_FOR_NET_TRAVADB_MAP,
        PARAMETER_FOR_NET_MILD_DROUGHT,
        PARAMETER_FOR_NET_WIGGELAB_DIURNAL,
    )
    for optional_parameter_file in optional_parameter_files:
        if os.path.exists(optional_parameter_file):
            #write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (optional_parameter_file), UPDATE_NETWORK_LOG_FILE)
            cmd = command_template % (optional_parameter_file)
            #os.system(cmd)


def create_edges0B():
    ''' Run create_edges0B.py with PARAMETER_FOR_NET if that parameter file exists.

        The number of IDs in BINDING_FILE is written to the log first.
    '''
    if not os.path.exists(PARAMETER_FOR_NET):
        return
    binding_size = num_ids(BINDING_FILE)
    write_log_file('[update_network.py] Create tissue-specific edges.txt using new binding.txt (size=%d). create_edges0B.py' % (binding_size), UPDATE_NETWORK_LOG_FILE)
    os.system('python3 create_edges0B.py %s' % (PARAMETER_FOR_NET)) # call correlation_per_tissue.R


def wedge():
    ''' Run wedge.R (via Rscript) if PARAMETER_FOR_NET exists. '''
    if not os.path.exists(PARAMETER_FOR_NET):
        return
    write_log_file('[update_network.py] Create edges using wedge shapes. wedge.R', UPDATE_NETWORK_LOG_FILE)
    os.system('Rscript wedge.R')


def correlation_per_group():
    ''' Run correlation_per_group.R (via Rscript) if PARAMETER_FOR_NET exists.

        The number of RNA-seq IDs in TPM_FILE is written to the log first.
    '''
    # For 3,130 RNA-seq samples and 30,000 pairs, need at least 10 hours.
    if not os.path.exists(PARAMETER_FOR_NET):
        return
    tpm_size = number_rnaseq_id(TPM_FILE)
    write_log_file('[update_network.py] Create group-specific edges.txt using new TPM.txt (size=%d). correlation_per_group.R' % (tpm_size), UPDATE_NETWORK_LOG_FILE)
    os.system('Rscript correlation_per_group.R')


def correlation_per_group_fixed_number():
    ''' Run correlation_per_group_fixed_number.R (via Rscript) if PARAMETER_FOR_NET exists.

        The number of RNA-seq IDs in TPM_FILE is written to the log first.
    '''
    if not os.path.exists(PARAMETER_FOR_NET):
        return
    tpm_size = number_rnaseq_id(TPM_FILE)
    write_log_file('[update_network.py] Create group-specific (fixed) edges.txt using new TPM.txt (size=%d). correlation_per_group_fixed_number.R' % (tpm_size), UPDATE_NETWORK_LOG_FILE)
    os.system('Rscript correlation_per_group_fixed_number.R')


def correlation_mixtools(num_component):
    ''' Run create_edges_mixtool.R with num_component components, unless an R process is already running.

        num_component - an integer, passed to the R script as its command-line argument
    '''
    # It takes a long time (several days) to run create_edges_mixtool.R, so we
    # make sure the previous R computing has finished before we start a new
    # one.  os.system returns 0 if R is running.
    r_is_running = (os.system('pidof R') == 0)
    if r_is_running:
        return
    tpm_size = number_rnaseq_id(TPM_FILE)
    write_log_file('[update_network.py] Create edges.txt using TPM.txt (size=%d).  create_edges_mixtool.R with %d components.' % (tpm_size, num_component), UPDATE_NETWORK_LOG_FILE)
    os.system('Rscript create_edges_mixtool.R %d' % (num_component))


def check_rnaseq_info():
    ''' Remind us, in the log file, to create or update the RNA-seq information files.

        For RNA_SEQ_INFO_DATABASE and RNA_SEQ_INFO_DATABASE_JSON (in that
        order), a message is written to the log if the file is missing or
        was last modified more than 90 days ago.
    '''
    max_age_in_days = 90
    for info_file in (RNA_SEQ_INFO_DATABASE, RNA_SEQ_INFO_DATABASE_JSON):
        if not os.path.exists(info_file):
            write_log_file('[update_network.py] [MISSING] Must create %s.' % (info_file), UPDATE_NETWORK_LOG_FILE)
            continue
        age = age_of_file_in_days(info_file)
        if age > max_age_in_days:
            write_log_file('[update_network.py] Need update %s. It is %d days old.' % (info_file, age), UPDATE_NETWORK_LOG_FILE)


# def check_process(name):
#     ''' If a process name exists, return 1; otherwise return 0.'''
#     os.system('ps -eF | grep \'%s\' > ../Data/running_processes.txt' % (name))
#     f = open('../Data/running_processes.txt')
#     lines = f.readlines()
#     f.close()    
#     for line in lines:
#         line = line.strip()
#         lst = line.split()
#         if 'python' in lst[-2] and name in lst[-1]:
#             return 1
#     return 0



## main

# if check_process('update_network.py') == 1: # the old update_network.py is running
#     write_log_file('[update_network.py] update_network.py has not finished yet.', UPDATE_NETWORK_LOG_FILE)    
#     sys.exit()



# The files this script depends on: three parameter files, the merged
# edge file, the binding matrix and the TPM table.  Their presence and
# modification times are checked below.
FILE_LIST_TO_CHECK = [PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_BUILDRMATRIX, PARAMETER_FOR_NET, \
                      MERGED_EDGE_FILE, BINDING_FILE, TPM_FILE] # a list of very important files

make_important_dirs() # make important directories (if non-existent) for holding various kinds of files; the paths are relative, so the working directory must be Code
#validate_webapp_dir(PARAMETER_FOR_NET) # make sure the directory Webapp contains necessary files, e.g., genes.json.

check_rnaseq_info() # rnaseq information is useful for displaying scatterplots

# Make sure all necessary files are present, if not, make them if possible.
# The three parameter files must be provided by the user; the script exits
# if any of them is missing.  If binding.txt or TPM.txt is missing, the
# script prints what to do (and builds TPM.txt), then exits so that the
# user can check the parameter file and rerun.  If only the merged edge
# file is missing, it is created with create_edges0().
miss_lst = all_files_present(FILE_LIST_TO_CHECK) # check if any of them are missing
if miss_lst != []: # miss_lst is non-empty in the beginning.
    print('These mandatory files are missing: %s.\nPrepare them first.' % (' '.join(miss_lst)))
    write_log_file('[update_network.py] Cannot find these required files:%s' % (' '.join(miss_lst)), UPDATE_NETWORK_LOG_FILE)

    # initially, we (at most) only have three parameter files, no binding.txt, TPM.txt or edges.txt ...
    important_miss_number = 0
    if PARAMETER_FOR_BUILDCMATRIX in miss_lst:
        print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_BUILDCMATRIX))
        important_miss_number += 1

    if PARAMETER_FOR_BUILDRMATRIX in miss_lst:
        print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_BUILDRMATRIX))
        important_miss_number += 1

    if PARAMETER_FOR_NET in miss_lst:
        print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_NET))
        important_miss_number += 1

    if important_miss_number > 0:
        sys.exit() # need to provide all the above three files; otherwise cannot proceed

    if BINDING_FILE in miss_lst:
        print('[update_network.py]: make initial binding.txt ... wait')
        write_log_file('[update_network.py] Make initial binding.txt', UPDATE_NETWORK_LOG_FILE)
        # NOTE: both commands below are prepared but currently not executed.
        cmd = 'python3 get_binding.py %s' % (PARAMETER_FOR_BUILDCMATRIX)
        #os.system(cmd)
        cmd = 'python3 buildCmatrix.py %s > %s' % (PARAMETER_FOR_BUILDCMATRIX, BINDING_FILE)
        #os.system(cmd)
        print('[update_network.py]: IMPORATNT: make sure BINDING_MATRIX in %s was set %s and rerun update_network.py.' % (PARAMETER_FOR_NET, BINDING_FILE))
        sys.exit()

    if TPM_FILE in miss_lst:
        print('[update_network.py]: make initial TPM.txt ... wait')
        write_log_file('[update_network.py] Make initial TPM.txt', UPDATE_NETWORK_LOG_FILE)
        cmd = 'python3 buildRmatrix.py %s' % (PARAMETER_FOR_BUILDRMATRIX) # produce TPM.txt
        os.system(cmd)
        print('[update_network.py]:IMPORTANT: make sure EXPRESSION_MATRIX in %s was set %s and rerun update_network.py.' % (PARAMETER_FOR_NET, TPM_FILE))
        sys.exit()

    miss_lst2 = all_files_present(FILE_LIST_TO_CHECK) # check files again
    if len(miss_lst2) == 1 and miss_lst2[0] == MERGED_EDGE_FILE: # all other files are ready except edges.txt, make one.
        print('[update_network.py]: make initial edges.txt ... wait')
        create_edges0() # was misspelled create_edgeds0(), which raised NameError


# Make json2 (sliced binding.txt) if it does not exist.  json2 must then
# be copied manually to static/edges in the web application folder,
# where it is used for displaying binding strength plots.
# This runs only when BINDING_FILE exists and the directory
# ../Data/history/bind/json2 does not.
if not os.path.isdir('../Data/history/bind/json2') and os.path.exists(BINDING_FILE):
    write_log_file('Make directory ../Data/history/bind/json2.  Don\'t forget to copy json2 to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE)
    cmd = 'python3 slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET)
    os.system(cmd)


# Make json (sliced TPM.txt) if it does not exist.  json must then be
# copied manually to static/edges in the web application folder, where
# it is used for displaying gene expression scatterplots.
# This runs only when TPM_FILE exists and the directory
# ../Data/history/expr/json does not.
if not os.path.isdir('../Data/history/expr/json') and os.path.exists(TPM_FILE):
    write_log_file('Make directory ../Data/history/expr/json.  Don\'t forget to copy json to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE)
    cmd = 'python3 slice_TPM_to_JSON.py %s' % (PARAMETER_FOR_NET)
    os.system(cmd)


# Make sure parameter files are present and valid (rudimentary check but important).
# Each validator terminates the script with sys.exit() on a fatal problem.
validate_parameter_for_buildcmatrix(PARAMETER_FOR_BUILDCMATRIX)
validate_parameter_for_buildrmatrix(PARAMETER_FOR_BUILDRMATRIX)
validate_parameter_for_net(PARAMETER_FOR_NET)


# If the file timestamp does not exist, create one from the current
# modification times of the mandatory files (a missing file is recorded as 0).
if not os.path.exists(FILE_TIMESTAMP): 
    record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)

# Get the recorded update time of the mandatory files: a dictionary
# mapping file base name to modification time.  The sections below
# compare it with the current modification times.
timestamp_dict = read_file_timestamp(FILE_TIMESTAMP)



################## binding.txt stuff #####################################
# Check parameter_for_buildCmatrix.txt.  If it is newer than its
# recorded timestamp and HOLDON is not YES in it, run get_binding.py
# to make the binding column files.
updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
if 'parameter_for_buildCmatrix.txt' in updated_file_list and not hold_on(PARAMETER_FOR_BUILDCMATRIX):
    write_log_file('[update_network.py] Parameter file %s has been updated.' % (PARAMETER_FOR_BUILDCMATRIX), UPDATE_NETWORK_LOG_FILE)
    write_log_file('[update_network.py] Make binding column files', UPDATE_NETWORK_LOG_FILE)        
    cmd = 'python3 get_binding.py %s' % (PARAMETER_FOR_BUILDCMATRIX) # won't re-compute existing binding columns unless updated
    os.system(cmd)


    # # We will only consider ChIP-seq IDs that are less than 7 days
    # # old.  Make sure put 'update:yymmdd' in the 'NOTE:' field in 
    # # parameter_for_buildCmatrix.txt for each newly added ChIP-seq
    # # data.
    # write_log_file('[update_network.py] Build binding matrix from recently added/modified ChIP-seq data.', UPDATE_NETWORK_LOG_FILE)    
    # TEMP_BINDING_FILE = BINDING_FILE + '.temp'    
    # cmd = 'python3 buildCmatrix.py %s > %s' % (PARAMETER_FOR_BUILDCMATRIX, TEMP_BINDING_FILE) 
    # os.system(cmd)

    # # If someone just touched prameter_for_buildCmatrix.txt without
    # # adding any new ChIP-seq data, we should do nothing.
    # if validate_binding_file(TEMP_BINDING_FILE):
    #     write_log_file('[update_network.py] Overwrite binding.txt.', UPDATE_NETWORK_LOG_FILE)            
    #     cm = 'mv %s %s' (TEMP_BINDING_FILE, BINDING_FILE) # Overwrite binding.txt. Make it formal.
    #     os.system(cmd)
    #     write_log_file('[update_network.py] binding.txt is updated.  Number of columns in %s = %d.' % (BINDING_FILE, num_ids(BINDING_FILE)), UPDATE_NETWORK_LOG_FILE)
        
    #     write_log_file('[update_network.py] Update target tf file %s.' % (TARGET_TF_FILE), UPDATE_NETWORK_LOG_FILE)
    #     cmd = 'python3 make_target_tf.py %s > %s' % (PARAMETER_FOR_NET, TARGET_TF_FILE)
    #     os.system(cmd)
    # else:
    #     write_log_file('[update_network.py] [WARNING] Invalid binding matrix.', UPDATE_NETWORK_LOG_FILE)
    # os.remove(TEMP_BINDING_FILE)


# Check binding.txt.  A newer binding.txt is only noted in the log here;
# the edges are recomputed in the TPM.txt section below.
updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
if 'binding.txt' in updated_file_list:
    write_log_file('[update_network.py] binding.txt has been updated.  This update will take effect next time TPM.txt is updated.', UPDATE_NETWORK_LOG_FILE)
    # create_edges0()
    # create_edges0B()
    # wedge()
    # correlation_per_group()
    # correlation_per_group_fixed_number()
    # correlation_mixtools(2)
    # correlation_mixtools(3)    
    
    ## TODO mixtool stuff, forget it for now.
    #cmd = 'nohup python3 create_edges4.py %s &' % (temp_file_name)
    #os.system(cmd)




################## TPM.txt stuff #####################################    

# Update parameter_for_buildRmatrix.txt periodically and automatically.
# This runs only on days of the month that are divisible by
# PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL.
if datetime.now().day % PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL == 0: # check if need to update parameter_for_buildRmatrix.txt
    curr_time = datetime.now().strftime('%Y%m%d%H%M')
    new_parameter_file = '../Data/temp/parameter_for_buildRmatrix.%s' % (curr_time)
    cmd = 'python3 make_parameter_rnaseq.py > %s' % (new_parameter_file) # new_parameter_file will not be updated unless download_and_map.py has finished.
    os.system(cmd)
    # num = (number of lines starting with @ in the new parameter file) - (number of sample columns in TPM.txt)
    num = number_rnaseq_diff(new_parameter_file, TPM_FILE)
    if num >= MIN_RNA_SEQ_INCREASE: # sufficient number of RNA-seq samples have been added
        write_log_file('[update_network.py] Update %s' % (PARAMETER_FOR_BUILDRMATRIX), UPDATE_NETWORK_LOG_FILE)
        # Overwriting the parameter file makes it newer than its recorded
        # timestamp, which the check further below reacts to.
        cmd = 'cp %s %s' % (new_parameter_file, PARAMETER_FOR_BUILDRMATRIX)
        os.system(cmd)
        
        # Before we rewrite TPM.txt, we should backup the old TPM.txt
        # write_log_file('[update_network.py] Backup %s' % (TPM_FILE), UPDATE_NETWORK_LOG_FILE)        
        # cmd = 'cp %s %s' % (TPM_FILE, TPM_FILE + '.backup.at.' + curr_time)
        # os.system(cmd)

        # write_log_file('[update_network.py] Rebuild %s' % (TPM_FILE), UPDATE_NETWORK_LOG_FILE)
        # cmd = 'python3 buildRmatrix.py ../Data/parameter/parameter_for_buildRmatrix.txt'
        # os.system(cmd)

    else:
        write_log_file('[update_network.py] You have downloaded %d RNA-seq since last build of TPM.txt.  TPM.txt will be rebuilt if this number reaches %d.' % (num, MIN_RNA_SEQ_INCREASE), UPDATE_NETWORK_LOG_FILE)


# Check if parameter_for_buildRmatrix.txt has been updated.  If so (and
# HOLDON is not YES in it), back up the current TPM.txt, rebuild it with
# buildRmatrix.py and record the new sample size.
updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
# TODO To simplify things, I will provide TPM.txt directly. So set the
# HOLDON option to YES in parameter_for_buildRmatrix.txt to prevent
# the following from being True.
if 'parameter_for_buildRmatrix.txt' in updated_file_list and not hold_on(PARAMETER_FOR_BUILDRMATRIX):
    write_log_file('[update_network.py] Parameter file %s has been updated.' % (PARAMETER_FOR_BUILDRMATRIX), UPDATE_NETWORK_LOG_FILE)        
    write_log_file('[update_network.py] Rebuild TPM.txt ...', UPDATE_NETWORK_LOG_FILE)
    curr_time = datetime.now().strftime('%Y%m%d%H%M%S')
    if os.path.exists(TPM_FILE):
        # Keep a gzip-compressed, time-stamped copy of the old TPM.txt in ../Data/history/expr.
        backup_file_name = '../Data/history/expr/TPM.txt.backup.at.%s' % (curr_time)
        cmd = 'cp %s %s' % (TPM_FILE, backup_file_name)
        os.system(cmd)
        cmd = 'gzip %s' % (backup_file_name)
        os.system(cmd)

    cmd = 'python3 buildRmatrix.py %s' % (PARAMETER_FOR_BUILDRMATRIX) # produce TPM.txt, the location of which is specified in TPM_TABLE in buildRmatrix.py
    os.system(cmd)

    # Append 'date<TAB>number of samples' to the sample size file.
    curr_date = datetime.now().strftime('%Y%m%d')
    tpm_sample_size = number_rnaseq_id(TPM_FILE)
    write_sample_size_file(SAMPLE_SIZE_FILE, curr_date, tpm_sample_size)
    


# Create edges using all RNA-seq experiments.
# We could touch TPM.txt to make it recent.  When TPM.txt is reported as
# updated, all edges are recomputed using the full binding.txt.
updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
if 'TPM.txt' in updated_file_list:
    # Each step is (log message, shell command); they run in this order.
    build_steps = (
        # Make a full binding.txt since we are going to use the new TPM.txt to
        # recompute all edges.  Pay attention to include-all in the
        # command-line argument: it includes all ChIP-seq IDs.
        ('[update_network.py] Build full binding matrix for the new TPM.txt.',
         'python3 buildCmatrix.py %s include-all > %s' % (PARAMETER_FOR_BUILDCMATRIX, BINDING_FILE)),
        # target_tf.txt
        ('[update_network.py] Make target_tf.txt.',
         'python3 make_target_tf.py %s > %s' % (PARAMETER_FOR_NET, TARGET_TF_FILE)),
    )
    for message, cmd in build_steps:
        write_log_file(message, UPDATE_NETWORK_LOG_FILE)
        os.system(cmd)

    # json -- make/renew json directory for displaying scatterplots.  The
    # command is prepared but deliberately not executed; only the reminder is logged.
    write_log_file("[update_network.py] Update ../Data/history/expr/json using the new TPM.txt.  Don't forget to update the static/edges/json folder in the web application.", UPDATE_NETWORK_LOG_FILE)
    cmd = 'python3 slice_TPM_to_JSON.py %s' % (PARAMETER_FOR_NET)
    ## os.system(cmd) # turn this on if we are going to use this TPM.txt for displaying scatterplots

    # Same for the binding-strength JSON: prepared, not executed.
    write_log_file("[update_network.py] Update directory ../Data/history/bind/json2.  Don't forget to copy json2 to static/edges in the web application.", UPDATE_NETWORK_LOG_FILE)
    cmd = 'python3 slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET)
    #os.system(cmd) # turn this on if we are going to use this binding.txt for displaying bar charts of binding strengths
    ## copy ../Data/history/bind/json2 and ../Data/history/expr/json to the web application folder 'static/edges' [manual]

    if False:  # TODO For now I will always use travadb's TPM.txt (138 columns) to display scatterplots. Simpler and faster.
        write_log_file('Assign tissue, refine tissue and update rnaseq_info_database.json', UPDATE_NETWORK_LOG_FILE)
        # For non-ascii letters in ENA RNA-sample description.  If this
        # statement does not work, try 'export PYTHONIOENCODING=UTF-8' in the
        # command line instead.  The export command can be put in crontab -e
        # before running this script.
        os.environ["PYTHONIOENCODING"] = "UTF-8"
        tissue_commands = (
            'python3 assign_tissue.py > ../Data/temp/experiment.and.tissue.1.txt',
            'python3 refine_tissue.py > ../Data/information/experiment.and.tissue.2.txt',
            'python3 update_rnaseq_info_json.py',
        )
        for cmd in tissue_commands:
            os.system(cmd)

    # Compute edges.  This could take a lot of time so update FILE_TIMESTAMP first.
    record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)
    edge_steps = (
        create_edges0,
        create_edges0B,
        wedge,
        correlation_per_group,
        correlation_per_group_fixed_number,
    )
    for edge_step in edge_steps:
        edge_step()
    correlation_mixtools(2)
    #correlation_mixtools(3)


########## Merge edges #######################
# Update edges.txt, a merged file from two sources, HISTORY_DIR and
# HISTORY_DIR2.  Some new edge files are being generated ...
#
# Steps:
#   1. Collect all edges.txt.* files in both history directories and remember
#      the newest modification time among them.
#   2. If any of them is newer than the merged edges.txt, concatenate them into
#      one time-stamped file in EDGE_POOL_DIR and delete the originals.
#   3. If EDGE_POOL_DIR is newer than the merged edges.txt, run merge_edges.py
#      and log the line counts before and after.
time.sleep(5)
edge_file_lst = [] # collect edge files.
most_recent_edge_modification_time = 0
for history_dir in (HISTORY_DIR, HISTORY_DIR2):
    write_log_file('[update_network.py] Look at edge files in %s.' % (history_dir), UPDATE_NETWORK_LOG_FILE)
    for fname in glob.glob(os.path.join(history_dir, 'edges.txt.*')): # many small edges.txt.* are to be merged
        edge_file_lst.append(fname)
        # Stat each file once and keep the running maximum.
        most_recent_edge_modification_time = max(most_recent_edge_modification_time, os.path.getmtime(fname))

# A merged edges.txt may not exist yet (e.g., on the very first run).  Treat a
# missing file as infinitely old (mtime 0) instead of letting getmtime() raise
# and abort the script before the time stamp file and the final log entry are
# written.
try:
    merged_edge_mtime = os.path.getmtime(MERGED_EDGE_FILE)
except FileNotFoundError:
    merged_edge_mtime = 0
    merged_edge_file_exists = False
else:
    merged_edge_file_exists = True

if not edge_file_lst:
    write_log_file('[update_network.py] No edge files to merge in %s and %s.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
elif merged_edge_mtime < most_recent_edge_modification_time: # update edges.txt only if there are newer edges to add.
    # concatenate edge files into one
    write_log_file('[update_network.py] Concatenate edge files in %s and %s into one file.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
    curr_time = datetime.now().strftime('%Y%m%d_%H%M')
    concatenate_edge_files(edge_file_lst, os.path.join(EDGE_POOL_DIR, 'edges.txt.many.one.targets.' + curr_time))
    delete_edge_files(edge_file_lst)

if merged_edge_mtime < os.path.getmtime(EDGE_POOL_DIR): # edge pool directory has been updated, create new edges.txt
    write_log_file('[update_network.py] Make a new edges.txt from edge files in %s.' % (EDGE_POOL_DIR), UPDATE_NETWORK_LOG_FILE)
    # A non-existent old edges.txt is reported as having 0 lines.
    old_line_count = num_line(MERGED_EDGE_FILE) if merged_edge_file_exists else 0
    write_log_file('[update_network.py] Number of lines in the old edges.txt: %d.' % (old_line_count), UPDATE_NETWORK_LOG_FILE)
    cmd = 'python3 merge_edges.py'
    os.system(cmd)
    write_log_file('[update_network.py] Number of lines in the new edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
    # Deployment to the web application is manual; log a reminder with the commands.
    manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt sudo cp /home/lanhui/brain/Data/temp/html_edges/edges.sqlite /var/www/brain/brain/static/edges'
    write_log_file('[update_network.py] %s' % (manual_copy_commands), UPDATE_NETWORK_LOG_FILE)
    

# exclude edges as suggested by Phil Wigge.
# write_log_file('Exclude edges (now ineffective)', UPDATE_NETWORK_LOG_FILE)
# cmd = 'python3 exclude_edges.py %s' % (EDGE_FILE)
#os.system(cmd)


# # check if parameter_for_net.txt, or TPM.txt is updated, if yes, create edges.
# updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict) 
# if ('parameter_for_net.txt' in updated_file_list or 'TPM.txt' in updated_file_list) and not hold_on(PARAMETER_FOR_NET):
#     write_log_file('Create edges.txt using new TPM.txt (size=%d) ...' % (number_rnaseq_id(TPM_FILE)), UPDATE_NETWORK_LOG_FILE)
#     time.sleep(7200) # wait two hours for the previous create_edges4.py (if any) to finish creating JSON_DIR and target_tf_fname
#     cmd = 'nohup python3 create_edges4.py %s &' % (PARAMETER_FOR_NET)  # put process to background
#     os.system(cmd)
#     time.sleep(60)


# Housekeeping: delete generated .R files from the temp directory
# (../Data/temp).  With "-mtime +2", find matches files whose age in whole
# days is greater than 2, i.e. files older than 3 days.
cmd = 'find %s -mtime +2 -name "*.R" -delete' % (TEMP_DIR)
os.system(cmd)

# Refresh the time stamp file so that the next run compares against the
# current state of the checked files.
record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)

# Final log entry, followed by a blank line to separate runs in the log.
finish_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
write_log_file('[update_network.py] Update done at %s.\n\n' % (finish_time), UPDATE_NETWORK_LOG_FILE)