summaryrefslogtreecommitdiff
path: root/Code/update_network.py
blob: 7b26f58967d8bd4a530304bdbfff00932293b14d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
#! /usr/bin/python3
# Usage: python3 update_network.py
#        Put this script under directory Code/.
#        IMPORTANT: Run this script under directory Code/. 
#        Execute the above command regularly, or
#        Cron job this command to make it run everyday at 5am:
#
#        1.  crontab -e.
#        2.  Add this line: 01 05 * * * cd /home/hui/network/v03/Code && python3 update_network.py
#
# IMPORTANT: Make sure that you execute this script (update_network.py) under the directory Code.
# 
# Purpose: periodically (e.g., per week) run this script to see if the network needs update.  If yes, update it.
#
# Set HOLDON=NO in parameter_for_buildCmatrix.txt,
# parameter_for_buildRmatrix.txt and parameter_for_net.txt to make
# changes in these file effective.
#
# parameter_for_buildRmatrix.txt will be updated automatically (I
# hope).  However, we need to update parameter_for_buildCmatrix.txt
# manually.
#
# Revision history:
#
# Last modified: 26 Feb 2017
# Last modified: 17 Mar 2017
# Last modified: 04 Apr 2017
# Last modified: 05 Apr 2017
# Last modified: 10 Apr 2017
# Last modified: 19 Apr 2017
# Last modified: 20 Apr 2017 [added create_edges0B.py which calls correlation_per_tissue.R]
# Last modified: 21 Jun 2017 [added correlation_per_group.R and wedge.R]
# Last modified: 30 Jun 2017 [added get_sample_size so that we have sample size for correlations of type all, added in ll_dict ]
# Last modified: 23 Jan 2018 [edited a few print-out messages]
# Last modified: 25 Jan 2018 [updated function compute_metric(), set S=365.0 and modified return statement]
# Last modified: 24 Aug 2018 [updated function from get_sample_size(d, sorted_keys, day) to get_sample_size(d, sorted_keys, day, rcond_string)]
# Last modified: 03 Feb 2019
# Last modified: 08 Aug 2019, hui
# Last modified: 10 Aug 2019, hui <lanhui@zjnu.edu.cn>
# Last modified: 23 Aug 2019, hui <lanhui@zjnu.edu.cn> [correlation_mixtools(num_component)]
# Last modified: 10 Sep 2019, hui <lanhui@zjnu.edu.cn> [correlation_mixtools, check the previous R session has finished before starting a new one.]

import os, sys
import numpy as np
import glob
import time
import subprocess
from datetime import datetime
from param4net import make_global_param_dict, get_key_value
from log import write_log_file
from configure import HISTORY_DIR, HISTORY_DIR2, FILE_TIMESTAMP, SAMPLE_SIZE_FILE, TEMP_DIR, \
    PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_BUILDRMATRIX, \
    PARAMETER_FOR_NET, PARAMETER_FOR_NET_TRAVADB_STRESS, PARAMETER_FOR_NET_TRAVADB_MAP, PARAMETER_FOR_NET_MILD_DROUGHT, PARAMETER_FOR_NET_WIGGELAB_DIURNAL, \
    BINDING_FILE, TPM_FILE, \
    BUILDRMATRIX_RENEW_INTERVAL, MIN_RNA_SEQ_INCREASE, UPDATE_NETWORK_LOG_FILE, NEW_OR_UPDATED_CHIP_FILE, \
    RNA_SEQ_INFO_DATABASE, RNA_SEQ_INFO_DATABASE_JSON, GENE_ID_FIRST_TWO_LETTERS, MEMORY_STRENGTH, \
    MAPPED_RDATA_DIR, MAPPED_CDATA_DIR, \
    EDGE_POOL_DIR, MERGED_EDGE_FILE, \
    TARGET_TF_FILE



## Helper functions

def get_value(s, delimit):
    ''' Return the part of s that follows the first occurrence of delimit,
    with leading and trailing whitespace removed.

    Later occurrences of delimit are kept as part of the value.  Raises
    IndexError when delimit does not occur in s.
    '''
    return s.split(delimit, 1)[1].strip()


def validate_webapp_dir(para_for_net):
    ''' Create ../Webapp/static/json/genes.json when it is missing.

    para_for_net - parameter file handed to make_global_param_dict; the
                   resulting dictionary's GENE_ID_AND_GENE_NAME entry is
                   passed to text2json.py.

    The paths are relative, so run this under the directory Code.
    '''
    glb_param_dict = make_global_param_dict(para_for_net)
    genes_json_present = os.path.exists('../Webapp/static/json/genes.json')
    if genes_json_present:
        return

    # genes.json is not present, create one
    print('[update_network.py]: cannot find genes.json, make one ...')
    gene_table = glb_param_dict['GENE_ID_AND_GENE_NAME']
    os.system('python3 text2json.py %s > ../Webapp/static/json/genes.json' % (gene_table))

    
def make_paths(s):
    ''' Create directory s, including any missing parent directories.

    Nothing happens when s is already a directory.  exist_ok=True replaces
    the former isdir-then-makedirs sequence, so a directory that appears
    between the check and the creation (e.g. made by an overlapping run) no
    longer raises FileExistsError.  An existing non-directory at s still
    raises FileExistsError.
    '''
    os.makedirs(s, exist_ok=True)


def make_important_dirs():
    ''' Create every directory the pipeline writes into, if not yet present.

    All paths except EDGE_POOL_DIR are relative to the current directory.
    The directories are created in the order listed.
    '''
    required_dirs = (
        '../Data/history/edges/many_targets',
        '../Data/history/edges/one_target',
        '../Data/log',
        '../Data/information',
        '../Data/temp',
        '../Data/upload',
        '../Data/parameter',
        '../Data/R/Mapped',
        '../Data/R/Mapped/public',
        '../Data/R/Mapped/inhouse',
        '../Data/R/Mapped/other',
        '../Data/R/Raw',
        '../Data/C/Mapped',
        '../Data/C/Raw',
        '../Data/history/edges',
        EDGE_POOL_DIR,
        '../Data/history/bind',
        '../Data/history/expr',
        '../Webapp/static/json',
        '../Webapp/static/edges',
        '../Webapp/templates',
    )
    for directory in required_dirs:
        make_paths(directory)

    
def num_line(fname):
    ''' Return the number of lines in file fname, or 0 if fname does not exist. '''
    if os.path.exists(fname):
        with open(fname) as handle:
            return len(handle.readlines())
    return 0


def num_ids(fname):
    ''' Return number of IDs in fname.

    The count is the number of tab-separated fields on the first line of
    fname minus one (the first field is not counted as an ID).

    Only the first line is read, so a large matrix is not loaded into memory
    just to inspect its header.  An empty file yields 0 instead of raising
    IndexError.
    '''
    with open(fname) as f:
        header = f.readline()
    if not header:
        return 0  # empty file: no header, hence no IDs
    return len(header.split('\t')) - 1


def write_sample_size_file(sample_size_file, curr_date, tpm_sample_size):
    ''' Append one record "curr_date<TAB>tpm_sample_size" to sample_size_file.

    The file is created when it does not exist yet.
    '''
    # Append mode creates a missing file, so it serves both the "new file"
    # and the "existing file" case.
    with open(sample_size_file, 'a') as out:
        out.write('%s\t%s\n' % (curr_date, tpm_sample_size))


def age_of_file_in_days(fname):
    ''' Return the time since fname was last modified, in days (a float). '''
    modified_at = os.stat(fname).st_mtime
    elapsed_seconds = time.time() - modified_at
    return elapsed_seconds/(3600*24.0)


def age_of_file_in_seconds(fname):
    ''' Return the time since fname was last modified, in seconds (a float). '''
    modified_at = os.stat(fname).st_mtime
    return time.time() - modified_at


def hold_on(fname):
    ''' Return True if one of the first 100 lines of fname, ignoring
    surrounding whitespace, starts with %%HOLDON=YES; otherwise False.
    '''
    with open(fname) as handle:
        head = handle.readlines()[:100]  # only the first 100 lines are checked for HOLDON
    return any(row.strip().startswith('%%HOLDON=YES') for row in head)
    

def all_files_present(lst):
    ''' Return the paths in lst that do not exist, in their original order.

    lst - a list of file names to check

    An empty return value means every file is present.  For a missing path
    containing 'edges.txt', a warning is also written to the log file.
    '''
    missing_file_lst = []
    for candidate in lst:
        if os.path.exists(candidate):
            continue
        if 'edges.txt' in candidate:
            write_log_file('[update_network.py] WARNING: must have %s to update network.  Call create_edges*.py to create edge files.' % (candidate), UPDATE_NETWORK_LOG_FILE)
        missing_file_lst.append(candidate)
    return missing_file_lst


def record_file_time(lst, fname):
    '''
        Write one line "basename<TAB>mtime" per file in lst to fname,
        overwriting fname.  The mtime is in whole seconds; a file that
        does not exist is recorded with time 0.

        lst - a list of files
        fname - a recorder file
    '''
    with open(fname, 'w') as recorder:
        rows = []
        for path in lst:
            mtime = int(os.stat(path).st_mtime) if os.path.exists(path) else 0
            rows.append('%s\t%d\n' % (os.path.basename(path), mtime))
        recorder.write(''.join(rows))
        

def read_file_timestamp(ftimestamp):
    ''' Read the recorder file ftimestamp and return a dictionary that maps
    each file name (first field) to its recorded time (second field, int).

    Fields are separated by whitespace.  Lines with fewer than two fields
    (e.g. blank lines) are skipped instead of raising IndexError.  Raises
    ValueError if the second field is not an integer.
    '''
    d = {}
    with open(ftimestamp) as f:  # closed even if a line fails to parse
        for line in f:
            lst = line.split()
            if len(lst) < 2:
                continue  # blank or incomplete line
            d[lst[0]] = int(lst[1])
    return d


def file_updated(fname, d):
    ''' Return True if fname was modified after the time recorded in d.

    fname - path of the file to check
    d     - dictionary mapping a file's base name to its recorded
            modification time (whole seconds)

    Raises KeyError when the base name of fname is not a key of d.
    '''
    current = int(os.stat(fname).st_mtime)
    recorded = d[os.path.basename(fname)]
    return recorded < current


def get_updated_files(lst, d):
    ''' Return the base names of the files in lst that file_updated reports
    as modified after the time recorded in d, in the order of lst.
    '''
    return [os.path.basename(path) for path in lst if file_updated(path, d)]


def get_sample_size(d, sorted_keys, day, rcond_string):
    ''' Return a sample size.

    d            - dictionary mapping a key to a sample size
    sorted_keys  - the keys of d in ascending order
    day          - value compared against the keys
    rcond_string - if made of digits only, it is the answer itself

    Rules, in order: a numeric rcond_string is returned as an int; an empty
    d gives the default 1200; otherwise the value of the first key that is
    >= day is returned, or that of the last key when no key qualifies.
    '''
    if rcond_string.isdigit():
        return int(rcond_string)

    if not d:
        return 1200 # a default number of sample size, CHANGE

    on_or_after = [key for key in sorted_keys if key >= day]
    chosen = on_or_after[0] if on_or_after else sorted_keys[-1] # fall back to last key, latest date
    return d[chosen]


def number_rnaseq_id(tpm_file):
    ''' Return the number of whitespace-separated fields on the first line
    of tpm_file, not counting the first field.

    Only the first line is read, so the whole table is not loaded just to
    inspect its header.  An empty file or a blank first line yields 0
    (formerly IndexError and -1, respectively).
    '''
    with open(tpm_file) as f:
        first_line = f.readline()
    fields = first_line.split()  # split() also discards surrounding whitespace
    return max(len(fields) - 1, 0)

    
def number_rnaseq_diff(para_file, tpm_file):
    ''' Return (number of lines in para_file starting with @, ignoring
    surrounding whitespace) minus (number_rnaseq_id(tpm_file)).
    '''
    with open(para_file) as handle:
        at_lines = sum(1 for row in handle if row.strip().startswith('@'))

    return at_lines - number_rnaseq_id(tpm_file)


def validate_gene_file(fname):
    ''' Check that every line of fname has at least six tab-separated fields.

    On the first offending line a message is printed and the program exits
    through sys.exit().  Returns None when all lines pass.
    '''
    with open(fname) as handle:
        records = handle.readlines()

    for record in records: # check all lines
        record = record.strip()
        fields = record.split('\t')
        if len(fields) >= 6:
            continue
        print('[update_network.py]:Not enought fields: %s.  Only %d are given. Each line must have gene_id, gene_name, chr, start, end, strand, description (optional).  See prepare_gene_file.py in the documentation on how to prepare this file.' % (record, len(fields)))
        sys.exit()

            
def validate_parameter_for_buildcmatrix(fname):
    ''' Rudimentary check of the parameter file for buildCmatrix.

    Prints a message and exits through sys.exit() when
      - fname does not exist,
      - the file given by GENE_FILE or CHR_INFO does not exist, or the gene
        file fails validate_gene_file,
      - DESTINATION is not a directory,
      - TARGET_RANGE is not an integer greater than 0,
      - GENE_FILE, DESTINATION or CHR_INFO is not specified,
      - there is no LOCATION: line.
    A LOCATION that does not exist only produces a warning.

    Lines starting with %% are parsed with get_key_value after the %% is
    removed.
    '''
    # first the file must exist
    if not os.path.exists(fname):
        print('[update_network.py]:CANNOT FIND %s.' % (fname))
        sys.exit()
    with open(fname) as f:
        lines = f.readlines()
    d = {}
    location_count = 0
    for line in lines:
        line = line.strip()
        if line.startswith('%%'):
            k, v = get_key_value(line[2:])
            d[k] = v
            if k == 'GENE_FILE' or k == 'CHR_INFO':
                if not os.path.exists(v):
                    print('[update_network.py]:%s not exists.' % (v))
                    sys.exit()
                if k == 'GENE_FILE':
                    validate_gene_file(v)
            if k == 'DESTINATION':
                if not os.path.isdir(v):
                    print('[update_network.py]:%s not exists.' % (v))
                    sys.exit()
            if k == 'TARGET_RANGE':
                try:
                    target_range = int(v)
                except ValueError:
                    print('[update_network.py]:Target range (%s) must be an integer.' % (v))
                    sys.exit()
                if target_range <= 0:
                    # v is the raw text from the file, so it is formatted with
                    # %s; the former %d raised TypeError instead of printing.
                    print('[update_network.py]:Target range (%s) must be greater than 0.' % (v))
                    sys.exit()
        if line.startswith('LOCATION:'):
            v = get_value(line, ':')
            location_count += 1
            if not os.path.exists(v):
                print('[Warning] update_network.py: Location %s does not exists.' % (v))
                #sys.exit()

    if not 'GENE_FILE' in d:
        print('[update_network.py]:Must specify GENE_FILE.')
        sys.exit()
    if not 'DESTINATION' in d:
        print('[update_network.py]:Must specify DESTINATION.')
        sys.exit()
    if not 'CHR_INFO' in d:
        print('[update_network.py]:Must specify CHR_INFO.')
        sys.exit()
    if location_count == 0:
        print('[update_network.py]:Must contain at least one ChIP-seq.')
        sys.exit()
        

def validate_parameter_for_buildrmatrix(fname):
    ''' Rudimentary check of the parameter file for buildRmatrix.

    Prints a message and exits through sys.exit() when fname does not
    exist, when the file given by GENE_LIST does not exist, when GENE_LIST
    is not specified, or when there is no LOCATION: line.  A LOCATION that
    does not exist is reported but tolerated.
    '''
    # first the file must exist
    if not os.path.exists(fname):
        print('[update_network.py]:CANNOT FIND %s.' % (fname))
        sys.exit()

    with open(fname) as handle:
        rows = [row.strip() for row in handle.readlines()]

    settings = {}
    n_locations = 0
    for row in rows:
        if row.startswith('%%'):
            key, value = get_key_value(row[2:])
            settings[key] = value
            if key == 'GENE_LIST' and not os.path.exists(value):
                print('[update_network.py]:%s not exists.' % (value))
                sys.exit()
        elif row.startswith('LOCATION:'):
            location = row.split(':', 1)[1].strip() # text after the first colon
            n_locations += 1
            if not os.path.exists(location):
                print('[update_network.py]:Location %s does not exists.' % (location))
                #sys.exit()

    if 'GENE_LIST' not in settings:
        print('[update_network.py]:Must specify GENE_LIST.')
        sys.exit()
    if n_locations == 0:
        print('[update_network.py]:Must contain at least one RNA-seq.')
        sys.exit()


def validate_parameter_for_net(fname):
    ''' Rudimentary check of the parameter file for the network.

    Prints a message and exits through sys.exit() when fname does not
    exist, or when GENE_LIST, GENE_ID_AND_GENE_NAME, BINDING_INFO or
    EXPRESSION_INFO is not specified or names a file that does not exist.

    BINDING_MATRIX and EXPRESSION_MATRIX are softer: when one is not
    specified or names a file that does not exist, a message and a hint on
    how to build the file are printed, and the function carries on.

    Lines starting with %% are parsed with get_key_value after the %% is
    removed.
    '''
    # first the file must exist
    if not os.path.exists(fname):
        print('[update_network.py]:CANNOT FIND %s.' % (fname))
        sys.exit()
    with open(fname) as f:
        lines = f.readlines()

    # keys whose file must exist; a problem with any of them is fatal
    mandatory_keys = ('GENE_LIST', 'GENE_ID_AND_GENE_NAME', 'BINDING_INFO', 'EXPRESSION_INFO')
    # keys whose file can be built later; maps key -> hint on how to build it
    build_hints = {
        'BINDING_MATRIX': '[update_network.py]:Use python3 buildCmatrix.py paramter_for_buildCmatrix.txt > binding.txt to create binding.txt.',
        'EXPRESSION_MATRIX': '[update_network.py]:Use python3 buildRmatrix.py paramter_for_buildRmatrix.txt to create TPM.txt.',
    }

    d = {}
    for line in lines:
        line = line.strip()
        if not line.startswith('%%'):
            continue
        k, v = get_key_value(line[2:])
        d[k] = v
        if k in mandatory_keys:
            if not os.path.exists(v):
                print('[update_network.py]:%s not exists.' % (v))
                sys.exit()
        elif k in build_hints:
            if not os.path.exists(v):
                print('[update_network.py]:%s not exists.' % (v))
                print(build_hints[k])

    for k in mandatory_keys:
        if k not in d:
            # Name the key that is actually missing (the GENE_LIST case used
            # to say GENE_FILE).
            print('[update_network.py]:Must specify %s.' % (k))
            sys.exit()
    for k in ('BINDING_MATRIX', 'EXPRESSION_MATRIX'):
        if k not in d:
            # Report the missing key itself; the old message printed the
            # value of whichever setting happened to be parsed last.
            print('[update_network.py]:%s is not specified.' % (k))
            print(build_hints[k])
        


def need_update_parameter_file(param_file, dirs):
    ''' Make sure param_file is consistent with dirs (a list of directories to check against).

    Walks every directory in dirs and looks at files whose full path
    contains 'narrowPeak' or '_quant'.  Such a file is reported when it is
    not named by any LOCATION: line of param_file and was modified after
    param_file.

    Returns a list of messages of the form '<file> is not in <param_file>';
    an empty list means nothing needs updating.

    The LOCATION value is everything after the first colon with surrounding
    whitespace removed, the same rule get_value applies.  Previously the
    value was cut at a second colon and kept its leading blank, so
    "LOCATION: /path" never matched the file it names.
    '''
    files_in_parameter = set()
    with open(param_file) as f:
        for line in f:
            line = line.strip()
            if line.startswith('LOCATION:'):
                location = line.split(':', 1)[1].strip()
                files_in_parameter.add(os.path.abspath(location))
    param_modification_time = os.path.getmtime(param_file)

    result = []
    for directory in dirs:
        for root, dirnames, filenames in os.walk(os.path.abspath(directory)):
            for filename in filenames:
                k = os.path.join(root, filename)
                if 'narrowPeak' in k or '_quant' in k:
                    if k not in files_in_parameter and os.path.getmtime(k) > param_modification_time:
                        result.append('%s is not in %s' % (k, param_file))

    return result


def validate_binding_file(fname):
    ''' Return False if fname contains the message
    'buildCmatrix: ChIP-seq ID list is empty.', True otherwise.

    The file is read inside a with block so it is closed on every path,
    including the early return (which used to skip f.close()).
    '''
    with open(fname) as f:
        for line in f:
            if 'buildCmatrix: ChIP-seq ID list is empty.' in line:
                return False
    return True


def create_edges0():
    ''' Run create_edges0.py with PARAMETER_FOR_NET, if that file exists,
    and write a log entry first.

    The four optional parameter files are only probed for existence; their
    runs are currently switched off (the os.system call is commented out).
    '''
    if os.path.exists(PARAMETER_FOR_NET):
        write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET), UPDATE_NETWORK_LOG_FILE)
        os.system('python3 create_edges0.py %s' % (PARAMETER_FOR_NET))

    # The following commands are optional. For example, a user who runs the
    # pipeline locally does not have to provide these TPM tables.
    optional_parameter_files = (
        PARAMETER_FOR_NET_TRAVADB_STRESS,
        PARAMETER_FOR_NET_TRAVADB_MAP,
        PARAMETER_FOR_NET_MILD_DROUGHT,
        PARAMETER_FOR_NET_WIGGELAB_DIURNAL,
    )
    for optional_file in optional_parameter_files:
        if os.path.exists(optional_file):
            #write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (optional_file), UPDATE_NETWORK_LOG_FILE)
            cmd = 'python3 create_edges0.py %s' % (optional_file)
            #os.system(cmd)


def create_edges0B():
    ''' Run create_edges0B.py with PARAMETER_FOR_NET, if that file exists.

    The log entry written beforehand reports num_ids(BINDING_FILE) as the
    size of binding.txt.
    '''
    if not os.path.exists(PARAMETER_FOR_NET):
        return
    binding_size = num_ids(BINDING_FILE)
    write_log_file('[update_network.py] Create tissue-specific edges.txt using new binding.txt (size=%d). create_edges0B.py' % (binding_size), UPDATE_NETWORK_LOG_FILE)
    os.system('python3 create_edges0B.py %s' % (PARAMETER_FOR_NET)) # call correlation_per_tissue.R


def wedge():
    ''' Run wedge.R through Rscript, if PARAMETER_FOR_NET exists, and write
    a log entry first.
    '''
    if not os.path.exists(PARAMETER_FOR_NET):
        return
    write_log_file('[update_network.py] Create edges using wedge shapes. wedge.R', UPDATE_NETWORK_LOG_FILE)
    os.system('Rscript wedge.R')


def correlation_per_group():
    ''' Run correlation_per_group.R through Rscript, if PARAMETER_FOR_NET
    exists.  The log entry written beforehand reports
    number_rnaseq_id(TPM_FILE) as the size of TPM.txt.
    '''
    # For 3,130 RNA-seq samples and 30,000 pairs, need at least 10 hours.
    if not os.path.exists(PARAMETER_FOR_NET):
        return
    tpm_size = number_rnaseq_id(TPM_FILE)
    write_log_file('[update_network.py] Create group-specific edges.txt using new TPM.txt (size=%d). correlation_per_group.R' % (tpm_size), UPDATE_NETWORK_LOG_FILE)
    os.system('Rscript correlation_per_group.R')


def correlation_per_group_fixed_number():
    ''' Run correlation_per_group_fixed_number.R through Rscript, if
    PARAMETER_FOR_NET exists.  The log entry written beforehand reports
    number_rnaseq_id(TPM_FILE) as the size of TPM.txt.
    '''
    if not os.path.exists(PARAMETER_FOR_NET):
        return
    tpm_size = number_rnaseq_id(TPM_FILE)
    write_log_file('[update_network.py] Create group-specific (fixed) edges.txt using new TPM.txt (size=%d). correlation_per_group_fixed_number.R' % (tpm_size), UPDATE_NETWORK_LOG_FILE)
    os.system('Rscript correlation_per_group_fixed_number.R')


def correlation_mixtools(num_component):
    ''' Run create_edges_mixtool.R with num_component components, unless an
    R process is already running.

    num_component - integer handed to create_edges_mixtool.R on the
                    command line
    '''
    # create_edges_mixtool.R takes a long time (several days), so a new run
    # is started only after the previous R computation has finished.
    # os.system('pidof R') returns 0 if R is running.
    r_is_running = os.system('pidof R') == 0
    if r_is_running:
        return
    write_log_file('[update_network.py] Create edges.txt using TPM.txt (size=%d).  create_edges_mixtool.R with %d components.' % (number_rnaseq_id(TPM_FILE), num_component), UPDATE_NETWORK_LOG_FILE)
    os.system('Rscript create_edges_mixtool.R %d' % (num_component))


def check_rnaseq_info():
    ''' Check RNA_SEQ_INFO_DATABASE and RNA_SEQ_INFO_DATABASE_JSON, in that
    order, and write a reminder to the log file for each one that is
    missing or older than 120 days.
    '''
    for info_file in (RNA_SEQ_INFO_DATABASE, RNA_SEQ_INFO_DATABASE_JSON):
        if not os.path.exists(info_file):
            write_log_file('[update_network.py] [MISSING] Must create %s.' % (info_file), UPDATE_NETWORK_LOG_FILE)
            continue
        if age_of_file_in_days(info_file) > 120: # older than 120 days
            write_log_file('[update_network.py] Need update %s. It is %d days old.' % (info_file, age_of_file_in_days(info_file)), UPDATE_NETWORK_LOG_FILE)


# main
# Everything below runs at import/execution time, top to bottom.  Several
# paths are relative ('../Data/...', '../Webapp/...'), so the script has to
# be started from the directory Code.

# The mandatory files.  Their modification times are compared with the
# recorded timestamps further down to decide which stage needs to run.
FILE_LIST_TO_CHECK = [PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_BUILDRMATRIX, PARAMETER_FOR_NET, \
                      MERGED_EDGE_FILE, BINDING_FILE, TPM_FILE] # a list of important files

make_important_dirs() # make important directories (if non-existent) for holding various kinds of files; relies on the current directory being Code
#validate_webapp_dir(PARAMETER_FOR_NET) # make sure the directory Webapp contains necessary files, e.g., genes.json.

check_rnaseq_info() # rnaseq information is useful for displaying scatterplots; logs a reminder if the info files are missing or older than 120 days

# Make sure all necessary files are present, if not, make them if possible.
# Bootstrap order: the three parameter files must be supplied by the user;
# binding.txt and TPM.txt each end the run with a request to rerun;
# edges.txt is created last, once it is the only file still missing.
miss_lst = all_files_present(FILE_LIST_TO_CHECK) # check if any of them are missing
if miss_lst != []: # miss_lst is non-empty in the beginning.
    print('These mandatory files are missing: %s.\nPrepare them first.' % (' '.join(miss_lst)))
    write_log_file('[update_network.py] Cannot find these required files:%s' % (' '.join(miss_lst)), UPDATE_NETWORK_LOG_FILE)

    # initially, we (at most) only have three parameter files, no binding.txt, TPM.txt or edges.txt ...
    important_miss_number = 0
    if PARAMETER_FOR_BUILDCMATRIX in miss_lst:
        print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_BUILDCMATRIX))
        important_miss_number += 1

    if PARAMETER_FOR_BUILDRMATRIX in miss_lst:
        print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_BUILDRMATRIX))
        important_miss_number += 1

    if PARAMETER_FOR_NET in miss_lst:
        print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_NET))
        important_miss_number += 1

    if important_miss_number > 0:
        sys.exit() # need to provide all the above three files; otherwise cannot proceed

    if BINDING_FILE in miss_lst:
        print('[update_network.py]: make initial binding.txt ... wait')
        write_log_file('[update_network.py] Make initial binding.txt', UPDATE_NETWORK_LOG_FILE)
        # Both commands are currently switched off (os.system is commented
        # out), so binding.txt is not actually built here.
        cmd = 'python3 get_binding.py %s' % (PARAMETER_FOR_BUILDCMATRIX)
        #os.system(cmd)
        cmd = 'python3 buildCmatrix.py %s > %s' % (PARAMETER_FOR_BUILDCMATRIX, BINDING_FILE)
        #os.system(cmd)
        print('[update_network.py]: IMPORATNT: make sure BINDING_MATRIX in %s was set %s and rerun update_network.py.' % (PARAMETER_FOR_NET, BINDING_FILE))
        sys.exit()

    if TPM_FILE in miss_lst:
        print('[update_network.py]: make initial TPM.txt ... wait')
        write_log_file('[update_network.py] Make initial TPM.txt', UPDATE_NETWORK_LOG_FILE)
        cmd = 'python3 buildRmatrix.py %s' % (PARAMETER_FOR_BUILDRMATRIX) # produce TPM.txt
        os.system(cmd)
        print('[update_network.py]:IMPORTANT: make sure EXPRESSION_MATRIX in %s was set %s and rerun update_network.py.' % (PARAMETER_FOR_NET, TPM_FILE))
        sys.exit()

    miss_lst2 = all_files_present(FILE_LIST_TO_CHECK) # check files again
    if len(miss_lst2) == 1 and miss_lst2[0] == MERGED_EDGE_FILE: # all other files are ready except edges.txt, make one.
        print('[update_network.py]: make initial edges.txt ... wait')
        create_edges0() # the call used to be misspelled create_edgeds0, which raised NameError


# Make json2 (sliced binding.txt) if it does not exist.  Copy json2 to
# the web application folder static/edges [do it manually] for displaying
# binding strength plots.
# Only attempted when binding.txt itself is present.
if not os.path.isdir('../Data/history/bind/json2') and os.path.exists(BINDING_FILE):
    write_log_file('Make directory ../Data/history/bind/json2.  Don\'t forget to copy json2 to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE)
    cmd = 'python3 slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET)
    os.system(cmd)


# Make json (sliced TPM.txt) if it does not exist.  Copy json to the
# web application folder static/edges [manual] for displaying gene
# expression scatterplots.
# Only attempted when TPM.txt itself is present.
if not os.path.isdir('../Data/history/expr/json') and os.path.exists(TPM_FILE):
    write_log_file('Make directory ../Data/history/expr/json.  Don\'t forget to copy json to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE)
    cmd = 'python3 slice_TPM_to_JSON.py %s' % (PARAMETER_FOR_NET)
    os.system(cmd)


# Make sure parameter files are present and valid (rudimentary check but important).
# Each validator prints a message and exits the program on a fatal problem.
validate_parameter_for_buildcmatrix(PARAMETER_FOR_BUILDCMATRIX)
validate_parameter_for_buildrmatrix(PARAMETER_FOR_BUILDRMATRIX)
validate_parameter_for_net(PARAMETER_FOR_NET)


# If the file timestamp does not exist, create one from the files' current
# modification times (files that do not exist are recorded with time 0).
if not os.path.exists(FILE_TIMESTAMP): 
    record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)

# Get update time of mandatory files: base name -> recorded modification time.
# Every "has it been updated" test below compares against this dictionary.
timestamp_dict = read_file_timestamp(FILE_TIMESTAMP)



################## binding.txt stuff #####################################
# Check parameter_for_buildCmatrix.txt: when it is newer than its recorded
# timestamp and HOLDON is not YES, (re)make the binding column files.
updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
if 'parameter_for_buildCmatrix.txt' in updated_file_list and not hold_on(PARAMETER_FOR_BUILDCMATRIX):
    write_log_file('[update_network.py] Parameter file %s has been updated.' % (PARAMETER_FOR_BUILDCMATRIX), UPDATE_NETWORK_LOG_FILE)
    write_log_file('[update_network.py] Make binding column files', UPDATE_NETWORK_LOG_FILE)        
    cmd = 'python3 get_binding.py %s' % (PARAMETER_FOR_BUILDCMATRIX) # won't re-compute existing binding columns unless updated
    os.system(cmd)


# Re-evaluate the list, since the step above may have changed files.
# A newer binding.txt is only logged here; no edges are recomputed at this
# point (the calls below are switched off).
updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
if 'binding.txt' in updated_file_list:
    write_log_file('[update_network.py] binding.txt has been updated.  This update will take effect next time TPM.txt is updated.', UPDATE_NETWORK_LOG_FILE)
    # create_edges0()
    # create_edges0B()
    # wedge()
    # correlation_per_group()
    # correlation_per_group_fixed_number()
    # correlation_mixtools(2)
    # correlation_mixtools(3)    
    
    ## TODO mixtool stuff, forget it for now.
    #cmd = 'nohup python3 create_edges4.py %s &' % (temp_file_name)
    #os.system(cmd)




################## TPM.txt stuff #####################################    

# update parameter_for_buildRmatrix.txt periodically and automatically.
# The check runs on days of the month that are divisible by
# BUILDRMATRIX_RENEW_INTERVAL.
if datetime.now().day % BUILDRMATRIX_RENEW_INTERVAL == 0: # check if need to update parameter_for_buildRmatrix.txt
    curr_time = datetime.now().strftime('%Y%m%d%H%M')
    new_parameter_file = '../Data/temp/parameter_for_buildRmatrix.%s' % (curr_time)
    cmd = 'python3 make_parameter_rnaseq.py > %s' % (new_parameter_file) # new_parameter_file will not be updated unless download_and_map.py has finished.
    os.system(cmd)
    # num = (number of lines starting with @ in the candidate parameter file)
    #       - (number of sample columns in the current TPM.txt)
    num = number_rnaseq_diff(new_parameter_file, TPM_FILE)
    if num >= MIN_RNA_SEQ_INCREASE: # sufficient number of RNA-seq samples have been added
        write_log_file('[update_network.py] Update %s' % (PARAMETER_FOR_BUILDRMATRIX), UPDATE_NETWORK_LOG_FILE)
        # Overwriting the parameter file refreshes its modification time,
        # which is what the check below looks at.
        cmd = 'cp %s %s' % (new_parameter_file, PARAMETER_FOR_BUILDRMATRIX)
        os.system(cmd)
    else:
        write_log_file('[update_network.py] You have downloaded %d RNA-seq since last build of TPM.txt.  TPM.txt will be rebuilt if this number reaches %d.' % (num, MIN_RNA_SEQ_INCREASE), UPDATE_NETWORK_LOG_FILE)


# Check if parameter_for_buildRmatrix.txt has been updated
updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
# TODO To simplify things, I will provide TPM.txt directly. So set the
# HOLDON option to YES in parameter_for_buildRmatrix.txt to prevent
# the following from being True.
if 'parameter_for_buildRmatrix.txt' in updated_file_list and not hold_on(PARAMETER_FOR_BUILDRMATRIX):
    write_log_file('[update_network.py] Parameter file %s has been updated.' % (PARAMETER_FOR_BUILDRMATRIX), UPDATE_NETWORK_LOG_FILE)        
    write_log_file('[update_network.py] Rebuild TPM.txt ...', UPDATE_NETWORK_LOG_FILE)
    curr_time = datetime.now().strftime('%Y%m%d%H%M%S')
    # Keep a gzipped, time-stamped copy of the current TPM.txt before rebuilding it.
    if os.path.exists(TPM_FILE):
        backup_file_name = '../Data/history/expr/TPM.txt.backup.at.%s' % (curr_time)
        cmd = 'cp %s %s' % (TPM_FILE, backup_file_name)
        os.system(cmd)
        cmd = 'gzip %s' % (backup_file_name)
        os.system(cmd)

    cmd = 'python3 buildRmatrix.py %s' % (PARAMETER_FOR_BUILDRMATRIX) # produce TPM.txt, whose location is specified in TPM_TABLE in buildRmatrix.py
    os.system(cmd)

    # Append today's date and the new sample size to the sample size file.
    curr_date = datetime.now().strftime('%Y%m%d')
    tpm_sample_size = number_rnaseq_id(TPM_FILE)
    write_sample_size_file(SAMPLE_SIZE_FILE, curr_date, tpm_sample_size)
    


# Create edges using all RNA-seq experiments
# This is the only stage that recomputes edges; it runs when TPM.txt is
# newer than its recorded timestamp.
updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
if 'TPM.txt' in updated_file_list: # we could _touch_ TPM.txt to make it recent.  We will recompute edges using the full binding.txt.
    # Make a full binding.txt since we are going to use the new TPM.txt to recompute all edges
    write_log_file('[update_network.py] Build full binding matrix for the new TPM.txt.', UPDATE_NETWORK_LOG_FILE)
    cmd = 'python3 buildCmatrix.py %s include-all > %s' % (PARAMETER_FOR_BUILDCMATRIX, BINDING_FILE) # include all ChIP-seq IDs.  Pay attention to include-all in the command-line argument.
    os.system(cmd)
    
    # target_tf.txt
    write_log_file('[update_network.py] Make target_tf.txt.', UPDATE_NETWORK_LOG_FILE)    
    cmd = 'python3 make_target_tf.py %s > %s' % (PARAMETER_FOR_NET, TARGET_TF_FILE)
    os.system(cmd)

    # NOTE: the two log entries below are written even though the
    # corresponding slice commands are currently switched off.
    write_log_file('[update_network.py] Update ../Data/history/expr/json using the new TPM.txt.  Don\'t forget to update the static/edges/json folder in the web application.', UPDATE_NETWORK_LOG_FILE)    
    ## json -- make/renew json directory for displaying scatterplots
    cmd = 'python3 slice_TPM_to_JSON.py %s' % (PARAMETER_FOR_NET)
    ## os.system(cmd) # turn this on if we are going to use this TPM.txt for displaying scatterplots
    write_log_file('[update_network.py] Update directory ../Data/history/bind/json2.  Don\'t forget to copy json2 to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE)
    cmd = 'python3 slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET)
    #os.system(cmd) # turn this on if we are going to use this binding.txt for displaying bar charts of binding strengths
    ## copy ../Data/history/bind/json2 and ../Data/history/expr/json to the web application folder 'static/edges' [manual]

    # Disabled branch (never executed): tissue assignment and refresh of rnaseq_info_database.json.
    if False:  # TODO For now I will always use travadb's TPM.txt (138 columns) to display scatterplots. Simpler and faster.
        write_log_file('Assign tissue, refine tissue and update rnaseq_info_database.json', UPDATE_NETWORK_LOG_FILE)
        os.environ["PYTHONIOENCODING"] = "UTF-8" # for non-ascii letters in ENA RNA-sample description. If this statement does not work, try 'export PYTHONIOENCODING=UTF-8' in the command line instead.   The export command can be put in crontab -e before running this script
        cmd = 'python3 assign_tissue.py'
        os.system(cmd)
        cmd = 'python3 refine_tissue.py > ../Data/information/experiment.and.tissue.2.txt'
        os.system(cmd)
        cmd = 'python3 update_rnaseq_info_json.py'
        os.system(cmd)



    # Compute edges.  This could take a lot of time so update FILE_TIMESTAMP first.
    record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)
    # The edge builders run one after another, in this order.
    create_edges0()
    create_edges0B()
    wedge()
    correlation_per_group()
    correlation_per_group_fixed_number()
    correlation_mixtools(2) # two components
    #correlation_mixtools(3)    


# exclude edges as suggested by Phil Wigge.
# write_log_file('Exclude edges (now ineffective)', UPDATE_NETWORK_LOG_FILE)
# cmd = 'python3 exclude_edges.py %s' % (EDGE_FILE)
#os.system(cmd)

# # check if parameter_for_net.txt, or TPM.txt is updated, if yes, create edges.
# updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict) 
# if ('parameter_for_net.txt' in updated_file_list or 'TPM.txt' in updated_file_list) and not hold_on(PARAMETER_FOR_NET):
#     write_log_file('Create edges.txt using new TPM.txt (size=%d) ...' % (number_rnaseq_id(TPM_FILE)), UPDATE_NETWORK_LOG_FILE)
#     time.sleep(7200) # wait one hour for the previous create_edges4.py (if any) to finish creating JSON_DIR and target_tf_fname
#     cmd = 'nohup python3 create_edges4.py %s &' % (PARAMETER_FOR_NET)  # put process to background
#     os.system(cmd)
#     time.sleep(60)


# remove .R files in ../Data/temp. Files older than 3 days will be removed
# (the command deletes *.R files under TEMP_DIR that match find's -mtime +2)
cmd = 'find %s -mtime +2 -name \"*.R\" -delete' % (TEMP_DIR)
os.system(cmd)

# update time stamp file, so the next run compares against the state left by this run
record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)

# final log entry marking the end of this run
write_log_file('[update_network.py] Update done at %s.\n\n' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S')), UPDATE_NETWORK_LOG_FILE)