summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHui Lan <lanhui@zjnu.edu.cn>2024-08-06 16:06:47 +0800
committerHui Lan <lanhui@zjnu.edu.cn>2024-08-06 16:06:47 +0800
commit173161a747ab2abb4a0d2a7dddecb39ddc9a4589 (patch)
treeb911ac05f48ea815bd4ae700e7216da126bbe14b
parent2f1b7bc8d9542e3d65007b6658691ae9e5c5e870 (diff)
update_network.py: delete the 'merge edge' stuff because this is the job of update_network_by_force.py
-rwxr-xr-xCode/#update_network.py#828
l---------Code/.#update_network.py1
-rwxr-xr-xCode/update_network.py74
3 files changed, 0 insertions, 903 deletions
diff --git a/Code/#update_network.py# b/Code/#update_network.py#
deleted file mode 100755
index 4e46fdc..0000000
--- a/Code/#update_network.py#
+++ /dev/null
@@ -1,828 +0,0 @@
-#! /usr/bin/python3
-# Usage: python3 update_network.py
-# Put this script under directory Code/.
-# IMPORTANT: Run this script under directory Code/.
-# Execute the above command regularly, or
-# Cron job this command to make it run everyday at 5am:
-#
-# 1. crontab -e.
-# 2. Add this line: 01 05 * * * cd /home/hui/network/v03/Code && python3 update_network.py
-#
-# IMPORTANT: Make sure that you execute this script (update_network.py) under the directory Code.
-#
-# Purpose: periodically (e.g., per week) run this script to see if the network needs update. If yes, update it.
-#
-# Set HOLDON=NO in parameter_for_buildCmatrix.txt,
-# parameter_for_buildRmatrix.txt and parameter_for_net.txt to make
-# changes in these file effective.
-#
-# parameter_for_buildRmatrix.txt will be updated automatically (I
-# hope). However, we need to update parameter_for_buildCmatrix.txt
-# manually.
-#
-# Revision history:
-#
-# Last modified: 26 Feb 2017
-# Last modified: 17 Mar 2017
-# Last modified: 04 Apr 2017
-# Last modified: 05 Apr 2017
-# Last modified: 10 Apr 2017
-# Last modified: 19 Apr 2017
-# Last modified: 20 Apr 2017 [addded create_edges0B.py which calls correlation_per_tissue.R]
-# Last modified: 21 Jun 2017 [added correlation_per_group.R and wedge.R]
-# Last modified: 30 Jun 2017 [added get_sample_size so that we have sample size for correlations of type all, added in ll_dict ]
-# Last modified: 23 Jan 2018 [edited a few print-out messages]
-# Last modified: 25 Jan 2018 [updated function compute_metric(), set S=365.0 and modified return statement]
-# Last modified: 24 Aug 2018 [updated function from get_sample_size(d, sorted_keys, day) to get_sample_size(d, sorted_keys, day, rcond_string)]
-# Last modified: 03 Feb 2019
-# Last modified: 08 Aug 2019, hui
-# Last modified: 10 Aug 2019, hui <lanhui@zjnu.edu.cn>
-# Last modified: 23 Aug 2019, hui <lanhui@zjnu.edu.cn> [correlation_mixtools(num_component)]
-# Last modified: 10 Sep 2019, hui <lanhui@zjnu.edu.cn> [correlation_mixtools, check the previous R session has finished before starting a new one.]
-
-import os, sys
-import numpy as np
-import glob
-import time
-import subprocess
-from datetime import datetime
-from param4net import make_global_param_dict, get_key_value
-from log import write_log_file
-from configure import HISTORY_DIR, HISTORY_DIR2, FILE_TIMESTAMP, SAMPLE_SIZE_FILE, TEMP_DIR, \
- PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_BUILDRMATRIX, \
- PARAMETER_FOR_NET, PARAMETER_FOR_NET_TRAVADB_STRESS, PARAMETER_FOR_NET_TRAVADB_MAP, PARAMETER_FOR_NET_MILD_DROUGHT, PARAMETER_FOR_NET_WIGGELAB_DIURNAL, \
- BINDING_FILE, TPM_FILE, \
- BUILDRMATRIX_RENEW_INTERVAL, MIN_RNA_SEQ_INCREASE, UPDATE_NETWORK_LOG_FILE, NEW_OR_UPDATED_CHIP_FILE, \
- RNA_SEQ_INFO_DATABASE, RNA_SEQ_INFO_DATABASE_JSON, GENE_ID_FIRST_TWO_LETTERS, MEMORY_STRENGTH, \
- MAPPED_RDATA_DIR, MAPPED_CDATA_DIR, \
- EDGE_POOL_DIR, MERGED_EDGE_FILE, \
- TARGET_TF_FILE
-
-
-
-## Helper functions
-
-def get_value(s, delimit):
- lst = s.split(delimit, 1) # only split at the first delimit
- return lst[1].strip()
-
-
-def validate_webapp_dir(para_for_net):
- ''' Make sure this function is executed under the directory Code. '''
- glb_param_dict = make_global_param_dict(para_for_net)
- # if genes.json is not present, create one
- if not os.path.exists('../Webapp/static/json/genes.json'):
- print('[update_network.py]: cannot find genes.json, make one ...')
- cmd = 'python3 text2json.py %s > ../Webapp/static/json/genes.json' % (glb_param_dict['GENE_ID_AND_GENE_NAME'])
- os.system(cmd)
-
-
-def make_paths(s):
- if not os.path.isdir(s):
- os.makedirs(s)
-
-
-def make_important_dirs():
- make_paths('../Data/history/edges/many_targets')
- make_paths('../Data/history/edges/one_target')
- make_paths('../Data/log')
- make_paths('../Data/information')
- make_paths('../Data/temp')
- make_paths('../Data/upload')
- make_paths('../Data/parameter')
- make_paths('../Data/R/Mapped')
- make_paths('../Data/R/Mapped/public')
- make_paths('../Data/R/Mapped/inhouse')
- make_paths('../Data/R/Mapped/other')
- make_paths('../Data/R/Raw')
- make_paths('../Data/C/Mapped')
- make_paths('../Data/C/Raw')
- make_paths('../Data/history/edges')
- make_paths(EDGE_POOL_DIR)
- make_paths('../Data/history/bind')
- make_paths('../Data/history/expr')
- make_paths('../Webapp/static/json')
- make_paths('../Webapp/static/edges')
- make_paths('../Webapp/templates')
-
-
-def num_line(fname):
- ''' Return number of lines in file fname. '''
- if not os.path.exists(fname):
- return 0
- f = open(fname)
- lines = f.readlines()
- f.close()
- return len(lines)
-
-
-def num_ids(fname):
- ''' Return number of IDs in fname. '''
- f = open(fname)
- lines = f.readlines()
- f.close()
- return len(lines[0].split('\t')) - 1
-
-
-def write_sample_size_file(sample_size_file, curr_date, tpm_sample_size):
- if not os.path.exists(sample_size_file):
- f = open(sample_size_file, 'w')
- else:
- f = open(sample_size_file, 'a')
- f.write('%s\t%s\n' % (curr_date, tpm_sample_size))
- f.close()
-
-
-def age_of_file_in_days(fname):
- ''' Return age of fname in days. '''
- st = os.stat(fname)
- days = (time.time() - st.st_mtime)/(3600*24.0)
- return days
-
-
-def age_of_file_in_seconds(fname):
- ''' Return age of fname in days. '''
- st = os.stat(fname)
- seconds = time.time() - st.st_mtime
- return seconds
-
-
-def hold_on(fname):
- f = open(fname)
- lines = f.readlines()
- f.close()
- for line in lines[:100]: # check the first 100 lines for HOLDON
- line = line.strip()
- if line.startswith('%%HOLDON=YES'):
- return True
- return False
-
-
-def all_files_present(lst):
- missing_file_lst = []
- for path in lst: # lst is a list of file names to check
- if not os.path.exists(path):
- if 'edges.txt' in path:
- write_log_file('[update_network.py] WARNING: must have %s to update network. Call create_edges*.py to create edge files.' % (path), UPDATE_NETWORK_LOG_FILE)
- missing_file_lst.append(path)
- return missing_file_lst
-
-
-def record_file_time(lst, fname):
- '''
- lst - a list of files
- fname - a recorder file
- '''
- f = open(fname, 'w')
- s = ''
- for x in lst:
- if os.path.exists(x):
- s += '%s\t%d\n' % (os.path.basename(x), int(os.stat(x).st_mtime))
- else:
- s += '%s\t%d\n' % (os.path.basename(x), 0)
- f.write(s)
- f.close()
-
-
-def read_file_timestamp(ftimestamp):
- d = {}
- f = open(ftimestamp)
- for line in f:
- line = line.strip()
- lst = line.split()
- fname = lst[0]
- t = lst[1]
- d[fname] = int(t)
-
- f.close()
- return d
-
-
-def file_updated(fname, d):
- ft = int(os.stat(fname).st_mtime)
- k = os.path.basename(fname)
- return ft > d[k]
-
-
-def get_updated_files(lst, d):
- result = []
- for x in lst:
- if file_updated(x, d):
- result.append(os.path.basename(x))
- return result
-
-
-def get_sample_size(d, sorted_keys, day, rcond_string):
-
- if rcond_string.isdigit():
- return int(rcond_string)
-
- if len(d) == 0:
- return 1200 # a default number of sample size, CHANGE
-
- for x in sorted_keys:
- if x >= day:
- return d[x]
-
- k = sorted_keys[-1] # last key, latest date
- return d[k]
-
-
-def number_rnaseq_id(tpm_file):
- f = open(tpm_file)
- first_line = f.readlines()[0]
- f.close()
- first_line = first_line.strip()
- return len(first_line.split()) - 1
-
-
-def number_rnaseq_diff(para_file, tpm_file):
- ''' count the number @ in para_file, and count the number of columns in tpm_file, return their difference '''
- a = 0
- f = open(para_file)
- for line in f:
- line = line.strip()
- if line.startswith('@'):
- a += 1
- f.close()
-
- b = number_rnaseq_id(tpm_file)
-
- return a - b
-
-
-def validate_gene_file(fname):
- f = open(fname)
- lines = f.readlines()
- f.close()
- for line in lines: # check all lines
- line = line.strip()
- lst = line.split('\t')
- if len(lst) < 6:
- print('[update_network.py]:Not enought fields: %s. Only %d are given. Each line must have gene_id, gene_name, chr, start, end, strand, description (optional). See prepare_gene_file.py in the documentation on how to prepare this file.' % (line, len(lst)))
- sys.exit()
-
-
-def validate_parameter_for_buildcmatrix(fname):
- # first the file must exist
- if not os.path.exists(fname):
- print('[update_network.py]:CANNOT FIND %s.' % (fname))
- sys.exit()
- f = open(fname)
- lines = f.readlines()
- f.close()
- d = {}
- location_count = 0
- for line in lines:
- line = line.strip()
- if line.startswith('%%'):
- k, v = get_key_value(line[2:])
- d[k] = v
- if k == 'GENE_FILE' or k == 'CHR_INFO':
- if not os.path.exists(v):
- print('[update_network.py]:%s not exists.' % (v))
- sys.exit()
- if k == 'GENE_FILE':
- validate_gene_file(v)
- if k == 'DESTINATION':
- if not os.path.isdir(v):
- print('[update_network.py]:%s not exists.' % (v))
- sys.exit()
- if k == 'TARGET_RANGE':
- if int(v) <= 0:
- print('[update_network.py]:Target range (%d) must be greater than 0.' % (v))
- sys.exit()
- if line.startswith('LOCATION:'):
- v = get_value(line, ':')
- location_count += 1
- if not os.path.exists(v):
- print('[Warning] update_network.py: Location %s does not exists.' % (v))
- #sys.exit()
-
- if not 'GENE_FILE' in d:
- print('[update_network.py]:Must specify GENE_FILE.')
- sys.exit()
- if not 'DESTINATION' in d:
- print('[update_network.py]:Must specify DESTINATION.')
- sys.exit()
- if not 'CHR_INFO' in d:
- print('[update_network.py]:Must specify CHR_INFO.')
- sys.exit()
- if location_count == 0:
- print('[update_network.py]:Must contain at least one ChIP-seq.')
- sys.exit()
-
-
-def validate_parameter_for_buildrmatrix(fname):
- # first the file must exist
- if not os.path.exists(fname):
- print('[update_network.py]:CANNOT FIND %s.' % (fname))
- sys.exit()
- f = open(fname)
- lines = f.readlines()
- f.close()
- d = {}
- location_count = 0
- for line in lines:
- line = line.strip()
- if line.startswith('%%'):
- k, v = get_key_value(line[2:])
- d[k] = v
- if k == 'GENE_LIST':
- if not os.path.exists(v):
- print('[update_network.py]:%s not exists.' % (v))
- sys.exit()
- if line.startswith('LOCATION:'):
- v = get_value(line, ':')
- location_count += 1
- if not os.path.exists(v):
- print('[update_network.py]:Location %s does not exists.' % (v))
- #sys.exit()
-
- if not 'GENE_LIST' in d:
- print('[update_network.py]:Must specify GENE_LIST.')
- sys.exit()
- if location_count == 0:
- print('[update_network.py]:Must contain at least one RNA-seq.')
- sys.exit()
-
-
-def validate_parameter_for_net(fname):
- # first the file must exist
- if not os.path.exists(fname):
- print('[update_network.py]:CANNOT FIND %s.' % (fname))
- sys.exit()
- f = open(fname)
- lines = f.readlines()
- f.close()
- d = {}
- location_count = 0
- for line in lines:
- line = line.strip()
- if line.startswith('%%'):
- k, v = get_key_value(line[2:])
- d[k] = v
- if k == 'GENE_LIST':
- if not os.path.exists(v):
- print('[update_network.py]:%s not exists.' % (v))
- sys.exit()
- if k == 'GENE_ID_AND_GENE_NAME':
- if not os.path.exists(v):
- print('[update_network.py]:%s not exists.' % (v))
- sys.exit()
- if k == 'BINDING_INFO':
- if not os.path.exists(v):
- print('[update_network.py]:%s not exists.' % (v))
- sys.exit()
- if k == 'EXPRESSION_INFO':
- if not os.path.exists(v):
- print('[update_network.py]:%s not exists.' % (v))
- sys.exit()
- if k == 'BINDING_MATRIX':
- if not os.path.exists(v):
- print('[update_network.py]:%s not exists.' % (v))
- print('[update_network.py]:Use python3 buildCmatrix.py paramter_for_buildCmatrix.txt > binding.txt to create binding.txt.')
- if k == 'EXPRESSION_MATRIX':
- if not os.path.exists(v):
- print('[update_network.py]:%s not exists.' % (v))
- print('[update_network.py]:Use python3 buildRmatrix.py paramter_for_buildRmatrix.txt to create TPM.txt.')
-
- if not 'GENE_LIST' in d:
- print('[update_network.py]:Must specify GENE_FILE.')
- sys.exit()
- if not 'GENE_ID_AND_GENE_NAME' in d:
- print('[update_network.py]:Must specify GENE_ID_AND_GENE_NAME.')
- sys.exit()
- if not 'BINDING_INFO' in d:
- print('[update_network.py]:Must specify BINDING_INFO.')
- sys.exit()
- if not 'EXPRESSION_INFO' in d:
- print('[update_network.py]:Must specify EXPRESSION_INFO.')
- sys.exit()
- if not 'BINDING_MATRIX' in d:
- print('[update_network.py]:%s not exists.' % (v))
- print('[update_network.py]:Use python3 buildCmatrix.py paramter_for_buildCmatrix.txt > binding.txt to create binding.txt.')
- if not 'EXPRESSION_MATRIX' in d:
- print('[update_network.py]:%s not exists.' % (v))
- print('[update_network.py]:Use python3 buildRmatrix.py paramter_for_buildRmatrix.txt to create TPM.txt.')
-
-
-
-def need_update_parameter_file(param_file, dirs):
- ''' Make sure param_file is consistent with dirs (a list of directories to check against). '''
- result = []
-
- files_in_parameter = {}
- f = open(param_file)
- for line in f:
- line = line.strip()
- if line.startswith('LOCATION:'):
- lst = line.split(':')
- k = os.path.abspath(lst[1])
- files_in_parameter[k] = 1
- f.close()
- param_modification_time = os.path.getmtime(param_file)
-
- files_in_dirs = {}
- for directory in dirs:
- for root, dirnames, filenames in os.walk(os.path.abspath(directory)):
- for filename in filenames:
- k = os.path.join(root, filename)
- files_in_dirs[k] = 1
- if 'narrowPeak' in k or '_quant' in k:
- if not k in files_in_parameter and os.path.getmtime(k) > param_modification_time:
- result.append('%s is not in %s' % (k, param_file))
-
- return result
-
-
-
-def validate_binding_file(fname):
- f = open(fname)
- lines = f.readlines()
- for line in lines:
- line = line.strip()
- if 'buildCmatrix: ChIP-seq ID list is empty.' in line:
- return False
- f.close()
- return True
-
-
-def lines_with_10_fields(s):
- result = []
- for line in s.split('\n'):
- line = line.strip()
- if len(line.split('\t')) == 10:
- result.append(line)
- return result
-
-
-def concatenate_edge_files(fname_lst, fname_out):
- fout = open(fname_out, 'w')
- for fname in fname_lst:
- f = open(fname)
- s = f.read()
- f.close()
- # Make sure each edge has 10 fields before writing.
- lines = lines_with_10_fields(s)
- if lines != []:
- write_log_file('[update_network.py] In function concatenate_edge_files. File %s has %d rows with 10 columns.' % (fname, len(lines)), UPDATE_NETWORK_LOG_FILE)
- fout.write('\n'.join(lines) + '\n')
- else:
- write_log_file('[update_network.py] In function concatenate_edge_files. Check file %s. It has no rows with 10 fields.' % (fname), UPDATE_NETWORK_LOG_FILE)
- fout.close()
-
-
-def delete_edge_files(fname_lst):
- for fname in fname_lst:
- # Before we delete, we should make sure it is not being written. Make sure it is old enough. Otherwise, don't delete.
- if age_of_file_in_seconds(fname) > 12*60*60: # 10 minutes
- os.remove(fname)
- else:
- write_log_file('[update_network.py] In function delete_edge_files. Check file %s. It is probably still being written. So I don\'t delete it.' % (fname), UPDATE_NETWORK_LOG_FILE)
-
-
-def create_edges0():
- if os.path.exists(PARAMETER_FOR_NET):
- write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET), UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET)
- os.system(cmd)
-
- # The following commands are optional. For example, if a user wants to run it locally, he don't have to provide these TPM tables.
- if os.path.exists(PARAMETER_FOR_NET_TRAVADB_STRESS):
- #write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_TRAVADB_STRESS), UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_TRAVADB_STRESS)
- #os.system(cmd)
-
- if os.path.exists(PARAMETER_FOR_NET_TRAVADB_MAP):
- #write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_TRAVADB_MAP), UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_TRAVADB_MAP)
- #os.system(cmd)
-
- if os.path.exists(PARAMETER_FOR_NET_MILD_DROUGHT):
- #write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_MILD_DROUGHT), UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_MILD_DROUGHT)
- #os.system(cmd)
-
- if os.path.exists(PARAMETER_FOR_NET_WIGGELAB_DIURNAL):
- #write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_WIGGELAB_DIURNAL), UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_WIGGELAB_DIURNAL)
- #os.system(cmd)
-
-
-def create_edges0B():
- if os.path.exists(PARAMETER_FOR_NET):
- write_log_file('[update_network.py] Create tissue-specific edges.txt using new binding.txt (size=%d). create_edges0B.py' % (num_ids(BINDING_FILE)), UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 create_edges0B.py %s' % (PARAMETER_FOR_NET) # call correlation_per_tissue.R
- os.system(cmd)
-
-
-def wedge():
- if os.path.exists(PARAMETER_FOR_NET):
- write_log_file('[update_network.py] Create edges using wedge shapes. wedge.R', UPDATE_NETWORK_LOG_FILE)
- cmd = 'Rscript wedge.R'
- os.system(cmd)
-
-
-def correlation_per_group():
- # For 3,130 RNA-seq samples and 30,000 pairs, need at least 10 hours.
- if os.path.exists(PARAMETER_FOR_NET):
- write_log_file('[update_network.py] Create group-specific edges.txt using new TPM.txt (size=%d). correlation_per_group.R' % (number_rnaseq_id(TPM_FILE)), UPDATE_NETWORK_LOG_FILE)
- cmd = 'Rscript correlation_per_group.R'
- os.system(cmd)
-
-
-def correlation_per_group_fixed_number():
- if os.path.exists(PARAMETER_FOR_NET):
- write_log_file('[update_network.py] Create group-specific (fixed) edges.txt using new TPM.txt (size=%d). correlation_per_group_fixed_number.R' % (number_rnaseq_id(TPM_FILE)), UPDATE_NETWORK_LOG_FILE)
- cmd = 'Rscript correlation_per_group_fixed_number.R'
- os.system(cmd)
-
-
-def correlation_mixtools(num_component):
- if os.system('pidof R') != 0: # since it take long time (several days) to run create_edges_mixtool.R, so we make sure the previous R computing has finished before we start a new one. os.system returns 0 if R is running.
- write_log_file('[update_network.py] Create edges.txt using TPM.txt (size=%d). create_edges_mixtool.R with %d components.' % (number_rnaseq_id(TPM_FILE), num_component), UPDATE_NETWORK_LOG_FILE)
- cmd = 'Rscript create_edges_mixtool.R %d' % (num_component)
- os.system(cmd)
-
-
-def check_rnaseq_info():
- # check rnaseq_info_database.txt and rnaseq_info_database.json. If they are outdated, then remind us to update it in log file.
- if os.path.exists(RNA_SEQ_INFO_DATABASE):
- if age_of_file_in_days(RNA_SEQ_INFO_DATABASE) > 120: # older than 120 days
- write_log_file('[update_network.py] Need update %s. It is %d days old.' % (RNA_SEQ_INFO_DATABASE, age_of_file_in_days(RNA_SEQ_INFO_DATABASE)), UPDATE_NETWORK_LOG_FILE)
- else:
- write_log_file('[update_network.py] [MISSING] Must create %s.' % (RNA_SEQ_INFO_DATABASE), UPDATE_NETWORK_LOG_FILE)
-
- if os.path.exists(RNA_SEQ_INFO_DATABASE_JSON):
- if age_of_file_in_days(RNA_SEQ_INFO_DATABASE_JSON) > 120:
- write_log_file('[update_network.py] Need update %s. It is %d days old.' % (RNA_SEQ_INFO_DATABASE_JSON, age_of_file_in_days(RNA_SEQ_INFO_DATABASE_JSON)), UPDATE_NETWORK_LOG_FILE)
- else:
- write_log_file('[update_network.py] [MISSING] Must create %s.' % (RNA_SEQ_INFO_DATABASE_JSON), UPDATE_NETWORK_LOG_FILE)
-
-
-# main
-FILE_LIST_TO_CHECK = [PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_BUILDRMATRIX, PARAMETER_FOR_NET, \
- MERGED_EDGE_FILE, BINDING_FILE, TPM_FILE] # a list of important files
-
-make_important_dirs() # make important directories (if non-existent) for holding various kinds of files, must be put after os.chdir(CODE_DIR)
-#validate_webapp_dir(PARAMETER_FOR_NET) # make sure the directory Webapp contains necessary files, e.g., genes.json.
-
-check_rnaseq_info() # rnaseq informtion is useful for displaying scatterplots
-
-# Make sure all necessary files are present, if not, make them if possible
-miss_lst = all_files_present(FILE_LIST_TO_CHECK) # check if any of them are missing
-if miss_lst != []: # miss_lst is non-empty in the beginning.
- print('These mandatory files are missing: %s.\nPrepare them first.' % (' '.join(miss_lst)))
- write_log_file('[update_network.py] Cannot find these required files:%s' % (' '.join(miss_lst)), UPDATE_NETWORK_LOG_FILE)
-
- # initially, we (at most) only have three parameter files, no binding.txt, TPM.txt or edges.txt ...
- important_miss_number = 0
- if PARAMETER_FOR_BUILDCMATRIX in miss_lst:
- print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_BUILDCMATRIX))
- important_miss_number += 1
-
- if PARAMETER_FOR_BUILDRMATRIX in miss_lst:
- print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_BUILDRMATRIX))
- important_miss_number += 1
-
- if PARAMETER_FOR_NET in miss_lst:
- print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_NET))
- important_miss_number += 1
-
- if important_miss_number > 0:
- sys.exit() # need to provide all the above three files; otherwise cannot proceed
-
- if BINDING_FILE in miss_lst:
- print('[update_network.py]: make initial binding.txt ... wait')
- write_log_file('[update_network.py] Make initial binding.txt', UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 get_binding.py %s' % (PARAMETER_FOR_BUILDCMATRIX)
- #os.system(cmd)
- cmd = 'python3 buildCmatrix.py %s > %s' % (PARAMETER_FOR_BUILDCMATRIX, BINDING_FILE)
- #os.system(cmd)
- print('[update_network.py]: IMPORATNT: make sure BINDING_MATRIX in %s was set %s and rerun update_network.py.' % (PARAMETER_FOR_NET, BINDING_FILE))
- sys.exit()
-
- if TPM_FILE in miss_lst:
- print('[update_network.py]: make initial TPM.txt ... wait')
- write_log_file('[update_network.py] Make initial TPM.txt', UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 buildRmatrix.py %s' % (PARAMETER_FOR_BUILDRMATRIX) # produce TPM.txt
- os.system(cmd)
- print('[update_network.py]:IMPORTANT: make sure EXPRESSION_MATRIX in %s was set %s and rerun update_network.py.' % (PARAMETER_FOR_NET, TPM_FILE))
- sys.exit()
-
- miss_lst2 = all_files_present(FILE_LIST_TO_CHECK) # check files again
- if len(miss_lst2) == 1 and miss_lst2[0] == MERGED_EDGE_FILE: # all other files are ready except edges.txt, make one.
- print('[update_network.py]: make initial edges.txt ... wait')
- create_edgeds0()
-
-
-# Make json2 (sliced binding.txt) if it does not exist. Copy json2 to
-# the web application folder static/edges [do it manually] for displaying
-# binding strength plots.
-if not os.path.isdir('../Data/history/bind/json2') and os.path.exists(BINDING_FILE):
- write_log_file('Make directory ../Data/history/bind/json2. Don\'t forget to copy json2 to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET)
- os.system(cmd)
-
-
-# Make json (sliced TPM.txt) if it does not exist. Copy json to the
-# web application folder static/edges [manual] for displaying gene
-# expression scatterplots.
-if not os.path.isdir('../Data/history/expr/json') and os.path.exists(TPM_FILE):
- write_log_file('Make directory ../Data/history/expr/json. Don\'t forget to copy json to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 slice_TPM_to_JSON.py %s' % (PARAMETER_FOR_NET)
- os.system(cmd)
-
-
-# Make sure parameter files are present and valid (rudimentary check but important)
-validate_parameter_for_buildcmatrix(PARAMETER_FOR_BUILDCMATRIX)
-validate_parameter_for_buildrmatrix(PARAMETER_FOR_BUILDRMATRIX)
-validate_parameter_for_net(PARAMETER_FOR_NET)
-
-
-# If the file timestamp does not exist, create one
-if not os.path.exists(FILE_TIMESTAMP):
- record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)
-
-# Get update time of mandatory files
-timestamp_dict = read_file_timestamp(FILE_TIMESTAMP)
-
-
-
-################## binding.txt stuff #####################################
-# Check parameter_for_buildCmatrix.txt
-updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
-if 'parameter_for_buildCmatrix.txt' in updated_file_list and not hold_on(PARAMETER_FOR_BUILDCMATRIX):
- write_log_file('[update_network.py] Parameter file %s has been updated.' % (PARAMETER_FOR_BUILDCMATRIX), UPDATE_NETWORK_LOG_FILE)
- write_log_file('[update_network.py] Make binding column files', UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 get_binding.py %s' % (PARAMETER_FOR_BUILDCMATRIX) # won't re-compute existing binding columns unless updated
- os.system(cmd)
-
-
-updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
-if 'binding.txt' in updated_file_list:
- write_log_file('[update_network.py] binding.txt has been updated. This update will take effect next time TPM.txt is updated.', UPDATE_NETWORK_LOG_FILE)
- # create_edges0()
- # create_edges0B()
- # wedge()
- # correlation_per_group()
- # correlation_per_group_fixed_number()
- # correlation_mixtools(2)
- # correlation_mixtools(3)
-
- ## TODO mixtool stuff, forget it for now.
- #cmd = 'nohup python3 create_edges4.py %s &' % (temp_file_name)
- #os.system(cmd)
-
-
-
-
-################## TPM.txt stuff #####################################
-
-# update parameter_for_buildRmatrix.txt periodically and automatically.
-if datetime.now().day % BUILDRMATRIX_RENEW_INTERVAL == 0: # check if need to update parameter_for_buildRmatrix.txt bi-weekly
- curr_time = datetime.now().strftime('%Y%m%d%H%M')
- new_parameter_file = '../Data/temp/parameter_for_buildRmatrix.%s' % (curr_time)
- cmd = 'python3 make_parameter_rnaseq.py > %s' % (new_parameter_file) # new_parameter_file will not be updated unless download_and_map.py has finished.
- os.system(cmd)
- num = number_rnaseq_diff(new_parameter_file, TPM_FILE)
- if num >= MIN_RNA_SEQ_INCREASE: # sufficient number of RNA-seq samples have been added
- write_log_file('[update_network.py] Update %s' % (PARAMETER_FOR_BUILDRMATRIX), UPDATE_NETWORK_LOG_FILE)
- cmd = 'cp %s %s' % (new_parameter_file, PARAMETER_FOR_BUILDRMATRIX)
- os.system(cmd)
- else:
- write_log_file('[update_network.py] You have downloaded %d RNA-seq since last build of TPM.txt. TPM.txt will be rebuilt if this number reaches %d.' % (num, MIN_RNA_SEQ_INCREASE), UPDATE_NETWORK_LOG_FILE)
-
-
-# Check if parameter_for_buildRmatrix.txt has been updated
-updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
-# TODO To simplify things, I will provide TPM.txt directly. So set the
-# HOLDON option to YES in parameter_for_buildRmatrix.txt to prevent
-# the following from being True.
-if 'parameter_for_buildRmatrix.txt' in updated_file_list and not hold_on(PARAMETER_FOR_BUILDRMATRIX):
- write_log_file('[update_network.py] Parameter file %s has been updated.' % (PARAMETER_FOR_BUILDRMATRIX), UPDATE_NETWORK_LOG_FILE)
- write_log_file('[update_network.py] Rebuild TPM.txt ...', UPDATE_NETWORK_LOG_FILE)
- curr_time = datetime.now().strftime('%Y%m%d%H%M%S')
- if os.path.exists(TPM_FILE):
- backup_file_name = '../Data/history/expr/TPM.txt.backup.at.%s' % (curr_time)
- cmd = 'cp %s %s' % (TPM_FILE, backup_file_name)
- os.system(cmd)
- cmd = 'gzip %s' % (backup_file_name)
- os.system(cmd)
-
- cmd = 'python3 buildRmatrix.py %s' % (PARAMETER_FOR_BUILDRMATRIX) # produce TPM.txt, whose location is specified in TPM_TABLE in buidlRmatrix.py
- os.system(cmd)
-
- curr_date = datetime.now().strftime('%Y%m%d')
- tpm_sample_size = number_rnaseq_id(TPM_FILE)
- write_sample_size_file(SAMPLE_SIZE_FILE, curr_date, tpm_sample_size)
-
-
-
-# Create edges using all RNA-seq experiments
-updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
-if 'TPM.txt' in updated_file_list: # we could _touch_ TPM.txt to make it recent. We will recompute edges using the full binding.txt.
- # Make a full binding.txt since we are going to use the new TPM.txt to recompute all edges
- write_log_file('[update_network.py] Build full binding matrix for the new TPM.txt.', UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 buildCmatrix.py %s include-all > %s' % (PARAMETER_FOR_BUILDCMATRIX, BINDING_FILE) # include all ChIP-seq IDs. Pay attention to include-all in the command-line argument.
- os.system(cmd)
-
- # target_tf.txt
- write_log_file('[update_network.py] Make target_tf.txt.', UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 make_target_tf.py %s > %s' % (PARAMETER_FOR_NET, TARGET_TF_FILE)
- os.system(cmd)
-
- write_log_file('[update_network.py] Update ../Data/history/expr/json using the new TPM.txt. Don\'t forget to update the static/edges/json folder in the web application.', UPDATE_NETWORK_LOG_FILE)
- ## json -- make/renew json directory for displaying scatterplots
- cmd = 'python3 slice_TPM_to_JSON.py %s' % (PARAMETER_FOR_NET)
- ## os.system(cmd) # turn this on if we are going to use this TPM.txt for displaying scatterplots
- write_log_file('[update_network.py] Update directory ../Data/history/bind/json2. Don\'t forget to copy json2 to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET)
- #os.system(cmd) # turn this on if we are going to use this binding.txt for displaying bar charts of binding strengths
- ## copy ../Data/history/bind/json2 and ../Data/history/expr/json to the web application folder 'static/edges' [manual]
-
- if False: # TODO For now I will always use travadb's TPM.txt (138 columns) to display scatterplots. Simpler and faster.
- write_log_file('Assign tissue, refine tissue and update rnaseq_info_database.json', UPDATE_NETWORK_LOG_FILE)
- os.environ["PYTHONIOENCODING"] = "UTF-8" # for non-ascii letters in ENA RNA-sample description. If this statement does not work, try 'export PYTHONIOENCODING=UTF-8' in the command line instead. The export command can be put in crontab -e before running this script
- cmd = 'python3 assign_tissue.py'
- os.system(cmd)
- cmd = 'python3 refine_tissue.py > ../Data/information/experiment.and.tissue.2.txt'
- os.system(cmd)
- cmd = 'python3 update_rnaseq_info_json.py'
- os.system(cmd)
-
-
-
- # Compute edges. This could take a lot of time so update FILE_TIMESTAMP first.
- record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)
- create_edges0()
- create_edges0B()
- wedge()
- correlation_per_group()
- correlation_per_group_fixed_number()
- correlation_mixtools(2) # two components
- #correlation_mixtools(3)
-
-
-########## Merge edges #######################
-# update edges.txt, a merged file from two sources, HISTORY_DIR and HISTORY_DIR2. Some new edge files are being generated ...
-time.sleep(5)
-edge_file_lst = [] # collect edge files.
-most_recent_edge_modification_time = 0
-write_log_file('[update_network.py] Look at edge files in %s.' % (HISTORY_DIR), UPDATE_NETWORK_LOG_FILE)
-for fname in glob.glob(os.path.join(HISTORY_DIR, 'edges.txt.*')): # many small edges.txt.* are to be merged
- edge_file_lst.append(fname)
- if os.path.getmtime(fname) > most_recent_edge_modification_time:
- most_recent_edge_modification_time = os.path.getmtime(fname)
-
-write_log_file('[update_network.py] Look at edge files in %s.' % (HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
-for fname in glob.glob(os.path.join(HISTORY_DIR2, 'edges.txt.*')): # edges.txt.* are to be merged
- edge_file_lst.append(fname)
- if os.path.getmtime(fname) > most_recent_edge_modification_time:
- most_recent_edge_modification_time = os.path.getmtime(fname)
-
-
-if edge_file_lst == []:
- write_log_file('[update_network.py] No edge files to merge in %s and %s.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
-elif os.path.getmtime(MERGED_EDGE_FILE) < most_recent_edge_modification_time: # update edges.txt only if there are newer edges to add.
- # concatenate edge files into one
- write_log_file('[update_network.py] Concatenate edge files in %s and %s into one file.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
- curr_time = datetime.now().strftime('%Y%m%d_%H%M')
- concatenate_edge_files(edge_file_lst, os.path.join(EDGE_POOL_DIR, 'edges.txt.many.one.targets.' + curr_time))
- delete_edge_files(edge_file_lst)
-
-if os.path.getmtime(MERGED_EDGE_FILE) < os.path.getmtime(EDGE_POOL_DIR): # edge pool directory has been updated, create new edges.txt
- write_log_file('[update_network.py] Make a new edges.txt from edge files in %s.' % (EDGE_POOL_DIR), UPDATE_NETWORK_LOG_FILE)
- write_log_file('[update_network.py] Number of lines in the old edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 merge_edges.py'
- os.system(cmd)
- write_log_file('[update_network.py] Number of lines in the new edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
- manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt sudo cp /home/lanhui/brain/Data/temp/html_edges/edges.sqlite /var/www/brain/brain/static/edges'
- write_log_file('[update_network.py] %s' % (manual_copy_commands), UPDATE_NETWORK_LOG_FILE)
-
-
-# exclude edges as suggested by Phil Wigge.
-# write_log_file('Exclude edges (now ineffective)', UPDATE_NETWORK_LOG_FILE)
-# cmd = 'python3 exclude_edges.py %s' % (EDGE_FILE)
-#os.system(cmd)
-
-# # check if parameter_for_net.txt, or TPM.txt is updated, if yes, create edges.
-# updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
-# if ('parameter_for_net.txt' in updated_file_list or 'TPM.txt' in updated_file_list) and not hold_on(PARAMETER_FOR_NET):
-# write_log_file('Create edges.txt using new TPM.txt (size=%d) ...' % (number_rnaseq_id(TPM_FILE)), UPDATE_NETWORK_LOG_FILE)
-# time.sleep(7200) # wait one hour for the previous create_edges4.py (if any) to finish creating JSON_DIR and target_tf_fname
-# cmd = 'nohup python3 create_edges4.py %s &' % (PARAMETER_FOR_NET) # put process to background
-# os.system(cmd)
-# time.sleep(60)
-
-
-# remove .R files in ../Data/temp. Files older than 3 days will be removed
-cmd = 'find %s -mtime +2 -name \"*.R\" -delete' % (TEMP_DIR)
-os.system(cmd)
-
-# update time stamp file
-record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)
-
-write_log_file('[update_network.py] Update done at %s.\n\n' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S')), UPDATE_NETWORK_LOG_FILE)
-
diff --git a/Code/.#update_network.py b/Code/.#update_network.py
deleted file mode 120000
index 66ec7a4..0000000
--- a/Code/.#update_network.py
+++ /dev/null
@@ -1 +0,0 @@
-lanhui@VM-0-14-ubuntu.13592:1688234417 \ No newline at end of file
diff --git a/Code/update_network.py b/Code/update_network.py
index d5aed9b..7b26f58 100755
--- a/Code/update_network.py
+++ b/Code/update_network.py
@@ -436,7 +436,6 @@ def need_update_parameter_file(param_file, dirs):
return result
-
def validate_binding_file(fname):
f = open(fname)
lines = f.readlines()
@@ -448,40 +447,6 @@ def validate_binding_file(fname):
return True
-def lines_with_10_fields(s):
- result = []
- for line in s.split('\n'):
- line = line.strip()
- if len(line.split('\t')) == 10:
- result.append(line)
- return result
-
-
-def concatenate_edge_files(fname_lst, fname_out):
- fout = open(fname_out, 'w')
- for fname in fname_lst:
- f = open(fname)
- s = f.read()
- f.close()
- # Make sure each edge has 10 fields before writing.
- lines = lines_with_10_fields(s)
- if lines != []:
- write_log_file('[update_network.py] In function concatenate_edge_files. File %s has %d rows with 10 columns.' % (fname, len(lines)), UPDATE_NETWORK_LOG_FILE)
- fout.write('\n'.join(lines) + '\n')
- else:
- write_log_file('[update_network.py] In function concatenate_edge_files. Check file %s. It has no rows with 10 fields.' % (fname), UPDATE_NETWORK_LOG_FILE)
- fout.close()
-
-
-def delete_edge_files(fname_lst):
- for fname in fname_lst:
- # Before we delete, we should make sure it is not being written. Make sure it is old enough. Otherwise, don't delete.
- if age_of_file_in_seconds(fname) > 12*60*60: # 10 minutes
- os.remove(fname)
- else:
- write_log_file('[update_network.py] In function delete_edge_files. Check file %s. It is probably still being written. So I don\'t delete it.' % (fname), UPDATE_NETWORK_LOG_FILE)
-
-
def create_edges0():
if os.path.exists(PARAMETER_FOR_NET):
write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET), UPDATE_NETWORK_LOG_FILE)
@@ -765,49 +730,11 @@ if 'TPM.txt' in updated_file_list: # we could _touch_ TPM.txt to make it recent.
#correlation_mixtools(3)
-########## Merge edges #######################
-# update edges.txt, a merged file from two sources, HISTORY_DIR and HISTORY_DIR2. Some new edge files are being generated ...
-time.sleep(5)
-edge_file_lst = [] # collect edge files.
-most_recent_edge_modification_time = 0
-write_log_file('[update_network.py] Look at edge files in %s.' % (HISTORY_DIR), UPDATE_NETWORK_LOG_FILE)
-for fname in glob.glob(os.path.join(HISTORY_DIR, 'edges.txt.*')): # many small edges.txt.* are to be merged
- edge_file_lst.append(fname)
- if os.path.getmtime(fname) > most_recent_edge_modification_time:
- most_recent_edge_modification_time = os.path.getmtime(fname)
-
-write_log_file('[update_network.py] Look at edge files in %s.' % (HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
-for fname in glob.glob(os.path.join(HISTORY_DIR2, 'edges.txt.*')): # edges.txt.* are to be merged
- edge_file_lst.append(fname)
- if os.path.getmtime(fname) > most_recent_edge_modification_time:
- most_recent_edge_modification_time = os.path.getmtime(fname)
-
-
-if edge_file_lst == []:
- write_log_file('[update_network.py] No edge files to merge in %s and %s.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
-elif os.path.getmtime(MERGED_EDGE_FILE) < most_recent_edge_modification_time: # update edges.txt only if there are newer edges to add.
- # concatenate edge files into one
- write_log_file('[update_network.py] Concatenate edge files in %s and %s into one file.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
- curr_time = datetime.now().strftime('%Y%m%d_%H%M')
- concatenate_edge_files(edge_file_lst, os.path.join(EDGE_POOL_DIR, 'edges.txt.many.one.targets.' + curr_time))
- delete_edge_files(edge_file_lst)
-
-if os.path.getmtime(MERGED_EDGE_FILE) < os.path.getmtime(EDGE_POOL_DIR): # edge pool directory has been updated, create new edges.txt
- write_log_file('[update_network.py] Make a new edges.txt from edge files in %s.' % (EDGE_POOL_DIR), UPDATE_NETWORK_LOG_FILE)
- write_log_file('[update_network.py] Number of lines in the old edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
- cmd = 'python3 merge_edges.py'
- os.system(cmd)
- write_log_file('[update_network.py] Number of lines in the new edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
- manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt sudo cp /home/lanhui/brain/Data/temp/html_edges/edges.sqlite /var/www/brain/brain/static/edges'
- write_log_file('[update_network.py] %s' % (manual_copy_commands), UPDATE_NETWORK_LOG_FILE)
-
-
# exclude edges as suggested by Phil Wigge.
# write_log_file('Exclude edges (now ineffective)', UPDATE_NETWORK_LOG_FILE)
# cmd = 'python3 exclude_edges.py %s' % (EDGE_FILE)
#os.system(cmd)
-
# # check if parameter_for_net.txt, or TPM.txt is updated, if yes, create edges.
# updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
# if ('parameter_for_net.txt' in updated_file_list or 'TPM.txt' in updated_file_list) and not hold_on(PARAMETER_FOR_NET):
@@ -826,4 +753,3 @@ os.system(cmd)
record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)
write_log_file('[update_network.py] Update done at %s.\n\n' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S')), UPDATE_NETWORK_LOG_FILE)
-