\n'
+ f.write(s)
+ f.write('</body></html>')
+ f.close()
+
+
+def make_link_string_for_cond(s, type):
+ ''' s is a string of RNA-seq IDs or ChIP IDs. '''
+ lst = s.split()
+ result = ''
+ for x in lst:
+ if type == 'rnaseq':
+ path = '%s#%s' % (RNA_SEQ_INFO_HTML_PAGE, x)
+ else:
+ path = '%s#%s' % (CHIP_SEQ_INFO_HTML_PAGE, x)
+ result += '<a href="%s">%s</a> ' % (path, x)
+ return result
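+# Example (hypothetical ID; assumes RNA_SEQ_INFO_HTML_PAGE = 'rnaseq_info.html'):
+# make_link_string_for_cond('R0SRR1019436', 'rnaseq')
+# -> '<a href="rnaseq_info.html#R0SRR1019436">R0SRR1019436</a> '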
+
+
+def get_chip_signal(s, d):
+ ''' extract signal information, and return the words ordered by frequency '''
+ lst = s.split()
+ result = ''
+ for x in lst:
+ desc = d[x]['DESCRIPTION']
+ lst2 = desc.split('\t')
+ for y in lst2:
+ if y.startswith('SIGNAL='):
+ result += ';' + y[7:] # 7 == len('SIGNAL='); keep the text after the '='
+ break
+ return word_freq(result)
+
+
+def get_chip_phenotype(s, d):
+ ''' extract phenotype information, and return the words ordered by frequency '''
+ lst = s.split()
+ result = ''
+ for x in lst:
+ desc = d[x]['DESCRIPTION']
+ lst2 = desc.split('\t')
+ for y in lst2:
+ if y.startswith('PHENOTYPE='):
+ result += ';' + y[10:] # 10 == len('PHENOTYPE='); keep the text after the '='
+ break
+ return word_freq(result)
+
+
+def word_freq(s): # for ChIP-seq data
+ ''' s is string. return a string of words ordered by frequency '''
+ if s == '':
+ return ''
+
+ lst = s.split(';')
+ d = {}
+ for x in lst:
+ lst2 = x.split()
+ for y in lst2:
+ #k = y.lower()
+ k = y
+ k = k.strip(',')
+ k = k.strip('.')
+ k = k.strip(')')
+ k = k.strip('(')
+ if not k.lower() in ['at', 'in', 'to', 'with', ',', '.', ':', '-']: # exclude these words
+ if not k in d:
+ d[k] = 1
+ else:
+ d[k] += 1
+
+ sorted_tuples = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
+ first_items = [x[0] for x in sorted_tuples]
+ return ' '.join(first_items)
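+# Quick sanity check (hypothetical input):
+# word_freq('heat stress;heat shock') -> 'heat stress shock'
+# ('heat' occurs twice so it comes first; words with equal counts have no guaranteed order)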
+
+
+def word_freq2(lst): # for RNA-seq data
+ ''' lst is a list of description strings. Return a string of words ordered by frequency. '''
+
+ if lst == []:
+ return ''
+
+ d = {}
+ for x in lst: # each description
+ lst2 = x.split()
+ for y in lst2: # each word
+ k = y
+ k = k.strip(',') # remove superfluous characters, if any
+ k = k.strip('.')
+ k = k.strip(')')
+ k = k.strip('(')
+ k = k.strip(';')
+ if not k.startswith('SRR') and not k.startswith('ERR') and not k.startswith('DRR') and not k.isdigit() and not ':' in k and len(k) > 1 and not k.lower() in ['just', 'library', 'libraries', 'dna', 'nextseq', 'nextseq500', 'sequencing', 'end', 'al;', 'which', 'analyse', 'analyze', 'analyzer', 'whole-genome', 'thus', 'plant', 'plants', 'future', 'such', 'not', 'alone', 'most', 'within', 'into', 'but', 'between', 'we', 'is', 'or', 'also', 'was', 'can', 'be', 'use', 'kit', 'used', 'et', 'al', 'by', 'this', 'the', 'their', 'at', 'in', 'to', 'on', 'with', ',', '.', ':', '-', 'rna-seq', 'rnaseq', 'of', 'hiseq', 'hiseq2000', 'illumina', 'arabidopsis', 'thaliana', 'from', '\n[title]', '\n[description]', 'using', 'were', 'are', 'and', 'under', 'a', 'an', 'one', 'two', 'three', 'as', 'for', 'after', 'none', 'mapping', 'na', 'whole', 'chip-seq', 'paired']: # exclude these strings
+ if not k in d:
+ d[k] = 1
+ else:
+ d[k] += 1
+
+ sorted_tuples = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
+ first_items = [x[0] + ' (' + str(x[1]) + ')' for x in sorted_tuples]
+ return ' '.join(first_items)
+
+
+def word_freq3(lst): # for RNA-seq data, bag-of-words model
+ ''' similar to word_freq2, but may be faster '''
+ if lst == []:
+ return ''
+
+ bow = [collections.Counter(re.findall(r'\w+', s)) for s in lst] # bag of words
+ d = sum(bow, collections.Counter()) # frequency of each word
+ sorted_tuples = d.most_common(len(d))
+ exclude_lst = ['basis', 'requires', 'population', 'resolution', 'via', 'overall', 'elements', 'grown', 'expression', 'appears', 'total', 'have', 'here', 'of', 'just', 'type', 'transcriptomes', 'transcriptome', 'transcriptomic', 'transcription', 'transcriptional', 'report', 'during', 'diversity', 'investigated', 'library', 'per', 'libraries', '2500', '2000', '1210', '1001', '1107', 'dna', 'nextseq', 'nextseq500', 'seq', 'sequencing', 'sequencing;', 'end', 'al;', 'whereas', 'which', 'analyse', 'analyze', 'analyzer', 'quality', 'analysis', 'analyses', 'whole-genome', 'thus', 'plant', 'plants', 'future', 'such', 'not', 'alone', 'most', 'molecular', 'within', 'into', 'but', 'however', 'between', 'we', 'is', 'origin', 'or', 'also', 'was', 'can', 'be', 'been', 'use', 'kit', 'used', 'et', 'al', 'by', 'this', 'that', 'these', 'the', 'their', 'at', 'in', 'to', 'on', 'with', 'mrna', 'rna', 'rnas', 'rna-seq', 'rnaseq', 'of', 'hiseq', 'hiseq2000', 'illumina', 'arabidopsis', 'thaliana', 'from', 'roles', 'title', 'description', 'using', 'were', 'are', 'and', 'unknown', 'under', 'a', 'an', 'one', 'two', 'three', 'as', 'for', 'found', 'after', 'none', 'mapping', 'na', 'whole', 'chip-seq', 'play', 'paired', 'br', 'future', 'rowan', 'study', 'studies', 'may', 'sample', 'truseq', 'until', 'gene', 'genes', 'genetic', 'genome', 'genomes', 'units', 'its', 'yelina', 'data', 'set', 'tube', 'single-base', 'size', 'room', 'along', 'before', 'several', 'less', 'protocol', 'profiling', 'profiles', 'conditions', 'collection', 'complete', 'reveal', 'given', 'ii', 'isolated', 'described', 'describe', 'na', 'worldwide', 'accessions', 'identify', 'identification'] # exclude these words
+ first_items = [x[0] + ' (' + str(x[1]) + ')' for x in sorted_tuples if x[1] > 2 and len(x[0]) > 1 and not x[0].startswith('SRR') and not x[0].startswith('ERR') and not x[0].startswith('DRR') and not x[0].isdigit() and not ':' in x[0] and not x[0].lower() in exclude_lst]
+ return ' '.join(first_items)
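+# Quick sanity check (hypothetical input): only words occurring more than twice survive the
+# x[1] > 2 cutoff, so word_freq3(['heat stress response', 'heat stress', 'heat']) -> 'heat (3)'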
+
+
+def get_rna_signal(s, d):
+ ''' extract RNA-seq signal information, and return the words ordered by frequency '''
+ lst = s.split()
+ result = []
+ MAX_WORDS = 60
+ if lst[0] == '.': # all RNA samples
+ return 'all available signals'
+ for x in lst: # x is an RNA sample ID, words by frequency
+ if x in d:
+ desc = d[x]['description']
+ desc_lst = re.split(' ', desc)
+ short_lst = []
+ for w in desc_lst:
+     short_lst.extend(w.split())
+     if len(short_lst) > MAX_WORDS: # keep only the first MAX_WORDS words; longer descriptions need more computation time but add little information
+         short_lst = short_lst[:MAX_WORDS]
+         break
+ # index = desc.find(' ')
+ # if index > 0:
+ # desc = desc[:index]
+ result.append((' '.join(short_lst)).strip())
+ return word_freq3(result)
+
+
+def get_rna_signal2(s, d): # not very successful, and slow, so NOT used
+ ''' extract RNA-seq signal information, and return the words ordered by frequency '''
+
+ lst = s.split()
+
+ if lst[0] == '.': # all RNA samples
+ return 'all available signals'
+
+ text = ''
+ for x in lst: # x is an RNA sample ID, words by frequency
+ if x in d:
+ desc = d[x]['description']
+ text += desc.strip().rstrip('.') + '. '
+
+ rake = Rake(RAKE_STOPLIST_FILE)
+ keywords = rake.run(text)
+ return ' '.join( [ t[0] + ' (' + str(int(t[1])) + ')' for t in keywords ] )
+
+
+def replace_old_html_page(fname, edge_date):
+ ''' If the file fname needs updating, return True. '''
+ if not os.path.exists(fname): # if the file does not exist, it needs updating
+ return True
+
+ # Check all files AT2G43790_AT1G03080_0.html, AT2G43790_AT1G03080_1.html, AT2G43790_AT1G03080_2.html, etc. If any of them is too old, create a new one.
+ index = fname.rfind('_')
+ if index < 0:
+ print('html_network.py: %s has no underscore.' % (fname))
+ sys.exit()
+ fname_part = fname[:index]
+ for fn in glob.glob(fname_part + '_*.html'): # all pages sharing the TF_target prefix
+ file_date = datetime.fromtimestamp(os.path.getmtime(fn)).strftime('%Y%m%d')
+ if int(edge_date) - int(file_date) > 1: # edge_date is at least 1 day newer than edge file date
+ return True
+
+ return False
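+# Example: for fname '.../AT2G43790_AT1G03080_0.html', fname_part is '.../AT2G43790_AT1G03080',
+# so every sibling page AT2G43790_AT1G03080_*.html is checked against edge_date.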
+
+
+def format_date(s):
+ ''' s in the form of 20170419. Return 2017-04-19 '''
+ s = s.strip()
+ if len(s) != 8:
+ return s
+ return s[0:4] + '-' + s[4:6] + '-' + s[6:]
+
+
+def make_html_page_for_condition(fname, tf_name, target_name, condRstr, condCstr, edge_date, subset): # important page ***
+
+ ### if the page already exists, and its information is up-to-date, then don't create it again (to save time)
+ if REGENERATE_ALL_EDGE_FILES == 'NO' and not replace_old_html_page(fname, edge_date):
+ return
+
+ d3_library = ''
+ f = open(fname, 'w')
+ f.write('<html><head>%s</head><body>' % (d3_library))
+
+ ### RNA-seq
+ f.write('<h2>RNA-seq experiments</h2>')
+ part = os.path.splitext( os.path.basename(fname) )[0] # get file name without extension
+ id_lst = part.split('_')
+ gene1_file = os.path.join('json', id_lst[0] + '.json') # TF
+ gene2_file = os.path.join('json', id_lst[1] + '.json') # target
+
+ f.write('<p>TF is %s %s. Target is %s %s. Edge made on %s. Method: %s.</p>' % (id_lst[0], '' if tf_name == id_lst[0] else tf_name, id_lst[1], '' if target_name == id_lst[1] else target_name, format_date(edge_date), subset))
+ cond_lst_str = str(condRstr.split()) # insert to javascript function call code
+ rnaseq_info_file = os.path.basename(RNA_SEQ_INFO_DATABASE_JSON)
+ s = ''
+
+ make_html_page(n, G, filepath, agi2name_dict)
+
+findex.write(s)
+findex.write('</body></html>')
+findex.close()
+
+# copy auxiliary folders and files
+if os.path.isdir(JSON_DIR):
+ cmd = 'cp -r %s %s' % (JSON_DIR, DIR_NAME)
+ os.system(cmd)
+else:
+ print('[WARNING] html_network.py: JSON directory omitted (needed for displaying gene expression).')
+
+if os.path.isdir(JSON_DIR2):
+ cmd = 'cp -r %s %s' % (JSON_DIR2, DIR_NAME)
+ os.system(cmd)
+else:
+ print('[WARNING] html_network.py: JSON directory 2 omitted (needed for displaying binding).')
+
+if os.path.exists(RNA_SEQ_INFO_DATABASE_JSON):
+ cmd = 'cp %s %s' % (RNA_SEQ_INFO_DATABASE_JSON, DIR_NAME)
+ os.system(cmd)
+else:
+ print('[WARNING] html_network.py: %s does not exist. Scatterplots may not work properly.' % (RNA_SEQ_INFO_DATABASE_JSON))
+
+for fname in C3_FILES:
+ fpath = os.path.join(C3_DIR, fname)
+ if os.path.exists(fpath):
+ cmd = 'cp %s %s' % (fpath, DIR_NAME)
+ os.system(cmd)
+ else:
+ print('[WARNING] html_network.py: Omitted %s. Scatter plots may not work without this file.' % (fpath))
+
+for fname in W2UI_FILES:
+ fpath = os.path.join(W2UI_DIR, fname)
+ if os.path.exists(fpath):
+ cmd = 'cp %s %s' % (fpath, DIR_NAME)
+ os.system(cmd)
+ else:
+ print('[WARNING] html_network.py: Omitted %s. Tables may not work without this file.' % (fpath))
+
+#print('html_network.py done!')
diff --git a/Code/json_test.py b/Code/json_test.py
new file mode 100644
index 0000000..c4ca7bd
--- /dev/null
+++ b/Code/json_test.py
@@ -0,0 +1,8 @@
+import json
+old_json = '../Data/information/rnaseq_info_database.json' # generated by parse_xml.py
+with open(old_json) as json_data:
+ json_dict = json.load(json_data)
+ for k in json_dict:
+ print(k)
+ # if k in tissue_dict:
+ # json_dict[k]['tissue'] = tissue_dict[k]
diff --git a/Code/knn_classify.R b/Code/knn_classify.R
new file mode 100644
index 0000000..46df992
--- /dev/null
+++ b/Code/knn_classify.R
@@ -0,0 +1,79 @@
+## Usage: change file names in the section '# Parameters'
+## Purpose: Classify tissues using KNN. Use tSNE to reduce the dimensionality of each tissue to 2 first.
+##
+## 7 June 2017, slcu, hui
+## Last modified 20 June 2017, slcu, hui
+
+# Parameters
+TRAIN_DATA_FILE <- '../Data/history/expr/TPM.txt'
+TRAIN_CLASS_FILE <- '../Data/information/experiment.and.tissue.2.txt'
+
+K <- 1
+PERPLEXITY <- 50 # for tSNE
+
+# Load data
+#cat('Load TPM.txt ...\n')
+X <- read.table(TRAIN_DATA_FILE, header=TRUE, check.names=FALSE)
+all.id <- X$gene_id
+X$gene_id <- NULL # remove column gene_id
+row.names(X) <- all.id # add row names
+
+Z <- read.table(TRAIN_CLASS_FILE, header=TRUE, check.names=FALSE, sep='\t')
+labels <- as.vector(Z$suggested.tissue)
+
+unknown.index <- which(labels == "unknown") # remove unknowns
+
+X.2 <- X[, unknown.index] # test data (label unknown)
+labels <- labels[-unknown.index]
+X <- X[, -unknown.index]
+
+
+labels <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories, e.g. "flower.petal" becomes "flower"
+sul <- sort(unique(labels)) # sorted unique labels
+colors <- rainbow(length(sul))
+names(colors) <- sul
+
+# Filter rows
+#cat('Filter ...\n')
+rowsum.tau <- dim(X)[2] # the gene's TPM value is at least 1 on average
+sd.val <- apply(X, 1, sd)
+sd.tau <- summary(sd.val)[3] # median sd; genes whose expression varies least are meant to be filtered out (note the filter below uses a fixed threshold of 10)
+index <- rowSums(X) > rowsum.tau & sd.val > 10
+
+n.train <- dim(X)[2]
+X.3 <- log(cbind(X[index,], X.2[index,]) + 1)
+n.test <- dim(X.2)[2]
+n <- dim(X.3)[2]
+
+# Learn
+library(Rtsne)
+library(class)
+set.seed(100)
+#cat('Dimensionality reduction using tSNE ...\n')
+tsne <- Rtsne(t(X.3), check_duplicates=F, dims=2, perplexity=PERPLEXITY, theta=0.5, verbose=FALSE, max_iter=600) # dimensionality reduction
+train.data <- cbind(tsne$Y[1:n.train,1], tsne$Y[1:n.train,2])
+
+# Train and test on the same data
+cl <- factor(labels) # class labels
+result <- knn(train.data, train.data, cl, k=K, prob=TRUE)
+#cat(sprintf('Training accuracy: %4.3f.\n', sum(as.vector(cl) == as.vector(result))/length(cl)))
+
+# Cross-validation on training data
+result <- knn.cv(train.data, cl, k=K, prob = TRUE)
+#cat(sprintf('Test accuracy (leave-one-out cross-validation): %4.3f.\n', sum(as.vector(cl) == as.vector(result))/length(cl)))
+
+# If test data is available, make prediction.
+test.data <- cbind(tsne$Y[(n.train+1):n,1], tsne$Y[(n.train+1):n,2])
+result <- knn(train.data, test.data, cl, k=K, prob=TRUE)
+df <- data.frame(sample.name=colnames(X.2), predicted.tissue=result)
+write.table(df, '../Data/temp/predicted.label.txt', quote=F, sep='\t', row.names=F)
+
+# Plot
+#pdf('../Data/temp/fig.pdf')
+#par(mar=c(5.1, 4.1, 4.1, 8.8), xpd=TRUE)
+#plot(tsne$Y, xlab='tsne X', ylab='tsne Y', cex=.4, col='grey')
+#points(tsne$Y[1:n.train,1], tsne$Y[1:n.train,2], col=colors[labels], cex=.4)
+##text(tsne$Y[,1], tsne$Y[,2], labels, cex=0.1)
+#legend("topright", inset=c(-0.4,0), sul, col=colors, pch=16)
+#dev.off()
+
diff --git a/Code/local_network.py b/Code/local_network.py
new file mode 100644
index 0000000..216832f
--- /dev/null
+++ b/Code/local_network.py
@@ -0,0 +1,1114 @@
+# Usage: python local_network.py
+#
+# Put this file under directory Code
+# Prepare a parameter_for_buildCmatrix.txt and put it under directory Data/parameter
+# Execute the above command regularly.
+#
+# Edit Webapp/start_webapp.py, uncomment app.run(debug=True) and comment out the previous app.run(). To display the network, cd Webapp && python start_webapp.py.
+# Enter http://127.0.0.1:5000 in a web browser.
+#
+# Note:
+# Tested on a 32-core server at slcu running Ubuntu. The program may slow down a personal computer.
+# The program will check that the following required packages are installed.
+# Required python packages: numpy, networkx, flask.
+# Required R packages: rjson, mixtools, Rtsne.
+# To install a python package, use the command pip install numpy.
+# To install an R package, issue this command in R: install.packages('Rtsne', dependencies=TRUE, repos='http://cran.rstudio.com/')
+# In a Mac, use export PYTHONPATH="/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages" to make installed python packages usable.
+#
+# Purpose: make a gene regulatory network locally. Periodically (e.g., per day) run this script to see if the network needs update. If yes, update it.
+# You need to prepare and edit parameter_for_buildCmatrix.txt manually.
+#
+# Created 1 July 2017, hui.lan@slcu.cam.ac.uk, slcu
+# Last modified 4 July 2017, hui lan, slcu
+
+
+import os, sys
+import numpy as np
+import glob
+import time
+import subprocess
+from datetime import datetime
+from param4net import make_global_param_dict, get_key_value
+
+FORCE_MAKE_EDGES = 'NO'
+
+CODE_DIR = os.getcwd() # Get current working directory. It is important that you execute this script under this directory.
+
+# DON'T CHANGE THE FOLLOWING PATHS AND NAMES
+HISTORY_DIR = '../Data/history/edges/many_targets' # each edge file contains edges for many targets
+HISTORY_DIR2 = '../Data/history/edges/one_target' # edges.txt.* files are here, all edge files have the name edges.txt.*, the leading string 'edges.txt' must be present.
+FILE_TIMESTAMP = '../Data/log/file_timestamp.txt'
+SAMPLE_SIZE_FILE = '../Data/log/total.samples.txt' # each line contains a date and the number of samples on and after that date
+TEMP_DIR = '../Data/temp'
+
+PARAMETER_FOR_BUILDCMATRIX = '../Data/parameter/parameter_for_buildCmatrix.txt'
+PARAMETER_FOR_BUILDRMATRIX = '../Data/parameter/parameter_for_buildRmatrix.txt'
+PARAMETER_FOR_NET = '../Data/parameter/parameter_for_net.txt'
+EDGE_FILE = '../Data/history/edges/edges.txt'
+BINDING_FILE = '../Data/history/bind/binding.txt'
+TPM_FILE = '../Data/history/expr/TPM.txt'
+LOG_FILE = '../Data/log/update.network.log.txt'
+NEW_OR_UPDATED_CHIP_FILE = '../Data/log/new.or.updated.chip.file.txt'
+RNA_SEQ_INFO_DATABASE = '../Data/information/rnaseq_info_database.txt'
+RNA_SEQ_INFO_DATABASE_JSON = '../Data/information/rnaseq_info_database.json'
+
+FILE_LIST_TO_CHECK = [PARAMETER_FOR_BUILDCMATRIX,
+ PARAMETER_FOR_BUILDRMATRIX,
+ PARAMETER_FOR_NET,
+ EDGE_FILE,
+ BINDING_FILE,
+ TPM_FILE]
+
+## help functions
+
+def ok_webapp_dir(para_for_net):
+ ''' we are now under Code '''
+ glb_param_dict = make_global_param_dict(para_for_net)
+ # genes.json is not here, create one
+ if not os.path.exists('../Webapp/static/json/genes.json'):
+ print('genes.json not here, make one')
+ cmd = 'python text2json.py %s > ../Webapp/static/json/genes.json' % (glb_param_dict['GENE_ID_AND_GENE_NAME'])
+ os.system(cmd)
+
+def make_paths(s):
+ if not os.path.isdir(s):
+ os.makedirs(s)
+
+def make_important_dirs():
+ make_paths('../Data/history/edges/many_targets')
+ make_paths('../Data/history/edges/one_target')
+ make_paths('../Data/log')
+ make_paths('../Data/information')
+ make_paths('../Data/temp')
+ make_paths('../Data/parameter')
+ make_paths('../Data/R/Mapped')
+ make_paths('../Data/R/Mapped/public')
+ make_paths('../Data/R/Mapped/inhouse')
+ make_paths('../Data/R/Mapped/other')
+ make_paths('../Data/R/Raw')
+ make_paths('../Data/C/Mapped')
+ make_paths('../Data/C/Mapped/Columns')
+ make_paths('../Data/C/Raw')
+ make_paths('../Data/history/edges')
+ make_paths('../Data/history/bind')
+ make_paths('../Data/history/expr')
+ make_paths('../Webapp/static/json')
+ make_paths('../Webapp/static/edges')
+ make_paths('../Webapp/templates')
+
+def num_line(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ return len(lines)
+
+def record_new_or_updated_chip_ids(lst, fname):
+ f = open(fname, 'a')
+ curr_time = datetime.now().strftime('%Y%m%d')
+ for x in lst:
+ f.write('%s\t%s\n' % (x, curr_time))
+ f.close()
+
+def write_log_file(s, fname):
+ f = open(fname, 'a')
+ print(s)
+ curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
+ s = '[' + curr_time + ']: ' + s
+ if not '\n' in s:
+ s += '\n'
+ f.write(s)
+ f.close()
+
+def write_sample_size_file(sample_size_file, curr_date, tpm_sample_size):
+ if not os.path.exists(sample_size_file):
+ f = open(sample_size_file, 'w')
+ else:
+ f = open(sample_size_file, 'a')
+ f.write('%s\t%s\n' % (curr_date, tpm_sample_size))
+ f.close()
+
+def age_of_file(fname):
+ st = os.stat(fname)
+ days = (time.time() - st.st_mtime)/(3600*24.0)
+ return days
+
+def hold_on(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines[:100]: # check the first 100 lines for HOLDON
+ line = line.strip()
+ if line.startswith('%%HOLDON=YES'):
+ return True
+ return False
+
+def all_files_present(lst):
+ missing_file_lst = []
+ for path in lst: # lst is a list of file names to check
+ if not os.path.exists(path):
+ if 'edges.txt' in path:
+ write_log_file('WARNING: must have %s to update network.' % (path), LOG_FILE)
+ missing_file_lst.append(path)
+ return missing_file_lst
+
+def record_file_time(lst):
+ f = open(FILE_TIMESTAMP, 'w')
+ s = ''
+ for x in lst:
+ if os.path.exists(x):
+ s += '%s\t%d\n' % (os.path.basename(x), int(os.stat(x).st_mtime))
+ else:
+ s += '%s\t%d\n' % (os.path.basename(x), 0)
+ f.write(s)
+ f.close()
+
+
+def read_file_timestamp(ftimestamp):
+ d = {}
+ f = open(ftimestamp)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ fname = lst[0]
+ t = lst[1]
+ d[fname] = int(t)
+
+ f.close()
+ return d
+
+def file_updated(fname, d):
+ ft = int(os.stat(fname).st_mtime)
+ k = os.path.basename(fname)
+ return ft > d[k]
+
+
+def get_updated_files(lst, d):
+ result = []
+ for x in lst:
+ if file_updated(x, d):
+ result.append(os.path.basename(x))
+ return result
+
+
+def not_bad_line(s):
+ if s.strip() == '':
+ return False
+ if 'WARNING' in s:
+ return False
+ if 'number' in s:
+ return False
+ if 'Need' in s:
+ return False
+ if 'Error' in s:
+ return False
+ if 'Too' in s:
+ return False
+ if not s.startswith('AT'): # comment out this test if the organism is not Arabidopsis CHANGE
+ return False
+ return True
+
+
+def get_rcond_string(s):
+ s = s.strip()
+ if s.startswith('R0000DRR') or s.startswith('R0000ERR') or s.startswith('R0000SRR'):
+ s = s.replace('R0000DRR', 'R0DRR').replace('R0000SRR', 'R0SRR').replace('R0000ERR', 'R0ERR') # remove extra 0's introduced in earlier edges.txt
+ return s
+
+
+def compute_metric(d):
+ '''
+ d has the form {'freq':0, 'total_RNAseq_ID':0, 'sum_abs_R':0,
+ 'most_recent_date':'20161201'}.
+
+ The metric combines several quantities: the average absolute
+ correlation coefficient, the average number of RNA-seq samples used,
+ the frequency of this edge, and its recentness. It is used to rank
+ edges, so larger correlation coefficients based on more RNA-seq
+ samples, appearing frequently and recently, are ranked on top.
+
+ Formula: avg.abs(r) * log10(avg.RN) * log2(F+1) * exp(-(CurrentDate-MostRecentDate)/S)
+
+ '''
+ avg_abs_r = 1.0 * d['sum_abs_R'] / d['freq']
+ log_avg_RN = np.log10(1.0 * d['total_RNAseq_ID'] / d['freq'])
+ log_freq = np.log2(d['freq'] + 1)
+ S = 200.0 # strength of memory, larger value means better memory
+ recentness = np.exp(1.0*(int(d['most_recent_date']) - int(datetime.now().strftime('%Y%m%d')))/S) # a monotonic decreasing function exp(-t/S), exponential nature of forgetting
+ return avg_abs_r * log_avg_RN * log_freq * recentness
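+# Worked example (hypothetical edge statistics): for
+# d = {'freq':2, 'total_RNAseq_ID':400, 'sum_abs_R':1.6, 'most_recent_date':'20170601'}
+# avg_abs_r = 0.8, log_avg_RN = log10(200) ~ 2.30, log_freq = log2(3) ~ 1.58, and if
+# int(today) - int(most_recent_date) == 30, recentness = exp(-30/200) ~ 0.86,
+# giving a metric of about 0.8 * 2.30 * 1.58 * 0.86 ~ 2.5.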
+
+
+def make_sample_size_dict(fname):
+ if not os.path.exists(fname):
+     return {}, [] # keep the return shape consistent: callers unpack (dict, sorted keys)
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ for line in lines:
+ line = line.strip()
+ if line != '' and not line.startswith('#'):
+ lst = line.split('\t')
+ d[lst[0]] = int(lst[1])
+ return d, sorted(d.keys())
+
+
+def get_sample_size(d, sorted_keys, day):
+ if len(d) == 0:
+ return 1200 # a default sample size, CHANGE as appropriate
+
+ for x in sorted_keys:
+ if x >= day:
+ return d[x]
+
+ k = sorted_keys[-1] # last key, latest date
+ return d[k]
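+# Example (hypothetical log entries): with d = {'20170101': 1000, '20170601': 1300} and
+# sorted_keys = ['20170101', '20170601'], get_sample_size(d, sorted_keys, '20161215') returns
+# 1000 (the first date >= the given day), and get_sample_size(d, sorted_keys, '20171001')
+# returns 1300 (falls back to the latest date).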
+
+
+def merge_edges(file_lst, edge_fname, sample_size_file):
+
+ '''
+ Write to edge_fname.
+
+ file_lst -- a list of edges.txt.*, computed using different sets
+ of data. Each edges.txt file has the same format.
+
+ This function merges all edges together. Correlation based on
+ larger number of RNA-seq experiments is favoured. Each edge has
+ the following format: target_gene tf_gene value type
+ RNA_experiments ChIP_experiments loglik date, and metric. metric is newly computed.
+
+ '''
+
+ sample_size_dict, sample_size_keys = make_sample_size_dict(sample_size_file)
+
+ ll_dict = { # maps log-likelihood code to tissue or method
+ '-999.0':'seedling',
+ '-998.0':'meristem',
+ '-997.0':'root',
+ '-996.0':'leaf',
+ '-995.0':'flower',
+ '-994.0':'shoot',
+ '-993.0':'seed',
+ '-992.0':'stem',
+ '-990.0':'aerial',
+ '-991.0':'hclust',
+ '-1000.0':'wedge (post.translation.3)',
+ '-1001.0':'wedge (post.translation.4)',
+ '.':'all'
+ }
+
+ d = {} # hold edges, best will be kept for mix pos, mix neg and all.
+ d_mix_pos = {} # for computing rank metric 'freq':0, 'total_RNAseq_ID':0, 'sum_abs_R':0, 'most_recent_date':'20161201'
+ d_mix_neg = {}
+ d_all = {}
+ d_tissue = {} # hold subset information (tissue or method)
+ max_rcond_size = 10
+ for fname in file_lst: # check each edge file
+ if not os.path.exists(fname):
+ write_log_file('%s missing.' % (fname), LOG_FILE)
+ continue
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t') # fields are separated by TABs
+ if not_bad_line(line) and len(lst) >= 7: # an edge line must have at least 7 fields
+
+ target_id = lst[0].split()[0] # AGI id only, no gene name
+ tf_id = lst[1].split()[0] # same as above
+ k = target_id + '_' + tf_id # key
+ date = '20161201' # edge creation date. if there's no date field, use 20161201
+ rcond_s = get_rcond_string(lst[4]) # remove extra 0's from old (obsolete) RNA-seq experiment ids
+ loglik = lst[6]
+ t = lst[3] # current line edge type, mix or all
+ if '=' in loglik:
+ loglik = loglik.split('=')[1]
+ if t == 'mix' and loglik == '.': # post translation case, i.e., post.translation.3, see create_edges4.py
+ loglik = '-1000.0'
+
+ tissue_or_method_name = ll_dict[loglik] if loglik in ll_dict else 'MixReg'
+
+ if len(lst) == 8: # has a date field
+ date = lst[7]
+
+ # add for each edge a tissue or method annotation: the edge is based on a certain tissue, or is derived by a certain method. ll_dict contains the mapping.
+ if not k in d_tissue:
+ d_tissue[k] = [tissue_or_method_name]
+ else:
+ d_tissue[k].append(tissue_or_method_name)
+
+ if not k in d: # first edge to be added between two nodes: target_id and tf_id
+ d[k] = [{'target':lst[0], 'tf':lst[1], 'value':lst[2], 'type':lst[3], 'rcond':rcond_s, 'ccond':lst[5], 'loglik':loglik, 'date':date}] # a list of dictionaries, {target, tf, value, type, rcond, ccond, loglik, date}
+ else: # two nodes can have multiple edges. an edge already exists, determine whether or not add this new edge.
+ v = lst[2] # current line value
+ rcond = rcond_s
+ ccond = lst[5]
+ len_r = len(rcond.split()) # number of RNA-seq experiment ids in current line. if it is a dot, then length is 1
+ ignore = False # assume this edge is to be included; set to True below if it should be skipped
+
+ for i in range(len(d[k])): # search each of already added edges. If current one is better, replace the old one
+ xd = d[k][i] # xd is a dictionary, representing an edge {target, tf, type, value, rcond, ccond, loglik, date}
+ fv = float(v) # value (confidence) of the candidate edge
+ fx = float(xd['value'])
+ len_rx = len(xd['rcond'].split())
+ if xd['type'] == t and t == 'all': # edge type is all, use most recent result as the edge is based on all RNA-seq experiments.
+ ignore = True # either ignore it or replace the old with this one
+ if date > xd['date']:
+ d[k][i]['value'] = v
+ d[k][i]['date'] = date
+ break
+
+ if xd['type'] == t and t == 'mix': # current line represents a better edge, i.e., larger r value and based on more RNA-seq experiments
+ if fv*fx > 0 and abs(fv*np.log10(len_r)) > abs(fx*np.log10(len_rx)): # fv*fx > 0 means they have the same sign.
+ d[k][i]['value'] = v
+ d[k][i]['rcond'] = rcond
+ d[k][i]['ccond'] = ccond
+ d[k][i]['loglik'] = loglik
+ d[k][i]['date'] = date
+ ignore = True
+ break
+ elif fv*fx > 0: # curr line has same sign, but based on less RNA-seq experiment, ignore it.
+ ignore = True
+ break
+
+ if v == xd['value'] and len_r == len_rx: # ChIPs are updated, but based on same number of rna-seq experiments
+ if xd['ccond'] != ccond:
+ merged_cond = xd['ccond'] + ' ' + ccond
+ merged_cond_str = ' '.join(sorted(list(set(merged_cond.split()))))
+ d[k][i]['ccond'] = merged_cond_str
+
+ if xd['date'] < date:
+ d[k][i]['date'] = date
+ d[k][i]['loglik'] = loglik
+ ignore = True
+ break
+
+ if ignore == False:
+ d[k].append({'target':lst[0], 'tf':lst[1], 'value':lst[2], 'type':lst[3], 'rcond':rcond_s, 'ccond':lst[5], 'loglik':loglik, 'date':date})
+
+ # fill d_mix_pos, d_mix_neg and d_all
+ curr_rcond_size = len(rcond_s.split())
+ if t == 'mix':
+ if float(lst[2]) >= 0: # lst[2] is value
+ if not k in d_mix_pos:
+ d_mix_pos[k] = {'freq':1, 'total_RNAseq_ID':curr_rcond_size, 'sum_abs_R':abs(float(lst[2])), 'most_recent_date':date}
+ else:
+ d_mix_pos[k]['freq'] += 1
+ d_mix_pos[k]['total_RNAseq_ID'] += curr_rcond_size
+ d_mix_pos[k]['sum_abs_R'] += abs(float(lst[2]))
+ if date > d_mix_pos[k]['most_recent_date']:
+ d_mix_pos[k]['most_recent_date'] = date
+ else:
+ if not k in d_mix_neg:
+ d_mix_neg[k] = {'freq':1, 'total_RNAseq_ID':curr_rcond_size, 'sum_abs_R':abs(float(lst[2])), 'most_recent_date':date}
+ else:
+ d_mix_neg[k]['freq'] += 1
+ d_mix_neg[k]['total_RNAseq_ID'] += curr_rcond_size
+ d_mix_neg[k]['sum_abs_R'] += abs(float(lst[2]))
+ if date > d_mix_neg[k]['most_recent_date']:
+ d_mix_neg[k]['most_recent_date'] = date
+
+
+ if curr_rcond_size > max_rcond_size:
+ max_rcond_size = curr_rcond_size
+
+ if t == 'all':
+ all_rcond_size = get_sample_size(sample_size_dict, sample_size_keys, date)
+ if not k in d_all:
+ d_all[k] = {'freq':1, 'total_RNAseq_ID':all_rcond_size, 'sum_abs_R':abs(float(lst[2])), 'most_recent_date':date}
+ else:
+ d_all[k]['freq'] += 1
+ d_all[k]['total_RNAseq_ID'] += all_rcond_size
+ d_all[k]['sum_abs_R'] += abs(float(lst[2]))
+ if date > d_all[k]['most_recent_date']:
+ d_all[k]['most_recent_date'] = date
+
+ # rewrite all total_RNAseq_ID values; the accumulated RNA-seq ID count is actually an overestimate.
+ if len(sample_size_dict) == 0: # sample size file is not available
+ for k in d_all:
+ d_all[k]['total_RNAseq_ID'] = d_all[k]['freq'] * max_rcond_size
+
+ f = open(edge_fname, 'w')
+ for k in sorted(d.keys()):
+ lst = d[k] # a list of dictionaries
+ tissue_or_method = ', '.join(list(set(d_tissue[k])))
+ for xd in lst:
+ if xd['type'] == 'all':
+ metric = '%4.2f' % ( compute_metric(d_all[k]) )
+ s = '\t'.join([xd['target'], xd['tf'], xd['value'], xd['type'], xd['rcond'], xd['ccond'], xd['loglik'], xd['date'], metric, tissue_or_method]) + '\n'
+ if xd['type'] == 'mix':
+ if float(xd['value']) >= 0:
+ metric = '%4.2f' % ( compute_metric(d_mix_pos[k]) )
+ else:
+ metric = '%4.2f' % ( compute_metric(d_mix_neg[k]) )
+ s = '\t'.join([xd['target'], xd['tf'], xd['value'], xd['type'], xd['rcond'], xd['ccond'], xd['loglik'], xd['date'], metric, tissue_or_method]) + '\n'
+ f.write(s)
+ f.close()
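+# Each line written above has 10 tab-separated fields:
+# target  tf  value  type  rcond  ccond  loglik  date  metric  tissue_or_method
+# e.g. (hypothetical values) AT1G03080  AT2G43790  0.81  mix  R0SRR1019436  C0001100007100  -997.0  20170615  2.51  root
+# (rcond and ccond may themselves contain several space-separated experiment IDs)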
+
+def get_value(s, delimit):
+ lst = s.split(delimit, 1) # only split at the first delimit
+ return lst[1].strip()
+
+def make_data_dict(fname):
+ d = {} # keep a list of chip id's, such as C0001100007100
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#'):
+ continue
+ if line.startswith('@'):
+ s = line[line.rfind('@')+1:]
+ s = s.strip()
+ if s in d:
+ write_log_file('In make_data_dict: ID %s duplicated' % (s), LOG_FILE)
+ sys.exit()
+ d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+ elif line.startswith('PROTEN_NAME:'):
+ d[s]['PROTEN_NAME'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_ID:'):
+ d[s]['PROTEIN_ID'] = get_value(line, ':')
+ elif line.startswith('DATA_NAME:'):
+ d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+
+ return d
+
+
+def get_bad_chip_ids(d):
+ ''' a ChIP id is bad if its NOTE field contains 'obsolete' '''
+ lst = []
+ for k in d:
+ note = d[k]['NOTE'].lower()
+ if 'obsolete' in note:
+ lst.append(k)
+ return lst
+
+def get_update_date_chip_ids(d):
+ ''' Return a list of ChIP ids with update in the NOTE: field. '''
+ ud = {}
+ for k in d:
+ note = d[k]['NOTE'].lower()
+ if 'update' in note:
+ if 'update:' in note: # has a specific date, e.g., 20170101
+ idx = note.find('update:')
+ udate = note[idx+7:idx+15] # get date string yyyymmdd
+ else: # if only update, but no specific date, assume it is 20161201
+ udate = '20161201'
+ ud[k] = udate
+ return ud
+
+
+def get_chip_ids_from_edge_file(fname):
+ lst = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ l = line.split('\t')
+ c = l[5] # ids for ChIP experiments
+ l2 = c.split()
+ lst.extend(l2)
+
+ f.close()
+ return list(set(lst))
+
+def get_nonexistent_chip_ids(d, fname):
+ ''' fname -- edges.txt. Get ids that are in fname, but not in d. '''
+ lst = []
+ ids = get_chip_ids_from_edge_file(fname)
+ for k in ids:
+ if not k in d:
+ lst.append(k)
+ return lst
+
+
+def created_after_update(x, created_date, update_date_chip_ids):
+ ''' the edge is created after the ChIP is updated.'''
+ if not x in update_date_chip_ids: # update_date_chip_ids does not contain id x, which means x has not been updated since the ChIP id was created
+ return True
+ else:
+ return created_date >= update_date_chip_ids[x] # check whether edge is created after x is updated.
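+# Example: an edge created on '20170301' is kept against a ChIP update dated '20170101'
+# (the edge was created after the update), but not against one dated '20170401'.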
+
+
+def rm_chip_ids_from_edge_file(fname, bad_id_lst, update_date_chip_ids):
+ ''' fname -- edges.txt '''
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ f = open(fname, 'w')
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ c = lst[5] # chip experiment ids
+ l = c.split() # list of chip experiment ids
+ created_date = lst[7]
+ l2 = [] # for keeping good chip experiment ids
+ for x in l:
+ if not x in bad_id_lst and created_after_update(x, created_date, update_date_chip_ids): # if an edge is created before this ChIP experiment is updated, then the binding information and thus the edge may be no longer valid. If this ChIP experiment is the sole evidence of binding, then this edge is ignored.
+ l2.append(x)
+ if l2 != []: # still have some chips
+ lst[5] = ' '.join(l2)
+ f.write('\t'.join(lst) + '\n')
+ f.close()
+
+def make_file(fname, s):
+ f = open(fname, 'w')
+ f.write(s)
+ f.close()
+
+
+def get_chip_ids(fname):
+ if not os.path.exists(fname):
+ print('ERROR: %s does not exist.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ head_line = lines[0].strip()
+ lst = head_line.split('\t')
+ return lst[1:]
+
+
+def is_recent(note, ndays): # updated less than ndays ago
+ idx = note.find('update:')
+ if idx == -1: # not found
+ return False
+ udate = note[idx+7:idx+15] # get date string yyyymmdd
+ curr_date = datetime.now().strftime('%Y%m%d')
+ d1 = datetime.strptime(udate, "%Y%m%d")
+ d2 = datetime.strptime(curr_date, "%Y%m%d")
+ return (d2 - d1).days <= ndays
+
+
+def get_note_date(s):
+ ''' s has this format: NOTE: update:20170101 '''
+ if not 'update:' in s: # if people forgot to put an update date in Note:, then it is 20161201
+ return '20161201'
+ index = s.rfind('update:')
+ if len(s[index+7:]) < 8:
+ return '20161201'
+ result = s[index+7:index+15].strip()
+ if result <= '20170101' or len(result) < 8:
+ return '20161201'
+ return result
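+# Example: get_note_date('NOTE: update:20170415') returns '20170415'; a missing or
+# malformed date, or one on or before 20170101, falls back to '20161201'.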
+
+
+def is_recently_updated(note, chip_id, fname):
+ ''' fname keeps track of chip ids and their most recent update date '''
+ if not is_recent(note, 15): # if the update happened more than 15 days ago, just ignore it. no further action will be taken for this chip id.
+ return False
+
+ note_date = get_note_date(note)
+
+ log_date = '20161201'
+ if os.path.exists(fname): # search update date of chip_id; if it has been incorporated, the log date should be no less than the note's update date
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ if line != '':
+ lst = line.split()
+ if len(lst) >= 2 and lst[0] == chip_id:
+ log_date = lst[1]
+ f.close()
+
+ return note_date > log_date
+
+def get_new_chip_ids(old_lst, para_c_dict, new_or_updated_chip_fname):
+ result = []
+ for k in para_c_dict:
+ note = para_c_dict[k]['NOTE'].lower()
+ if 'obsolete' in note:
+ continue
+ if (not 'obsolete' in note and not k in old_lst) or (k in old_lst and 'update:' in note and is_recently_updated(note, k, new_or_updated_chip_fname)): # First case: the ChIP-seq ID is not in the old list and not marked obsolete. Definitely add it. The second case is more subtle: the narrowPeak of an old ChIP-seq ID has recently been updated for whatever purpose. This update should have happened reasonably recently. is_recently_updated is important because we don't want to treat ChIP-seq data updated weeks ago as new. new_or_updated_chip_fname keeps track of newly updated ChIP-seq data and their update dates.
+ result.append(k)
+ return sorted(result)
+
+def edit_file(fname, line_starter, new_str):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ f = open(fname, 'w')
+ for line in lines:
+ if line.startswith(line_starter):
+ s = line_starter
+ s += new_str
+ f.write(s + '\n')
+ else:
+ f.write(line)
+ f.close()
+
+def number_rnaseq_id(tpm_file):
+ f = open(tpm_file)
+ first_line = f.readlines()[0]
+ f.close()
+ first_line = first_line.strip()
+ return len(first_line.split()) - 1
+
+def number_rnaseq_diff(para_file, tpm_file):
+ ''' count the number @ in para_file, and count the number of columns in tpm_file, return their difference '''
+ a = 0
+ f = open(para_file)
+ for line in f:
+ line = line.strip()
+ if line.startswith('@'):
+ a += 1
+ f.close()
+
+ b = number_rnaseq_id(tpm_file)
+
+ return a - b
+
+def get_key_value(s):
+ lst = s.split('=')
+ k, v = lst[0], lst[1]
+ return (k.strip(), v.strip())
+
+def get_value(s, delimit):
+ lst = s.split(delimit, 1)
+ return lst[1].strip()
+
+def validate_gene_file(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines: # check all lines
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) < 6:
+ print('Not enough fields: %s. Only %d are given. Each line must have gene_id, gene_name, chr, start, end, strand, description (optional). See prepare_gene_file.py in the documentation on how to prepare this file.' % (line, len(lst)))
+ sys.exit()
+
+def validate_parameter_for_buildcmatrix(fname):
+ # first the file must exist
+ if not os.path.exists(fname):
+ print('CANNOT FIND %s.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ location_count = 0
+ for line in lines:
+ line = line.strip()
+ if line.startswith('%%'):
+ k, v = get_key_value(line[2:])
+ d[k] = v
+ if k == 'GENE_FILE' or k == 'CHR_INFO':
+ if not os.path.exists(v):
+ print('%s does not exist.' % (v))
+ sys.exit()
+ if k == 'GENE_FILE':
+ validate_gene_file(v)
+ if k == 'DESTINATION':
+ if not os.path.isdir(v):
+ print('%s does not exist.' % (v))
+ sys.exit()
+ if k == 'TARGET_RANGE':
+ if int(v) <= 0:
+     print('Target range (%s) must be greater than 0.' % (v))
+ sys.exit()
+ if line.startswith('LOCATION:'):
+ v = get_value(line, ':')
+ location_count += 1
+ if not os.path.exists(v):
+ print('Location %s does not exist.' % (v))
+ sys.exit()
+
+ if not 'GENE_FILE' in d:
+ print('Must specify GENE_FILE.')
+ sys.exit()
+ if not 'DESTINATION' in d:
+ print('Must specify DESTINATION.')
+ sys.exit()
+ if not 'CHR_INFO' in d:
+ print('Must specify CHR_INFO.')
+ sys.exit()
+ if location_count == 0:
+ print('Must contain at least one ChIP-seq.')
+ sys.exit()
+
+
+def validate_parameter_for_buildrmatrix(fname):
+ # first the file must exist
+ if not os.path.exists(fname):
+ print('CANNOT FIND %s.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ location_count = 0
+ for line in lines:
+ line = line.strip()
+ if line.startswith('%%'):
+ k, v = get_key_value(line[2:])
+ d[k] = v
+ if k == 'GENE_LIST':
+ if not os.path.exists(v):
+ print('%s does not exist.' % (v))
+ sys.exit()
+ if line.startswith('LOCATION:'):
+ v = get_value(line, ':')
+ location_count += 1
+ if not os.path.exists(v):
+ print('Location %s does not exist.' % (v))
+ sys.exit()
+
+ if not 'GENE_LIST' in d:
+ print('Must specify GENE_LIST.')
+ sys.exit()
+ if location_count == 0:
+ print('Must contain at least one RNA-seq.')
+ sys.exit()
+
+def validate_parameter_for_net(fname):
+ # first the file must exist
+ if not os.path.exists(fname):
+ print('CANNOT FIND %s.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ location_count = 0
+ for line in lines:
+ line = line.strip()
+ if line.startswith('%%'):
+ k, v = get_key_value(line[2:])
+ d[k] = v
+ if k == 'GENE_LIST':
+ if not os.path.exists(v):
+ print('%s does not exist.' % (v))
+ sys.exit()
+ if k == 'GENE_ID_AND_GENE_NAME':
+ if not os.path.exists(v):
+ print('%s does not exist.' % (v))
+ sys.exit()
+ if k == 'BINDING_INFO':
+ if not os.path.exists(v):
+ print('%s does not exist.' % (v))
+ sys.exit()
+ if k == 'EXPRESSION_INFO':
+ if not os.path.exists(v):
+ print('%s does not exist.' % (v))
+ sys.exit()
+ if k == 'BINDING_MATRIX':
+ if not os.path.exists(v):
+ print('%s does not exist.' % (v))
+ #print('Use python buildCmatrix.py parameter_for_buildCmatrix.txt > binding.txt to create binding.txt.')
+ if k == 'EXPRESSION_MATRIX':
+ if not os.path.exists(v):
+ print('%s does not exist.' % (v))
+ print('Use python buildRmatrix.py parameter_for_buildRmatrix.txt to create TPM.txt.')
+
+ if not 'GENE_LIST' in d:
+ print('Must specify GENE_LIST.')
+ sys.exit()
+ if not 'GENE_ID_AND_GENE_NAME' in d:
+ print('Must specify GENE_ID_AND_GENE_NAME.')
+ sys.exit()
+ if not 'BINDING_INFO' in d:
+ print('Must specify BINDING_INFO.')
+ sys.exit()
+ if not 'EXPRESSION_INFO' in d:
+ print('Must specify EXPRESSION_INFO.')
+ sys.exit()
+ if not 'BINDING_MATRIX' in d:
+     print('BINDING_MATRIX is not specified.')
+     print('Use python buildCmatrix.py parameter_for_buildCmatrix.txt > binding.txt to create binding.txt.')
+ if not 'EXPRESSION_MATRIX' in d:
+     print('EXPRESSION_MATRIX is not specified.')
+     print('Use python buildRmatrix.py parameter_for_buildRmatrix.txt to create TPM.txt.')
+
+
+def file_contains(fname, s):
+ if not os.path.exists(fname):
+     return False
+ f = open(fname)
+ for line in f:
+     if s in line:
+         f.close() # close the file before the early return
+         return True
+ f.close()
+ return False
+
+def check_required_packages():
+ # mixtools, networkx, rjson, flask
+
+ # python libraries
+ try:
+ import numpy
+ except ImportError:
+ print('numpy not available. Install it via: pip install numpy')
+ sys.exit()
+
+ try:
+ import networkx
+ except ImportError:
+ print('networkx not available. Install it via: pip install networkx')
+ sys.exit()
+
+ try:
+ import flask
+ except ImportError:
+ print('flask not available. Install it via: pip install flask')
+ sys.exit()
+
+ # R libraries
+ for pkg in ['mixtools', 'rjson', 'Rtsne']:
+     cmd = 'echo \"is.installed <- function(mypkg) is.element(mypkg, installed.packages()[,1]); is.installed(\'%s\')\" > ../Data/temp/check.r.library.R && Rscript ../Data/temp/check.r.library.R > ../Data/temp/check.r.library.result' % (pkg)
+     os.system(cmd)
+     if not file_contains('../Data/temp/check.r.library.result', 'TRUE'):
+         print('R package %s not available. Install it first.' % (pkg))
+         sys.exit()
+     os.remove('../Data/temp/check.r.library.result')
+
+## main
+## Shipped with this distribution: TPM.txt.gz, experiment.and.tissue.txt, rnaseq_info_database.json
+## BINDING_FILE='../Data/history/bind/binding.txt' is not given, as users should provide a BED file to create it.
+
+if not os.path.isdir(CODE_DIR): # make sure that CODE_DIR exists
+ print('ERROR: %s does not exist.' % (CODE_DIR))
+ sys.exit()
+os.chdir(CODE_DIR) # run this file at the Code directory
+if not os.path.isdir('../Data/C/Mapped'):
+ make_important_dirs() # make important directories (if non-existent) for holding all kinds of files, must be after os.chdir(CODE_DIR)
+ write_log_file('Copy BED files to Data/C/Mapped. Update the LOCATION field in Data/parameter/parameter_for_buildCmatrix.txt. Run local_network.py again.' , LOG_FILE)
+ sys.exit()
+ok_webapp_dir(PARAMETER_FOR_NET) # make sure Webapp contains necessary files
+
+# BEFORE WE START, WE SHOULD CHECK THAT THE REQUIRED SOFTWARE IS INSTALLED.
+# For example, numpy, mixtools, networkx, rjson, flask (TBD)
+# make sure required packages are available
+check_required_packages()
+
+# check rnaseq_info_database.txt and rnaseq_info_database.json
+if not os.path.exists(RNA_SEQ_INFO_DATABASE):
+ write_log_file('NEED CREATE %s.' % (RNA_SEQ_INFO_DATABASE), LOG_FILE)
+ sys.exit()
+if not os.path.exists(RNA_SEQ_INFO_DATABASE_JSON):
+ write_log_file('NEED CREATE %s.' % (RNA_SEQ_INFO_DATABASE_JSON), LOG_FILE)
+ sys.exit()
+
+# make sure parameter files are present and valid (very rudimentary check but important)
+validate_parameter_for_buildcmatrix(PARAMETER_FOR_BUILDCMATRIX)
+#validate_parameter_for_buildrmatrix(PARAMETER_FOR_BUILDRMATRIX)
+validate_parameter_for_net(PARAMETER_FOR_NET)
+
+# remove binding file, if any
+if os.path.exists(BINDING_FILE):
+ os.remove(BINDING_FILE)
+
+# update edges.txt, a merged file from several sources, HISTORY_DIR and HISTORY_DIR2.
+edge_file_lst = [] # collect edge files.
+for fname in glob.glob(os.path.join(HISTORY_DIR, 'edges.txt.*')): # edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+for fname in glob.glob(os.path.join(HISTORY_DIR2, 'edges.txt.*')): # edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+
+# make sure all needed files are present, if not, make them if possible
+miss_lst = all_files_present(FILE_LIST_TO_CHECK) # check if any of them are missing
+if (miss_lst != [] and edge_file_lst == []) or FORCE_MAKE_EDGES == 'YES':
+ write_log_file('Cannot find these required files: %s. The program will prepare them.' % (', '.join(miss_lst)), LOG_FILE)
+
+ # initially, we only have three parameter files, but not binding.txt
+ important_miss_number = 0
+ if PARAMETER_FOR_BUILDCMATRIX in miss_lst:
+ print('Must prepare %s first' % (PARAMETER_FOR_BUILDCMATRIX))
+ important_miss_number += 1
+
+ if PARAMETER_FOR_BUILDRMATRIX in miss_lst:
+ print('Must prepare %s first' % (PARAMETER_FOR_BUILDRMATRIX))
+ important_miss_number += 1
+
+ if PARAMETER_FOR_NET in miss_lst:
+ print('Must prepare %s first' % (PARAMETER_FOR_NET))
+ important_miss_number += 1
+
+ if important_miss_number > 0:
+ sys.exit() # need to provide all the above three files; otherwise cannot proceed
+
+ target_tf_fname = '../Data/information/target_tf.txt'
+ if os.path.exists(target_tf_fname):
+ os.remove(target_tf_fname)
+
+ if BINDING_FILE in miss_lst:
+ write_log_file('Make initial binding.txt ...', LOG_FILE)
+ cmd = 'python get_binding.py %s' % (PARAMETER_FOR_BUILDCMATRIX)
+ os.system(cmd)
+ cmd = 'python buildCmatrix.py %s > %s' % (PARAMETER_FOR_BUILDCMATRIX, BINDING_FILE)
+ os.system(cmd)
+ #print('IMPORATNT:Now check BINDING_MATRIX in %s is set %s and rerun local_network.py.' % (PARAMETER_FOR_NET, BINDING_FILE))
+ #sys.exit()
+
+ if TPM_FILE in miss_lst:
+ if not os.path.exists(TPM_FILE + '.gz'):
+ write_log_file('Cannot find %s. Try %s.' % (TPM_FILE + '.gz', TPM_FILE), LOG_FILE)
+ if not os.path.exists(TPM_FILE):
+ sys.exit()
+ else:
+ write_log_file('Unzip initial TPM.txt', LOG_FILE)
+ cmd = 'gunzip %s' % (TPM_FILE + '.gz')
+ os.system(cmd)
+
+ #print('IMPORTANT:Now check EXPRESSION_MATRIX in %s is set %s and rerun local_network.py.' % (PARAMETER_FOR_NET, TPM_FILE))
+ #sys.exit()
+
+ miss_lst2 = all_files_present(FILE_LIST_TO_CHECK) # check files again
+ if (len(miss_lst2) == 1 and miss_lst2[0] == EDGE_FILE) or (len(miss_lst2) == 0 and os.path.getmtime(EDGE_FILE) < os.path.getmtime(BINDING_FILE)): # all other files are ready except edges.txt, make one.
+ # should assert the files needed by the following scripts are present
+ # big correlation matrix may not eat all computer's memory, so need to modify the code. or test the user's memory first.
+ print('Make some edges, wait ... Change MAX_PROCESS in Code/create_edges4.py to change the number of processes. Default=10.')
+ cmd = 'nohup python create_edges4.py %s &' % (PARAMETER_FOR_NET) # this will create target_tf.txt needed by the following scripts
+ #os.system(cmd)
+ time.sleep(10)
+
+ # wait or make target_tf.txt
+ wait_sec = 0
+ WAIT_SECONDS = 120
+ while not os.path.exists(target_tf_fname):
+ time.sleep(WAIT_SECONDS)
+ wait_sec += WAIT_SECONDS
+ if wait_sec > WAIT_SECONDS * 5:
+ write_log_file('Make Data/information/target_tf.txt', LOG_FILE)
+ cmd = 'python make_target_tf.py %s > %s' % (PARAMETER_FOR_NET , target_tf_fname) # make target_tf.txt; CHANGE: better to make a temporary copy for this program
+ os.system(cmd)
+ break
+
+ time.sleep(5)
+ write_log_file('Create group-specific (fixed) edges.txt using new TPM.txt (size=%d).' % (number_rnaseq_id(TPM_FILE)), LOG_FILE)
+ cmd = 'Rscript correlation_per_group_fixed_number.R &'
+ os.system(cmd)
+
+ time.sleep(5)
+ write_log_file('Create SIMPLE edges.txt using new TPM.txt (size=%d). SIMPLE means using all RNA-seq samples.' % (number_rnaseq_id(TPM_FILE)), LOG_FILE)
+ cmd = 'python create_edges0.py %s &' % (PARAMETER_FOR_NET) # use all samples in TPM.txt; results will be written to history/edges.txt.simple.correlation.all.conditions
+ os.system(cmd)
+
+ time.sleep(5)
+ write_log_file('Create tissue-specific edges.txt using new TPM.txt (size=%d).' % (number_rnaseq_id(TPM_FILE)), LOG_FILE)
+ cmd = 'python create_edges0B.py %s &' % (PARAMETER_FOR_NET) # call correlation_per_tissue.R
+ os.system(cmd)
+
+ time.sleep(5)
+ write_log_file('Create group-specific edges.txt using new TPM.txt (size=%d).' % (number_rnaseq_id(TPM_FILE)), LOG_FILE)
+ cmd = 'Rscript correlation_per_group.R &'
+ os.system(cmd)
+
+ time.sleep(5)
+ write_log_file('Create wedge-shape edges.txt using new TPM.txt (size=%d).' % (number_rnaseq_id(TPM_FILE)), LOG_FILE)
+ cmd = 'Rscript wedge.R &'
+ os.system(cmd)
+
+
+# make json (sliced TPM.txt) and json2 (sliced binding.txt) if they don't exist
+if os.path.exists(TPM_FILE):
+ if not os.path.isdir('../Data/history/expr/json'):
+ write_log_file('Make directory ../Data/history/expr/json', LOG_FILE)
+ cmd = 'python slice_TPM_to_JSON.py %s' % (PARAMETER_FOR_NET)
+ os.system(cmd)
+
+if os.path.exists(BINDING_FILE):
+ if not os.path.isdir('../Data/history/bind/json2'):
+ write_log_file('Make directory ../Data/history/bind/json2', LOG_FILE)
+ cmd = 'python slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET)
+ os.system(cmd)
+ elif os.path.getmtime('../Data/history/bind/json2') < os.path.getmtime(BINDING_FILE):
+ write_log_file('Refresh ../Data/history/bind/json2 (binding.txt is newer)', LOG_FILE)
+ cmd = 'python slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET)
+ os.system(cmd)
+
+# if the file timestamp does not exist, create one
+if not os.path.exists(FILE_TIMESTAMP):
+ record_file_time(FILE_LIST_TO_CHECK)
+
+# get update time of must-have files
+timestamp_dict = read_file_timestamp(FILE_TIMESTAMP)
+
+# update edges.txt, a merged file from several sources, HISTORY_DIR and HISTORY_DIR2.
+edge_file_lst = [] # collect edge files.
+for fname in glob.glob(os.path.join(HISTORY_DIR, 'edges.txt.*')): # edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+for fname in glob.glob(os.path.join(HISTORY_DIR2, 'edges.txt.*')): # edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+if edge_file_lst == []:
+ write_log_file('No files to merge. Run this script again a few hours later.', LOG_FILE)
+ sys.exit()
+
+
+
+# merge edge files
+write_log_file('Merge edge files ...', LOG_FILE)
+merge_edges(edge_file_lst, EDGE_FILE, SAMPLE_SIZE_FILE) # merge individual files to EDGE_FILE, so EDGE_FILE is always updated. A new field, metric, is appended.
+
+# delete edges if their supporting ChIP expriments become obsolete
+#write_log_file('Delete edges ...', LOG_FILE)
+para_c_dict = make_data_dict(PARAMETER_FOR_BUILDCMATRIX)
+bad_chip_ids = get_bad_chip_ids(para_c_dict) # e.g., those marked with obsolete in parameter_for_buildCmatrix.txt
+nonexistent_chip_ids = get_nonexistent_chip_ids(para_c_dict, EDGE_FILE) # in edges.txt but not in parameter_for_buildCmatrix.txt, either because it is removed or commented out
+bad_chip_ids.extend(nonexistent_chip_ids)
+bad_chip_ids = list(set(bad_chip_ids)) # unique elements
+update_date_chip_ids = get_update_date_chip_ids(para_c_dict) # a dictionary, get the update date of each ChIP experiment with update in Note field.
+rm_chip_ids_from_edge_file(EDGE_FILE, bad_chip_ids, update_date_chip_ids) # edges.txt is updated, with bad chip ids removed.
+nlines = num_line(EDGE_FILE)
+write_log_file('Number of total edges %d.' % (nlines), LOG_FILE)
+if nlines == 0:
+ write_log_file('Empty edges.txt.', LOG_FILE)
+ sys.exit()
+
+
+# get a list of updated files
+# updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
+updated_file_list = ['edges.txt']
+# check edges.txt, if updated, re-make static html summary
+if 'edges.txt' in updated_file_list: # if edges.txt is updated
+ write_log_file('Rebuild html pages ...', LOG_FILE)
+ cmd = 'python html_network.py -f %s -r %s -c %s -n %s' % (EDGE_FILE, PARAMETER_FOR_BUILDRMATRIX, PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_NET) # produce edges and summary.html
+ os.system(cmd)
+
+ # update webapp folder
+ cmd = 'cp %s ../Webapp/static/edges/' % (EDGE_FILE)
+ os.system(cmd)
+
+ # kill and restart start_webapp.py
+ # write_log_file('Terminate start_webapp ...', LOG_FILE)
+ # cmd = 'kill $(ps aux | grep \'[p]ython start_webapp\' | awk \'{print $2}\')'
+ # os.system(cmd)
+ # write_log_file('Restart start_webapp ...', LOG_FILE)
+ # os.chdir('../Webapp')
+ # os.system('python start_webapp.py &')
+ # os.chdir(CODE_DIR) # return to CODE_DIR
+
+
+# update time stamp file
+record_file_time(FILE_LIST_TO_CHECK)
+
+# remove .R files in Data/temp that are more than one day old
+cmd = 'find %s -mtime +1 -name \"*.R\" -delete' % (TEMP_DIR)
+os.system(cmd)
+
+write_log_file('Network update done at %s.\n\n' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S')), LOG_FILE)
+
+print('Creating edges... Wait one hour to have edges ready.\nUncomment app.run(debug=True) in Webapp/start_webapp.py and comment out the previous app.run().\nTo display the network, cd Webapp && python start_webapp.py. Enter http://127.0.0.1:5000 in the address bar in your browser.')
diff --git a/Code/make_graphviz_file3B.py b/Code/make_graphviz_file3B.py
new file mode 100644
index 0000000..3ccd870
--- /dev/null
+++ b/Code/make_graphviz_file3B.py
@@ -0,0 +1,236 @@
+# Usage: python make_graphviz_file3B.py AT1G19850
+#
+# Make plot: python make_graphviz_file3B.py AT1G65480 | dot -Tpdf -o result.pdf result.gv
+# python make_graphviz_file3B.py AT1G65480 | neato -Goverlap=false -Tpdf -o result.pdf result.gv
+#
+# The plot is saved in result.pdf, and each little grey box contains a tissue name.
+# Change 'pdf' to 'svg' to get a vector image. The tissue name is in a yellow box. A double circle represents both a regulator and a regulatee.
+# An egg represents a regulatee. An oval represents a regulator. A yellow arrow means regulating; a red arrow means being regulated.
+#
+# Input file is specified in variable edge_file (result.skeleton.txt). This file is generated by test_network4.py.
+# The tissue name is contained in the lines starting with '##', e.g., '##TF skeleton size in shoot: 15735.' contains 'shoot'.
+# Edit the variable tissue_colour_dict and tissue_lst in function get_tissue_from_fname() to match with the tissue names.
+#
+#
+# Purpose: Generate result.gv for Graphviz software dot. The single
+# parameter AT1G19850 is a TF. result.gv contains all edges from/to the TF
+# in each tissue. A tissue is a subgraph. We can
+# convert result.gv to a figure using 'dot -Tpdf -o result.pdf
+# result.gv'.
+#
+# Created 6 July 2017, hui, slcu
+# Last modified 11 July 2017, hui, slcu
+
+import random
+import numpy as np
+import sys
+from geneid2name import make_gene_name_AGI_map_dict, get_gene_name
+
+NUM_TARGETS_CUTOFF = 5
+
+
+def get_tissue_from_fname(fname):
+ tissue_lst = [
+ 'seedling',
+ 'meristem',
+ 'flower',
+ 'aerial',
+ 'shoot',
+ 'seed',
+ 'leaf',
+ 'root',
+ 'stem']
+ for x in tissue_lst:
+ if x in fname:
+ return x
+ return 'unknown'
+
+
+def get_edge(fname):
+ ''' Return d = {'flower':{'tf':[target1, target2, ...]}, 'seed':{}, ...}.
+ Lines starting with '#' (e.g. '##TF skeleton size in shoot: ...') set the tissue for the edge lines that follow. '''
+ d = {}
+ d2 = {} # the actual correlation coefficient, absolute value
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ if not line.startswith('#'):
+ lst = line.split('\t')
+ target = (lst[0].split('_'))[0]
+ tf = (lst[1].split('_'))[0]
+ if not tf in d[tissue]:
+ d[tissue][tf] = [target]
+ else:
+ d[tissue][tf].append(target)
+
+ strength = abs(float(lst[2]))
+ if not tf in d2[tissue]:
+ d2[tissue][tf] = {target:strength}
+ else:
+ d2[tissue][tf][target] = strength
+
+ else:
+ tissue = get_tissue_from_fname(line)
+ d[tissue] = {}
+ d2[tissue] = {}
+ f.close()
+ return d, d2
+
+
+def in_same_tissue(source, target, node_dict):
+ return node_dict[source] == node_dict[target]
+
+def make_label(a, b):
+ if b == '.':
+ return a
+ else:
+ lst = b.split(';')
+ return a + ' ' + lst[0]
+
+
+def has_predecessor(tf, d):
+ for k in d:
+ if tf in d[k] and k != tf:
+ return True
+ return False
+
+def get_num_successors(tf, d):
+ if not tf in d:
+ return 0
+ return len(d[tf])
+
+def get_shape(tf, d):
+ ''' d = {'tf':[target1, target2]} '''
+ p = has_predecessor(tf, d)
+ s = get_num_successors(tf, d)
+ if s > 0 and p: # tf is both a regulator and a regulatee
+ return 'doublecircle'
+ if s > 0 and not p: # a regulator
+ return 'oval' # regulator
+ if p and s == 0: # a regulatee
+ return 'egg' # regulatee
+ return 'point'
+
+def get_color(tf, edge_dict, tissue):
+ #colours = ['darkolivegreen1', 'darkolivegreen2', 'darkolivegreen3', 'darkolivegreen4', 'gold', 'gold1', 'gold2', 'gold3', 'gold4', 'darkgoldenrod', 'darkgoldenrod4']
+ #colours = ['snow', 'snow1', 'snow2', 'snow3', 'snow4', 'gold', 'gold1', 'gold2', 'gold3', 'gold4']
+ colours = ['springgreen', 'springgreen1', 'springgreen2', 'springgreen3', 'springgreen4', 'gold', 'gold1', 'gold2', 'gold3', 'gold4'] # a darker colour means the TF is more important in that tissue
+ d = {}
+ total = 0
+ for k in edge_dict:
+ n = get_num_successors(tf, edge_dict[k])
+ d[k] = n
+ total += n
+ #print('%s %d' % (k, n))
+ if total == 0: # no successor
+ return 'azure'
+ return colours[min(int(10 * 1.0 * d[tissue] / total), len(colours)-1)]
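+# Example of the scaling above: a TF with 10 successors in total, 8 of them in
+# 'root', gets index min(int(10*8/10), 9) = 8, i.e. 'gold3' in the root
+# subgraph; a TF with no successors anywhere gets 'azure'.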
+
+def write_graphviz_file(fname, edge_dict, colour_dict, agi2name_dict, query_tf):
+
+ f = open(fname, 'w')
+
+ graph_dict = {} # record for each tissue the graph
+ last_node = {} # record the last node added in each subgraph
+ for k in edge_dict:
+ graph_dict[k] = {'head':'', 'nodes':[], 'edges':[]}
+
+ for k in edge_dict: # k is tissue
+ node_added_dict = {} # make sure we don't add the same node twice
+ edge_added_dict = {} # make sure an edge is not added twice
+ tissue_node = '%s_node' % (k)
+ graph_dict[k]['head'] = ''
+ d = edge_dict[k] # d = {'tf1':[target1, target2, ...]}
+ tf_lst = d.keys()
+ for tf in tf_lst:
+ node_tf = tf + '_' + k
+ if tf == query_tf:
+ ll = make_label(tf, get_gene_name(tf, agi2name_dict))
+ shape = get_shape(tf, d)
+ color = get_color(tf, edge_dict, k) # shape's boundary colour
+ if not tf in node_added_dict:
+ graph_dict[k]['nodes'].append(' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_tf, ll, color, colour_dict[k], shape))
+ node_added_dict[tf] = 'YES'
+ for target in d[tf]:
+ ll = make_label(target, get_gene_name(target, agi2name_dict))
+ node_target = target + '_' + k
+ shape = get_shape(target, d)
+ color = get_color(target, edge_dict, k)
+ if not target in node_added_dict:
+ graph_dict[k]['nodes'].append(' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_target, ll, color, colour_dict[k], shape))
+ node_added_dict[target] = 'YES'
+ last_node[k] = node_target
+
+ edge_key = tf + target
+ if not edge_key in edge_added_dict:
+ graph_dict[k]['edges'].append(' \"%s\" -> \"%s\" [color=%s];\n' % (node_tf, node_target, 'gold')) # out-going edge
+ edge_added_dict[edge_key] = 'YES'
+
+ else: # check if tf is a target of another tf
+ for target in d[tf]:
+ if target == query_tf:
+ ll = make_label(tf, get_gene_name(tf, agi2name_dict))
+ node_tf = tf + '_' + k
+ shape = get_shape(tf, d)
+ color = get_color(tf, edge_dict, k)
+ node_target = target + '_' + k
+ if not tf in node_added_dict:
+ graph_dict[k]['nodes'].append(' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_tf, ll, color, colour_dict[k], shape))
+ node_added_dict[tf] = 'YES'
+ last_node[k] = node_target
+ edge_key = tf + target
+ if not edge_key in edge_added_dict:
+ graph_dict[k]['edges'].append(' \"%s\" -> \"%s\" [color=%s];\n' % (node_tf, node_target, 'red'))
+
+ if graph_dict[k]['nodes'] != []:
+ node_label = k + '_label_node'
+ graph_dict[k]['nodes'].append(' \"%s\" [label=\"%s\", shape=box, color=yellow, style=filled, height=0.8, width=1.6];\n' % (node_label, k.upper()))
+
+ # write graphviz file
+ s0 = 'digraph G {\n graph[splines=true, ranksep=2, fontname=Arial];\n node[fontname=Arial];\n'
+ s0 += ' {rank=sink; ' # move label node to bottom
+ for k in last_node:
+ if graph_dict[k]['nodes'] != []:
+ node_label = k + '_label_node'
+ s0 += '%s;' % (node_label)
+ s0 += '}\n'
+ for k in graph_dict:
+ s0 += graph_dict[k]['head']
+ node_label = k + '_label_node'
+ for x in graph_dict[k]['nodes']:
+ s0 += x
+ for x in graph_dict[k]['edges']:
+ s0 += x
+ if k in last_node:
+ s0 += ' \"%s\" -> \"%s\" [arrowhead=none, style=invis];\n' % (last_node[k], node_label)
+
+ s0 += '}\n'
+ f.write(s0)
+ f.close()
+
+
+# main
+
+GENE_ID_TO_GENE_NAME = '/home/hui/network/v03/Data/information/AGI-to-gene-names_v2.txt'
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)
+
+edge_file = 'result.skeleton.txt' # prepared by test_network4.py
+
+tissue_colour_dict = {
+ 'seedling':'greenyellow',
+ 'meristem':'skyblue4',
+ 'flower':'lightpink',
+ 'aerial':'cyan',
+ 'shoot':'forestgreen',
+ 'seed':'black',
+ 'leaf':'green',
+ 'root':'gold',
+ 'stem':'orange4'}
+
+if len(sys.argv) < 2:
+ print('Need to specify a gene ID, e.g., AT1G19850.')
+ sys.exit()
+else:
+ query_tf = sys.argv[1]
+
+edge_dict, edge_dict_r = get_edge(edge_file)
+write_graphviz_file('result.gv', edge_dict, tissue_colour_dict, agi2name_dict, query_tf)
diff --git a/Code/make_graphviz_file3C.py b/Code/make_graphviz_file3C.py
new file mode 100644
index 0000000..125c462
--- /dev/null
+++ b/Code/make_graphviz_file3C.py
@@ -0,0 +1,273 @@
+# Usage: python make_graphviz_file3C.py AT1G19850
+#
+# Make plot: python make_graphviz_file3C.py AT1G65480 | sfdp -Goverlap=false -Tpdf -o result.flower.pdf result.flower.gv
+# unflatten -f -l 3 result.flower.gv | dot -Tsvg -o result.flower.svg
+#
+# Purpose: Generate one Graphviz file per tissue (result.<tissue>.gv) for the
+# Graphviz program dot. The single parameter AT1G19850 is a TF.
+# The neighbours of the query TF's neighbours are also shown.
+#
+# Created 10 July 2017, hui, slcu
+
+import random
+import numpy as np
+import sys
+from geneid2name import make_gene_name_AGI_map_dict, get_gene_name
+
+PERCENT = 1
+NUM_TARGETS_CUTOFF = 5
+
+def get_tf_tissue(fname):
+ d = {}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ head = lines[0].strip()
+ head_lst = head.split('\t')
+ head_lst = head_lst[1:] # remove TF
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split('\t')
+ lst2 = lst[1:]
+ lst3 = [int(x) for x in lst2]
+ lst4 = np.array(lst3)
+ median_val = np.median(lst4)
+ tissue = []
+ for i in range(len(lst2)):
+ if int(lst2[i]) >= max(median_val, 1) or int(lst2[i]) >= NUM_TARGETS_CUTOFF:
+ tissue.append(head_lst[i])
+ tf = (lst[0].split())[0]
+ d[tf] = tissue # tf is assigned a list of tissues; tissues whose node degree reaches the median (or NUM_TARGETS_CUTOFF) are selected.
+ return d
+
+
+def get_tissue_from_fname(fname):
+ tissue_lst = [
+ 'seedling',
+ 'meristem',
+ 'flower',
+ 'aerial',
+ 'shoot',
+ 'seed',
+ 'leaf',
+ 'root',
+ 'stem']
+ for x in tissue_lst:
+ if x in fname:
+ return x
+ return 'unknown'
+
+
+def get_edge(fname):
+ ''' Return d = {'flower':{'tf':[target1,target2, ...]}, 'seed':{}}. Assumes each
+ block of edge lines is preceded by a '##' header line that sets the current tissue. '''
+ d = {}
+ d2 = {} # the actual correlation coefficient, absolute value
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ if not line.startswith('#'):
+ lst = line.split('\t')
+ target = (lst[0].split('_'))[0]
+ tf = (lst[1].split('_'))[0]
+ if not tf in d[tissue]:
+ d[tissue][tf] = [target]
+ else:
+ d[tissue][tf].append(target)
+
+ strength = abs(float(lst[2]))
+ if not tf in d2[tissue]:
+ d2[tissue][tf] = {target:strength}
+ else:
+ d2[tissue][tf][target] = strength
+
+ else:
+ tissue = get_tissue_from_fname(line)
+ d[tissue] = {}
+ d2[tissue] = {}
+ f.close()
+ return d, d2
+
+
+def in_same_tissue(source, target, node_dict):
+ return node_dict[source] == node_dict[target]
+
+
+def make_label(a, b):
+ if b == '.':
+ return a
+ else:
+ lst = b.split(';')
+ return a + '_' + lst[0]
+
+
+def has_predecessor(tf, d):
+ for k in d:
+ if tf in d[k] and k != tf:
+ return True
+ return False
+
+def get_num_successors(tf, d):
+ if not tf in d:
+ return 0
+ return len(d[tf])
+
+def get_shape(tf, d):
+ ''' d = {'tf':[target1, target2]} '''
+ p = has_predecessor(tf, d)
+ s = get_num_successors(tf, d)
+ if s > 0 and p:
+ return 'doublecircle'
+ if s > 0:
+ return 'ellipse' # regulator
+ if p:
+ return 'egg' # regulatee
+ return 'point' # neither a regulator nor a regulatee; avoid handing Graphviz shape=None
+
+
+def get_color(tf, edge_dict, tissue):
+ #colours = ['darkolivegreen1', 'darkolivegreen2', 'darkolivegreen3', 'darkolivegreen4', 'gold', 'gold1', 'gold2', 'gold3', 'gold4', 'darkgoldenrod', 'darkgoldenrod4']
+ #colours = ['snow', 'snow1', 'snow2', 'snow3', 'snow4', 'gold', 'gold1', 'gold2', 'gold3', 'gold4']
+ colours = ['springgreen', 'springgreen1', 'springgreen2', 'springgreen3', 'springgreen4', 'gold', 'gold1', 'gold2', 'gold3', 'gold4']
+ d = {}
+ total = 0
+ for k in edge_dict:
+ n = get_num_successors(tf, edge_dict[k])
+ d[k] = n
+ total += n
+ #print('%s %d' % (k, n))
+ if total == 0:
+ return 'azure'
+ return colours[min(int(10 * 1.0 * d[tissue] / total), len(colours)-1)]
+
+
+def make_more_string(n, tissue, edge_dict, colour_dict, agi2name_dict, query_tf):
+ result = ''
+ d = edge_dict[tissue]
+ if n in d:
+ for target in d[n]:
+ ll = make_label(target, get_gene_name(target, agi2name_dict))
+ shape = get_shape(target, d)
+ color = get_color(target, edge_dict, tissue)
+ node_target = target + '_' + tissue + '.2'
+ if target != query_tf:
+ result += ' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_target, ll, color, colour_dict[tissue], shape)
+
+ node_n = n + '_' + tissue
+ if random.uniform(0, 1) <= PERCENT:
+ result += ' \"%s\" -> \"%s\" [color=%s];\n' % (node_n, node_target, 'gold')
+
+ for tf in d:
+ if tf != query_tf:
+ if n in d[tf]: # n is successor
+ ll = make_label(tf, get_gene_name(tf, agi2name_dict))
+ shape = get_shape(tf, d)
+ color = get_color(tf, edge_dict, tissue)
+ node_tf = tf + '_' + tissue + '.2'
+ result += ' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_tf, ll, color, colour_dict[tissue], shape)
+
+ node_n = n + '_' + tissue
+ if random.uniform(0, 1) <= PERCENT:
+ result += ' \"%s\" -> \"%s\" [color=%s];\n' % (node_tf, node_n, 'red')
+
+ return result
+
+
+def write_graphviz_file(fname, edge_dict, colour_dict, agi2name_dict, query_tf):
+
+
+ graph_dict = {}
+ more = {}
+ for k in edge_dict:
+ graph_dict[k] = {'head':'', 'nodes':[], 'edges':[]}
+
+ for k in edge_dict: # k is tissue
+ neighbours = []
+ node_added_dict = {}
+ tissue_node = '%s_node' % (k)
+ graph_dict[k]['head'] = ''
+ d = edge_dict[k]
+ tf_lst = d.keys()
+ for tf in tf_lst:
+ node_tf = tf + '_' + k
+ if tf == query_tf:
+ ll = make_label(tf, get_gene_name(tf, agi2name_dict))
+ shape = get_shape(tf, d)
+ color = get_color(tf, edge_dict, k)
+ if not tf in node_added_dict:
+ graph_dict[k]['nodes'].append(' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_tf, 'Query gene: '+ll, 'DeepSkyBlue', colour_dict[k], shape))
+ node_added_dict[tf] = 'YES'
+ for target in d[tf]:
+ ll = make_label(target, get_gene_name(target, agi2name_dict))
+ node_target = target + '_' + k
+ shape = get_shape(target, d)
+ color = get_color(target, edge_dict, k)
+ if random.uniform(0, 1) <= PERCENT:
+ if not target in node_added_dict:
+ neighbours.append(target)
+ graph_dict[k]['nodes'].append(' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_target, ll, color, colour_dict[k], shape))
+ node_added_dict[target] = 'YES'
+ graph_dict[k]['edges'].append(' \"%s\" -> \"%s\" [color=%s];\n' % (node_tf, node_target, 'gold'))
+ else: # check if tf is a target of another tf
+ for target in d[tf]:
+ if target == query_tf:
+ ll = make_label(tf, get_gene_name(tf, agi2name_dict))
+ node_tf = tf + '_' + k
+ shape = get_shape(tf, d)
+ color = get_color(tf, edge_dict, k)
+ node_target = target + '_' + k
+ if random.uniform(0, 1) <= PERCENT:
+ if not tf in node_added_dict:
+ neighbours.append(tf)
+ graph_dict[k]['nodes'].append(' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_tf, ll, color, colour_dict[k], shape))
+ node_added_dict[tf] = 'YES'
+ graph_dict[k]['edges'].append(' \"%s\" -> \"%s\" [color=%s];\n' % (node_tf, node_target, 'red'))
+ neighbours = list(set(neighbours))
+ more[k] = ''
+ for n in neighbours:
+ more[k] += make_more_string(n, k, edge_dict, colour_dict, agi2name_dict, query_tf)
+
+
+ for k in graph_dict:
+ if graph_dict[k]['nodes'] != []:
+ f = open(fname + '.' + k + '.gv', 'w')
+ s0 = 'digraph G {\n graph[splines=true, ranksep=3, fontname=Arial];\n node[fontname=Arial];\n'
+ s0 += graph_dict[k]['head']
+ for x in graph_dict[k]['nodes']:
+ s0 += x
+ for x in graph_dict[k]['edges']:
+ s0 += x
+ if k in more:
+ s0 += more[k]
+ s0 += '}\n'
+ f.write(s0)
+ f.close()
+
+# main
+
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)
+
+edge_file = 'result.skeleton.txt' # prepared by test_network4.py
+node_file = 'result.out.txt' # prepared by test_network4.py
+
+tissue_colour_dict = {
+ 'seedling':'greenyellow',
+ 'meristem':'skyblue4',
+ 'flower':'lightpink',
+ 'aerial':'cyan',
+ 'shoot':'forestgreen',
+ 'seed':'black',
+ 'leaf':'green',
+ 'root':'gold',
+ 'stem':'orange4'}
+
+
+
+if len(sys.argv) < 2:
+ print('Need to specify a gene ID, e.g., AT1G19850.')
+ sys.exit()
+else:
+ query_tf = sys.argv[1]
+
+#tf_tissue_dict = get_tf_tissue(node_file )
+edge_dict, edge_dict_r = get_edge(edge_file)
+write_graphviz_file('result', edge_dict, tissue_colour_dict, agi2name_dict, query_tf)
diff --git a/Code/make_parameter_bw.py b/Code/make_parameter_bw.py
new file mode 100644
index 0000000..bfb7074
--- /dev/null
+++ b/Code/make_parameter_bw.py
@@ -0,0 +1,127 @@
+# Usage: python make_parameter_bw.py
+# Edit the variable BW_PATHS, which is a file containing (almost all) paths to in-house bw files. Edit PARENT_DIR, which
+# will be used to make full path to the bw files. Amend name_map_dict.
+# Purpose
+# -------
+# Make a parameter file for buildCmatrix.py. For example:
+#
+# @C0001100007141
+# PROTEIN_ID:
+# PROTEIN_NAME:HTR13
+# DATA_NAME:2_71C_10_IP-HTR13-17c
+# DATA_FORMAT:bw
+# DESCRIPTION:in house ChIP data
+# LOCATION:/media/pw_synology3/PW_HiSeq_data/ChIP-seq/Mapped_data/2_71C/20150603_H3_2nd_rep_mapped_bw/ChIP-seq_h31_h33_col_rep1_20150607_max_fragment_500_bw/2_71C_10_IP-HTR13-17c_raw_trimmo_paired_truseq3-PE-2_2_10_5_1_bowtie2_TAIR10_ensembl_nomixed_sorted_rmdup_picard_genomenorm.bw
+# NOTE:
+#
+# 29 NOV 2016, hui, home
+
+import sys, glob, os, operator
+from geneid2name import make_gene_name_AGI_map_dict
+
+BW_PATHS = 'bwfiles_unique.txt' # Ideally, the input file should be sorted by library number, then sample number.
+PARENT_DIR = '/media/pw_synology3/PW_HiSeq_data/ChIP-seq/Mapped_data'
+
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
+
+def get_library_id(s):
+ index = s.find('C') # in-house chip library id ends with a letter C
+ if index > 0:
+ return s[:index]
+ else:
+ return s
+
+def get_bw_file_name(lst):
+ for x in lst:
+ if '.bw' in x:
+ return x
+ return ''
+
+def get_sample_name(s):
+ index = s.find('_raw') # a bw file name usually contains '_raw'; keep the part before it if present
+ if index > 0:
+ return s[:index]
+ else:
+ return s
+
+def get_sample_number(s):
+ index = s.find('_S') # the sample number follows '_S'
+ if index > 0:
+ if s[index+2].isdigit() and s[index+3].isdigit(): # two digit sample number
+ return s[index+2:index+4]
+ else: # one digit sample number
+ return '0' + s[index+2]
+ return '25' # fallback sample number when no '_S<number>' tag is found
+
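+# Example: get_sample_number('x_S9_y') returns '09' and get_sample_number('x_S12_y')
+# returns '12'; '25' is the fallback when no '_S<number>' tag is present.
+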
+def convert_name(s, d):
+ ''' convert s to gene id '''
+ if s.upper() in d:
+ return d[s.upper()]
+ else:
+ return ' '
+
+def get_gene_id_and_name(s, d):
+ ''' Return (protein name, gene id) for a given sample file name. '''
+ for k in sorted(d.keys(),reverse=True):
+ if k.isdigit() and k.lower() in s.lower(): # k is a number, e.g., 833
+ return (d[k], convert_name(d[k], agi2name_dict))
+ if k.lower() in s.lower():
+ return (k, convert_name(k, agi2name_dict))
+ return ('name_unknown', 'id_unknown')
+
+###
+
+# name_map_dict: the key is a sample name; the value is empty if the key is a
+# protein name, or the protein name if the key is a database number (e.g., 833).
+# If the key is a protein name, the program tries to find its gene id (AT...).
+# If the key is a database number, the program first gets its protein name and
+# then tries to find its gene id. If not found, PROTEIN_ID or PROTEIN_NAME is
+# set to *_unknown; search for _unknown to annotate those entries manually.
+name_map_dict = {'HTA11':'', 'H3':'', 'HTR13':'', 'HSP70':'', 'KIN10':'', 'EPM1':'', 'ELF3':'', 'phyB':'', '833':'KIN10', 'hos':'', 'HOS':'', 'HOS1':'', 'LUX':'', 'YHB':'','HSF1':'','3129':'HSP90','838':'phyB', '1506':'HSF1', 'SUMO':'', 'TIR1':'', 'PIF4':'', 'PIF':'','H2A':'', 'H2AZ':'', 'MNase':'', '544':'ELF4', '834':'PIF4', '745':'ELF3', 'EC1_S1':'ELF3', 'EC2_S2':'ELF3', '1166':'LUX', '1167':'LUX', '3239':'REV', '1281':'MPK6', '1278':'SEP3', '1279':'FD', '1280':'FD-like', '1283':'HSP70', '1284':'DELLA', '1762':'FT', 'LFY':'', 'TFL1':''}
+
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)
+f = open(BW_PATHS)
+
+d_cid = {}
+for line in f:
+ line = line.strip()
+ lst = line.split('/')
+ bwfile = get_bw_file_name(lst)
+ sample_name = get_sample_name(bwfile)
+ sample_no = get_sample_number(bwfile)
+ library_id = get_library_id(lst[0])
+ name, gid = get_gene_id_and_name(sample_name, name_map_dict)
+ path = os.path.join(PARENT_DIR, line)
+
+ if '2_' in library_id: # some in-house library id starts with 2_, indicating that it is a replicate.
+ index = library_id.find('_')
+ library_id = library_id[index+1:]
+ cid = '@C00011000%s%s' % (library_id.zfill(3), sample_no) # the first of the last nine digits is 1 to mark a replicate; the library number takes 3 digits and the sample number 2 digits
+ else:
+ cid = '@C00010000%s%s' % (library_id.zfill(3), sample_no)
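+ # Worked example (matching the header above): library directory '2_71C' gives
+ # library_id '71' and takes the replicate branch, so with sample number '41'
+ # the id is '@C00011000' + '071' + '41' = '@C0001100007141'.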
+
+ # handle duplicate library id
+ if not cid in d_cid:
+ d_cid[cid] = 1
+ else: # produce a new library id
+ d_cid[cid] += 1
+ count = 26 # new sample numbers for duplicates start from 26
+ while True:
+ new_cid = cid[:-2] + '%02d' % (count)
+ if count >= 100:
+ print('Error: count must be less than 100.')
+ sys.exit()
+ count += 1
+ if not new_cid in d_cid:
+ break
+ cid = new_cid
+ d_cid[cid] = 1
+
+ # print contents for the parameter file
+ print(cid)
+ print('PROTEIN_ID:%s' % (gid))
+ print('PROTEIN_NAME:%s' % (name))
+ print('DATA_NAME:%s' % (sample_name))
+ print('DATA_FORMAT:%s' % ('bw'))
+ print('DESCRIPTION:in house ChIP data')
+ print('LOCATION:%s' % (path))
+ print('NOTE:')
+ print('')
+
+f.close()
diff --git a/Code/make_parameter_dapseq2.py b/Code/make_parameter_dapseq2.py
new file mode 100644
index 0000000..946ab6f
--- /dev/null
+++ b/Code/make_parameter_dapseq2.py
@@ -0,0 +1,57 @@
+import sys, glob, os, operator
+from geneid2name import make_gene_name_AGI_map_dict
+
+DAPSEQ_DIR = '../Data/C/Mapped/dapseq/peaks'
+
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
+
+
+def make_dapseq_dictionary(dirname):
+
+ d = {}
+
+ files = glob.glob(os.path.join(dirname, '*/*/*/*.narrowPeak'))
+
+ for f in files:
+ lst = f.split('/')
+ tf_name = lst[-3]
+ if not tf_name in d:
+ d[tf_name] = f
+ else:
+ print('ERROR: transcription factor name not unique.')
+ sys.exit()
+
+ return d
+
+
+d = make_dapseq_dictionary(DAPSEQ_DIR)
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)
+
+
+count = 1
+for k, v in sorted(d.items(), key=operator.itemgetter(0)):
+ cid = 'C0002%09d' % (count)
+ count += 1
+ print('@%s' % (cid))
+ g = k.split('_')[0]
+ g = g.upper()
+
+
+ if g.startswith('AT'):
+ print('PROTEIN_ID:%s' % (g))
+ elif g in agi2name_dict and g != agi2name_dict[g]:
+ print('PROTEIN_ID:%s' % (agi2name_dict[g]))
+ else:
+ print('PROTEIN_ID:%s' % (g))
+
+ if g.startswith('AT') and g in agi2name_dict and g != agi2name_dict[g]:
+ print('PROTEIN_NAME:%s' % (agi2name_dict[g]))
+ else:
+ print('PROTEIN_NAME:%s' % (g))
+
+ print('DATA_NAME:%s' % (k))
+ print('DATA_FORMAT:%s' % ('narrowPeak'))
+ print('DESCRIPTION:dapseq')
+ print('LOCATION:%s' % (v))
+ print('NOTE:')
+ print('')
diff --git a/Code/make_parameter_dapseq3.py b/Code/make_parameter_dapseq3.py
new file mode 100644
index 0000000..405b3dc
--- /dev/null
+++ b/Code/make_parameter_dapseq3.py
@@ -0,0 +1,75 @@
+# Usage: python make_parameter_dapseq3.py
+# Because DAP-seq does not cover all TFs, include other TFs as well. The idea is
+# that TFs within the same family are highly conserved in their binding.
+#
+
+import sys, glob, os, operator
+from geneid2name import make_gene_name_AGI_map_dict
+
+DAPSEQ_DIR = '/home/hui/network/dapseq_merged'
+MAP_FILE = '/home/hui/network/dapseq_merged/tffamily.simple.txt'
+
+def get_name(s):
+ lst = s.split('_')
+ result = []
+ for x in lst:
+ if x != 'tnt':
+ result.append(x)
+ return '_'.join(result)
+
+def make_dapseq_dictionary(dirname):
+
+ d = {}
+
+ files = glob.glob(os.path.join(dirname, '*.narrowPeak'))
+
+ for f in files:
+ lst = f.split('/')
+ tf_name = lst[-1].split('.')[0]
+ tf_name = get_name(tf_name)
+ if not tf_name in d:
+ d[tf_name] = f
+ else:
+ print('ERROR: transcription factor name not unique.')
+ sys.exit()
+
+ return d
+
+
+d = make_dapseq_dictionary(DAPSEQ_DIR)
+
+f = open(MAP_FILE)
+lines = f.readlines()
+f.close()
+
+# MAP_FILE contains duplicate lines, so deduplicate while reading
+d_family = {}
+for line in lines:
+ line = line.strip()
+ lst = line.split()
+ tf = lst[0].upper()
+ tf_name = lst[1]
+ family = lst[2]
+ if not tf in d_family:
+ d_family[tf] = (tf_name, family)
+ else:
+ if family != d_family[tf][1]:
+ print('WARNING: %s conflict [%s %s]!' % (tf, family, d_family[tf][1]))
+
+count = 1
+for k in sorted(d_family.keys()):
+ g = k
+ gname = d_family[k][0]
+ key = d_family[k][1]
+ if key in d:
+ cid = 'C0003%09d' % (count)
+ count += 1
+ print('@%s' % (cid))
+ print('PROTEIN_ID:%s' % (g))
+ print('PROTEIN_NAME:%s' % (gname))
+ print('DATA_NAME:%s' % (gname))
+ print('DATA_FORMAT:%s' % ('narrowPeak'))
+ print('DESCRIPTION:inferred from dapseq')
+ #print('LOCATION:%s' % (os.path.join(DAPSEQ_DIR, d[key])))
+ print('LOCATION:%s' % (d[key]))
+ print('NOTE:')
+ print('')
diff --git a/Code/make_parameter_rnaseq.py b/Code/make_parameter_rnaseq.py
new file mode 100644
index 0000000..1fe9c6e
--- /dev/null
+++ b/Code/make_parameter_rnaseq.py
@@ -0,0 +1,163 @@
+# Usage: python make_parameter_rnaseq.py [id-list.txt] > parameter_for_buildRmatrix.txt
+# Edit QUANT_PATH, set NON_ZERO_RATIO.
+#
+# Purpose: automatically generate parameter_for_buildRmatrix.txt
+#
+# Update: 26 Feb 2017, slcu, hui
+# Update: 19 Sep 2019, slcu, hui [add read_ena_data_info_json]
+
+import sys, os, glob, json
+import fnmatch, re
+from configure import RNA_SEQ_INFO_FILE
+
+NON_ZERO_RATIO = 0.2 # omit *_quant.txt files with too many zeros.
+QUANT_PATH = ['../Data/R/Mapped/public', '../Data/R/Mapped/inhouse', '../Data/R/Mapped/other'] # places where all _quant.txt reside. _quant.txt in sub-directories will also be used.
+
+def get_quant_file_list(fname):
+ f = open(fname)
+ result = []
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ result.append(lst[0])
+ f.close()
+ return result
+
+def extract_id(s, src):
+ if src == 'inhouse' or src == 'other':
+ dirname = os.path.dirname(s)
+ lst = dirname.split('/')
+ parent_dir = lst[-1]
+ libno = ''.join(filter(str.isdigit, parent_dir)) # extract the library number; ''.join makes this work on both Python 2 and 3
+ libno = libno.zfill(3)
+ sample_no = os.path.basename(s) # extract the sample number
+ first_match = re.findall('_S\d+_', sample_no)[0]
+ sample_no = ''.join(filter(str.isdigit, first_match)).zfill(2)
+ return '0000' + libno + sample_no
+ if src == 'sra':
+ index = s.find('_quant')
+ if index > 0:
+ return s[:index]
+ else:
+ return 'NA'
+ return 'NA'
+
+def zfill2(s, n):
+ ''' Right-pad s with 'X' to length n. '''
+ return s + 'X' * (n-len(s))
+
+def make_id(success_flag, lab_id, myid): # should also work for non-SRA id
+ if lab_id == 'SRR' or lab_id == 'ERR' or lab_id == 'DRR':
+ result = 'R' + success_flag + lab_id + zfill2(myid, 9)
+ else: # inhouse or other
+ result = 'R' + success_flag + lab_id + myid
+ return result
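+
+# Example: make_id('0', 'SRR', '123456') returns 'R0SRR123456XXX' (zfill2
+# right-pads the run number with 'X' to 9 characters); non-SRA lab ids are
+# concatenated as-is.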
+
+def glob_files_include_path(directory, pattern):
+ ''' return all file names (with paths) given directory and pattern '''
+ result = []
+ for root, dirnames, filenames in os.walk(directory):
+ for filename in fnmatch.filter(filenames, pattern):
+ result.append(os.path.join(root, filename))
+ return result
+
+def glob_files_include_paths(directory_lst, pattern):
+ ''' return all file names (with paths) given directory and pattern '''
+ result = []
+ for directory in directory_lst:
+ result.extend(glob_files_include_path(os.path.abspath(directory), pattern))
+ return result
+
+def non_zero_ratio(fname):
+ non_zero_count = 0
+ total_count = 0
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split()
+ tpm = lst[3]
+ if not tpm == '0' and not 'nan' in tpm:
+ non_zero_count += 1
+ total_count += 1
+ return 1.0 * non_zero_count / total_count
+
+def get_rna_word_count(s):
+ ''' If s looks like the description of an RNA-seq experiment, return 1; otherwise return 0. '''
+ count = 0
+ s = s.lower()
+ if 'rna-seq' in s or 'rnaseq' in s or 'transcriptome' in s or 'transcript' in s or 'mrna' in s:
+ count = 1
+ return count
+
+def read_ena_data_info_json(fname):
+ d = {}
+ with open(fname) as json_data:
+ json_dict = json.load(json_data)
+ for run_id in json_dict:
+ d[run_id] = 1
+ return d
+
+### main
+if not os.path.exists(RNA_SEQ_INFO_FILE):
+ print('make_parameter_rnaseq.py: you must provide %s. See parse_ena_xml.py on how to make it.' % (RNA_SEQ_INFO_FILE))
+ sys.exit()
+
+rna_data_info_dict = read_ena_data_info_json(RNA_SEQ_INFO_FILE)
+
+
+print('%%GENE_LIST=../Data/information/gene-list.txt\n%%HOLDON=NO\n') # head
+
+if len(sys.argv) > 1: # only get these samples specified in id-list.txt
+ quant_files = get_quant_file_list(sys.argv[1])
+else:
+ quant_files = glob_files_include_paths(QUANT_PATH, '*_quant.txt')
+
+
+include_count = 0
+total = 0
+already_added_dict = {}
+for fn in sorted(quant_files):
+ total += 1
+
+ nzr = non_zero_ratio(fn)
+ if nzr > NON_ZERO_RATIO: # files with too many zeros are ignored
+ fn2 = os.path.basename(fn)
+ data_name = 'None'
+ include_me = False # reset for each file; a matching branch below sets it to True
+ if 'inhouse' in fn:
+ myid = extract_id(fn, 'inhouse')
+ myid2 = make_id('0','001', myid) # lab id is 001, phil's lab
+ data_name = 'LAB001' + '_LIB' + myid[-5:]
+ desc = 'in-house'
+ include_me = True
+ elif 'public' in fn:
+ myid = extract_id(fn2, 'sra')
+ myid2 = make_id('0',myid[0:3], myid[3:])
+ data_name = myid
+ desc = 'SRA'
+ include_me = True if myid in rna_data_info_dict and rna_data_info_dict[myid] >= 0 else False # IMPORTANT
+ elif 'other' in fn:
+ myid = extract_id(fn, 'other')
+ # get lab id
+ dirname = os.path.dirname(fn)
+ lst = dirname.split('/')
+ lab_id = lst[-2]
+ myid2 = make_id('0', lab_id, myid) # lab id comes from the directory name
+ data_name = 'LAB' + lab_id + '_LIB' + myid[-5:]
+ desc = 'Other lab'
+ include_me = True
+ if include_me and not myid2 in already_added_dict:
+ print('@%s' % (myid2))
+ print('DATA_NAME:%s' % (data_name))
+ print('DATA_FORMAT:%s'% ('txt'))
+ print('DESCRIPTION:%s' % (desc))
+ print('LOCATION:%s' % (fn))
+ print('NOTE: non zero ratio is %4.2f' % (nzr))
+ print('')
+ include_count += 1
+ already_added_dict[myid2] = 'yes'
+ else:
+ #print('%s has too many zeroes. ignore.' % (fn))
+ pass
+
+print('#Done. Processed %d files. Included %d files.' % (total, include_count))
diff --git a/Code/make_target_tf.py b/Code/make_target_tf.py
new file mode 100644
index 0000000..1cb5147
--- /dev/null
+++ b/Code/make_target_tf.py
@@ -0,0 +1,290 @@
+# Usage: python make_target_tf.py parameter_for_net.txt > target_tf.txt
+#
+# Purpose: Make a target-TF file: each line is 'Target TF1 Condition.list'.
+# See ../Data/information/target_tf.txt for an example.
+#
+# Created on 17 JAN 2017, hui
+# Last modified on 16 Mar 2017, slcu, hui
+# Last modified on 5 Aug 2019, zjnu, hui
+# Last modified on 9 Oct 2019, zjnu, hui
+# Last modified on 22 Nov 2019, zjnu, hui [include binding information from two sources: target_tf.txt.20170629_143000 (results I made when I was at SLCU between 2016 and 2017) and target_tf_agris.txt]
+
+import sys, os, operator, itertools
+import numpy as np
+from param4net import make_global_param_dict
+
+####################################
+DATA_SYMBOL = '@'
+SIGNAL_INPUT_RATIO_TAU = 1.5
+####################################
+
+def read_matrix_data(fname):
+ '''
+ fname - a file, first line is head, first column is row name.
+ '''
+
+ lineno = 0
+ colid = []
+ rowid = []
+ d = {} # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
+ d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
+ d3 = {} # {gene1: [], gene2: [], ...}
+ d4 = {} # {cond1:[], cond2:[], ...}
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+
+ head_line = lines[0].strip()
+ lst = head_line.split()
+ colid = lst[1:]
+
+ for c in colid:
+ d2[c] = {}
+ d4[c] = []
+
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split()
+ g = lst[0]
+ rowid.append(g)
+ d[g] = {}
+ levels = lst[1:]
+ if len(levels) != len(colid):
+ print('Incomplete columns at row %s' % (g))
+ sys.exit()
+
+ d3[g] = []
+ for i in range(len(colid)):
+ c = colid[i]
+ d[g][c] = float(levels[i])
+ d2[c][g] = float(levels[i])
+ d3[g].append(float(levels[i]))
+ d4[c].append(float(levels[i]))
+ lineno += 1
+
+ d_return = {}
+ d_return['xy'] = d # first gene, then condition
+ d_return['yx'] = d2 # first condition, then gene
+ d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row
+ d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column
+ d_return['nrow'] = lineno - 1
+ d_return['ncol'] = len(colid)
+ d_return['rowid'] = rowid
+ d_return['colid'] = colid
+
+ d4_sorted = {}
+ for k in d4:
+ d4_sorted[k] = sorted(d4[k], reverse=True) # largest numbers on the top
+ d_return['yy_sorted'] = d4_sorted
+
+ return d_return
+
+
+def get_value(s, delimit):
+ lst = s.split(delimit)
+ return lst[1].strip()
+
+
+def read_info_data(fname):
+ ''' Read ChIP-seq data information '''
+
+ if not os.path.exists(fname):
+ print('%s does not exist.' % (fname) )
+ sys.exit()
+
+ d = {'ID_LIST':[]}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#') or line.startswith('%'):
+ continue
+ if line.startswith(DATA_SYMBOL):
+ s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ s = s.strip()
+ if s in d:
+ print('make_target_tf: duplicate ID %s' % (s))
+ sys.exit()
+ d[s] = {'PROTEIN_ID':'', 'PROTEIN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ d['ID_LIST'].append(s)
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_NAME:'):
+ d[s]['PROTEIN_NAME'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_ID:'):
+ d[s]['PROTEIN_ID'] = get_value(line, ':')
+ elif line.startswith('DATA_NAME:'):
+ d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+
+ return d
+
+
+def get_gene_list(fname):
+ result = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ result.append(lst[0])
+ f.close()
+ return result
+
+
+def get_threshold2(lst, glb_param_dict):
+ x = np.array(lst)
+ x = x[x > 0]
+ max_num = int(glb_param_dict['MAX_NUM_TARGETS']) # max number of targets for a protein
+ percent = float(glb_param_dict['OVERFLOW_TARGETS_PERCENTAGE']) # if we have more targets than the max number, then include this percent of exceeding targets
+ n = len(x)
+ if n < max_num:
+ return x[-1]
+ else: # include some overflowing targets, but not all
+ overflow = n - max_num
+ keep = int(overflow * percent)
+ index = min(keep + max_num, n - 1) # clamp so the index stays in range
+ return x[index]
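+
+# Worked example: with MAX_NUM_TARGETS=10 and OVERFLOW_TARGETS_PERCENTAGE=0.5,
+# a descending list of 14 positive scores gives overflow=4, keep=2 and
+# index=12, so the 13th-largest score becomes the threshold.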
+
+
+def convert_dict(d):
+ '''
+ d = {tf:{cond1:[target1, target2], cond2:[...]}}
+ result = {target:{tf:[c1,c2], tf:[c2,c3]}, ... }
+ '''
+
+ result = {}
+ for k in d: # k is tf
+ vd = d[k] # vd is something like {cond1:[target1, target2], cond2:[...]}
+ for c in vd:
+ lst = vd[c] # a list of targets
+ for x in lst: # x is a target
+ if not x in result:
+ result[x] = {k:[c]}
+ else:
+ if not k in result[x]:
+ result[x][k] = [c]
+ else:
+ result[x][k].append(c)
+
+ return result
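+
+# Example: convert_dict({'TF1': {'c1': ['g1', 'g2'], 'c2': ['g1']}}) returns
+# {'g1': {'TF1': ['c1', 'c2']}, 'g2': {'TF1': ['c1']}} (condition order may vary).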
+
+
+def get_tf(bind_dict, info_dict, input_dict, glb_param_dict):
+
+ tf_dict = {} # key is TF, value is a list of targets:[target1, target2, target3, ...]
+ #input_cond = input_dict['colid']
+ if len(input_dict) > 0:
+ input_cond = input_dict['colid'][0] # use the first column of INPUT matrix as input (improve). INPUT is used for format BW.
+ else:
+ input_cond = 'NA'
+
+ gene_id = np.array( bind_dict['rowid'] )
+
+ for c in bind_dict['colid']: # check each column (protein) in binding matrix. Find the protein's targets.
+ #print(c)
+ g2 = info_dict[c]['PROTEIN_ID'] # g2 is TF
+ bind_val = np.array( bind_dict['yy'][c] ) # a column of values
+
+ if info_dict[c]['DATA_FORMAT'].upper() == 'BW': # require more consideration in the future
+ input_val = np.array( input_dict['yy'][input_cond] )
+ index = np.logical_and( np.logical_and(input_val > 0, input_val < 10000), (bind_val / input_val) > SIGNAL_INPUT_RATIO_TAU) # ignore intensities greater than 10000 as these are definitely noise
+
+ elif info_dict[c]['DATA_FORMAT'].upper() == 'NARROWPEAK':
+ tau = get_threshold2(bind_dict['yy_sorted'][c], glb_param_dict)
+ index = bind_val >= tau
+ else:
+ print('make_target_tf: Data format %s not recognised. Only bw and narrowPeak are valid.' % (info_dict[c]['DATA_FORMAT']))
+ sys.exit()
+
+ target_gene_id = gene_id[index]
+ if g2 != '' and g2 != 'id_unknown':
+ if not g2 in tf_dict:
+ tf_dict[g2] = {c:list(target_gene_id)}
+ else:
+ tf_dict[g2][c] = list(target_gene_id)
+
+ # tf_dict is a bit complicated: key is TF, value is a dictionary
+ # where its key is condition, and value is a list of target genes.
+ # It basically say this TF under condition c binds to a list of
+ # target genes.
+ d = convert_dict(tf_dict)
+ return d
+
+
+def augment_dict(d, target, tf, cond_lst):
+ ''' Enlarge d '''
+ if not target in d:
+ d[target] = {tf:cond_lst}
+ else:
+ if not tf in d[target]:
+ d[target][tf] = cond_lst
+ else:
+ cond_lst.extend(d[target][tf])
+ d[target][tf] = sorted(list(set(cond_lst)))
+
+
+def target_tf(bind_dict, bind_info_dict, input_dict, glb_param_dict):
+ ''' Print lines in this format: target TF ChIP-seq conditions, e.g., ../Data/information/target_tf.txt
+ For example, 'AT1G01270 AT3G46640 C0001000008426 C0001000008427 C0001000008428'
+ '''
+ d = get_tf(bind_dict, bind_info_dict, input_dict, glb_param_dict)
+ # d has the following format {target:{tf1:[c1,c2], tf2:[c2,c3]}, ... }
+
+ # augment d with information from ../Data/information/target_tf_agris.txt and ../Data/information/target_tf.txt.20170629_143000
+ if os.path.exists('../Data/information/target_tf_agris.txt'):
+ f = open('../Data/information/target_tf_agris.txt')
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) == 3:
+ target0 = lst[0]
+ tf0 = lst[1]
+ cond_lst0 = lst[2].split()
+ augment_dict(d, target0, tf0, cond_lst0)
+
+ if os.path.exists('../Data/information/target_tf.txt.20170629_143000'):
+ f = open('../Data/information/target_tf.txt.20170629_143000')
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) == 3:
+ target0 = lst[0]
+ tf0 = lst[1]
+ cond_lst0 = lst[2].split()
+ augment_dict(d, target0, tf0, cond_lst0)
+
+ for target in sorted(d.keys()):
+ tf_d = d[target]
+ if len(tf_d) > 0:
+ for tf in sorted(tf_d.keys()):
+ cond_lst = sorted(list(set(tf_d[tf])))
+ #if len(cond_lst) > 1 and 'C0000000000001' in cond_lst: # C0000000000001 is for binding evidence from agris
+ # cond_lst.remove('C0000000000001')
+ print('%s\t%s\t%s' % (target, tf, ' '.join(cond_lst) ) )
+
+
+########## main ##################################################
+param_file = sys.argv[1] # a single parameter file, parameter_for_net.txt
+glb_param_dict = make_global_param_dict(param_file)
+#print('Read binding data ...')
+bind_dict = read_matrix_data(glb_param_dict['BINDING_MATRIX'])
+bind_info_dict = read_info_data(glb_param_dict['BINDING_INFO'])
+
+if os.path.exists(glb_param_dict['INPUT_MATRIX']):
+ input_dict = read_matrix_data(glb_param_dict['INPUT_MATRIX']) # for comparing with bw files
+else:
+ input_dict = {}
+
+#print('Make target TF lines ...')
+target_tf(bind_dict, bind_info_dict, input_dict, glb_param_dict)
diff --git a/Code/make_target_tf_agris.py b/Code/make_target_tf_agris.py
new file mode 100644
index 0000000..c96c8fa
--- /dev/null
+++ b/Code/make_target_tf_agris.py
@@ -0,0 +1,39 @@
+# Make target_tf from AtRegNet.txt
+# Usage: python make_target_tf_agris.py > ../Data/information/target_tf_agris.txt
+
+import sys
+
+fname = '../Data/information/AtRegNet.txt'
+
+sample_id = 'C0000000000001'
+
+f = open(fname)
+lines = f.readlines()
+f.close()
+
+d = {}
+count = 2
+duplicate = 0
+for line in lines[1:]:
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) >= 5:
+ tf0 = lst[1].upper().strip()
+ target0 = lst[4].upper().strip()
+ tf_lst = tf0.split('/')
+ target_lst = target0.split('/')
+ for tf in tf_lst:
+ for target in target_lst:
+ if tf.startswith('AT') and target.startswith('AT'):
+ k = target + '.' + tf
+ if k in d:
+ #print('Warning at line %d ' % (count))
+ duplicate += 1
+ else:
+ d[k] = [target, tf, sample_id]
+ count += 1
+
+
+sys.stderr.write('pairs %d\n' % len(d)) # stats go to stderr so they do not pollute the redirected output file
+sys.stderr.write('duplicate %d\n' % (duplicate))
+for k in sorted(d.keys()):
+ print('\t'.join(d[k]))
+
diff --git a/Code/make_upload_chip_parameter.py b/Code/make_upload_chip_parameter.py
new file mode 100644
index 0000000..e6cc4a8
--- /dev/null
+++ b/Code/make_upload_chip_parameter.py
@@ -0,0 +1,233 @@
+# Usage: python make_upload_chip_parameter.py
+#
+# Purpose: make a part of parameter_for_buildCmatrix.txt given the uploaded files in UPLOAD_DIR.
+# Each unique uploaded file will be assigned an ID.
+# The assigned ID starts with C0000, followed by 9 digits.
+# The following cases are handled: (i) the same bed file uploaded several times. the latest submission will be used.
+#
+# TBD: append to PARAMETER_FOR_BUILDCMATRIX
+# Created 20 July 2017, slcu, hui
+
+import os, sys, glob
+from datetime import datetime
+
+PARAMETER_FOR_BUILDCMATRIX = '../Data/upload/parameter_for_buildCmatrix.txt' # [change]
+UPLOAD_DIR = '../Data/upload/chipseq'
+
+INCLUDE_STAMP = 'BRAIN_HAS_INCLUDED_ME'
+
+def good_file(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line.startswith('#') and 'STATUS:' in line:
+ if 'SUCC' in line or 'UNKNOWN' in line:
+ return True
+ if 'FAIL' in line:
+ return False
+ return False
+
+def already_included(fname):
+ ''' If fname is already processed, then its head line is marked with BRAIN_HAS_INCLUDED_ME. '''
+ f = open(fname)
+ lines = f.readlines()
+ n = len(lines)
+ f.close()
+ for line in lines[0:min(n, 5)]: # the first five lines should include INCLUDE_STAMP if this file is already included.
+ line = line.strip()
+ if line.startswith('#') and INCLUDE_STAMP in line:
+ return True
+ return False
+
+
+def same_content(f1, f2):
+ ''' Test whether two files, f1 and f2, have the same content, ignoring '#' comment lines. '''
+ if os.path.exists(f1) and not os.path.exists(f2):
+ return False
+ if not os.path.exists(f1) and os.path.exists(f2):
+ return False
+ if not os.path.exists(f1) and not os.path.exists(f2):
+ return False
+ if os.path.exists(f1) and os.path.exists(f2):
+ a = open(f1)
+ b = open(f2)
+ s1 = ''
+ for line in a:
+ line = line.strip()
+ if not line.startswith('#'): # don't include lines starting with '#'
+ s1 += line
+ s2 = ''
+ for line in b:
+ line = line.strip()
+ if not line.startswith('#'):
+ s2 += line
+ a.close()
+ b.close()
+ if s1 == s2:
+ return True
+ else:
+ return False
+
+def repeat(fname, d):
+ ''' Are there other files having the same content as fname? Return '' if no; otherwise return the conflicting file name. '''
+ for k in d:
+ if same_content(fname, d[k]['LOCATION']):
+ return k
+ return ''
+
+def update_dict(d, k, fname):
+ d[k] = make_chip_info_dict(fname)
+
+# def update_it(upload_dir, upload_dict):
+# id_lst = sorted(upload_dict.keys())
+# if id_lst != []:
+# last_id = id_lst[-1]
+# last_id_number = int(last_id[5:])
+# else:
+# last_id_number = 0
+# for fname in sorted(glob.glob(os.path.join(UPLOAD_DIR, '20*.*'))): # all uploaded BED files start with time stamp 20.....
+# if good_file(fname) and not already_included(fname):
+# #print(upload_dict)
+# k = repeat(fname, upload_dict)
+# if k == '':
+# k = '%d' % (last_id_number + 1)
+# k = 'C0000' + k.zfill(9)
+# upload_dict[k] = make_chip_info_dict(fname)
+# else:
+# update_dict(upload_dict, k, fname)
+# mark_it_as_included(fname)
+
+
+def make_chip_info_dict(fname):
+ ''' Return a dictionary given a user submitted file. '''
+ d = {'PROTEIN_ID':'', 'PROTEIN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'narrowPeak', 'DESCRIPTION':'user upload', 'LOCATION':'', 'NOTE':''}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line.startswith('#'):
+ s = line[(line.rfind('#')+1):]
+ s = s.strip()
+ lst = s.split(':')
+ k = lst[0].strip()
+ v = line[(line.find(':')+1):]
+ d[k] = v
+
+ d['DATA_NAME'] = os.path.basename(fname)
+ d['LOCATION'] = os.path.abspath(fname)
+ d['NOTE'] = 'update:%s' % datetime.now().strftime('%Y%m%d')
+ return d
+
+def mark_it_as_included(fname):
+ ''' Prepend a head line including INCLUDE_STAMP'''
+ f = open(fname)
+ s = f.read()
+ f.close()
+ f = open(fname, 'w')
+ curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
+ f.write('# %s %s\n' % (INCLUDE_STAMP, curr_time) + s)
+ f.close()
+
+def make_string(d):
+ s = ''
+ for k in sorted(d.keys()):
+ s += '@%s\n' % k
+ s += 'PROTEIN_ID:%s\n' % d[k]['PROTEIN_ID']
+ s += 'PROTEIN_NAME:%s\n' % d[k]['PROTEIN_NAME']
+ s += 'DATA_NAME:%s\n' % d[k]['DATA_NAME']
+ s += 'DATA_FORMAT:narrowPeak\n'
+ s += 'DESCRIPTION:%s\n' % d[k]['DESCRIPTION']
+ s += 'LOCATION:%s\n' % d[k]['LOCATION']
+ s += 'NOTE:%s\n\n' % d[k]['NOTE']
+ return s
+
+def md(fname):
+ ''' Return a dictionary containing the parameter information. '''
+ d = {}
+ if not os.path.exists(fname):
+ return {}
+ else:
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line != '' and line.startswith('@'):
+ k = line[1:]
+ d[k] = {}
+ elif line != '':
+ lst = line.split(':')
+ k2 = lst[0].strip()
+ v = line[(line.find(':')+1):]
+ d[k][k2] = v
+ return d
+
+def is_empty(fname):
+ ''' Return True if fname has no content. '''
+ if os.path.exists(fname):
+ f = open(fname)
+ s = f.read()
+ f.close()
+ return s.strip() == ''
+ return False
+
+def get_largest_upload_chip_id(fname):
+ lst = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ if line.startswith('@C0000'):
+ lst.append(int(line[2:]))
+ f.close()
+ if lst != []:
+ return max(lst)
+ else:
+ return 0
+
+def make_upload_dict(param_fname, included_path):
+ d = {}
+ i = get_largest_upload_chip_id(param_fname) + 1 # starting id
+ for fn in sorted(glob.glob(os.path.join(UPLOAD_DIR, '20*.*')), reverse=True): # newer files are considered first, so the latest duplicate submission wins
+ k = 'C0000' + ('%d' % (i)).zfill(9)
+ if good_file(fn) and not already_included(fn) and repeat(fn, d) == '':
+ d[k] = make_chip_info_dict(fn)
+ i += 1
+ if good_file(fn):
+ mark_it_as_included(fn)
+ cmd = 'mv %s %s' % (fn, included_path)
+ os.system(cmd)
+
+ return d
+
+def append_to_file(fname, s):
+ f = open(fname, 'a')
+ f.write('\n' + s + '\n')
+ f.close()
+
+def make_directory(my_dir):
+ if not os.path.exists(my_dir):
+ os.makedirs(my_dir)
+
+def make_copy(fname):
+ if os.path.exists(fname):
+ curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+ new_fname = fname + '.copy.%s' % (curr_time)
+ f = open(fname)
+ s = f.read()
+ f.close()
+ f = open(new_fname, 'w')
+ f.write(s)
+ f.close()
+
+## main
+included_path = os.path.join(UPLOAD_DIR, 'included')
+make_directory(included_path)
+upload_dict = make_upload_dict(PARAMETER_FOR_BUILDCMATRIX, included_path)
+s = make_string(upload_dict)
+if s != '':
+ # before changing PARAMETER_FOR_BUILDCMATRIX, make a copy of it
+ make_copy(PARAMETER_FOR_BUILDCMATRIX)
+ append_to_file(PARAMETER_FOR_BUILDCMATRIX, s)
diff --git a/Code/merge_edges.py b/Code/merge_edges.py
new file mode 100644
index 0000000..ef870fb
--- /dev/null
+++ b/Code/merge_edges.py
@@ -0,0 +1,190 @@
+# Purpose: When edges.txt contains multiple lines representing the
+# same edge, choose only one edge.
+#
+# Usage: python merge_edges.py
+#
+# This script is used to produce the edges.txt for the brain
+# web application. It searches in EDGE_POOL_DIR for edge files
+# (with 10 columns) from many sources, most likely with
+# duplicated edges. It removes duplication and computes
+# strength for each edge.
+#
+# Note: make sure fname is edges.txt. Rationale: to save space, the
+# fifth column no longer holds a full list of RNA-seq experiment IDs
+# but just their count (1 if no IDs are available). The full list of
+# ChIP-seq experiment IDs (the sixth column) is still kept.
+#
+# Created on 3 August 2019 by Hui Lan
+# Last modified on 5 August 2019 by Hui Lan
+
+import os, operator, sys, math, datetime, glob
+from configure import EDGE_POOL_DIR, MERGED_EDGE_FILE
+
+def get_number_of_RNAseq_ids(s):
+ if s == '.':
+ return 1
+ if s.isdigit():
+ return int(s)
+ return len(s.split())
+
+
+def add_dashes_to_date(s):
+ return s[:4] + '-' + s[4:6] + '-' + s[6:]
+
+
+def split_id_and_name(s):
+ lst = s.split()
+ result = lst[0]
+ if lst[0] != lst[1]:
+ result = lst[0] + ' (' + lst[1] + ')'
+ return result
+
+
+def make_html_list(s):
+ if s.strip() == '':
+ return 'None'
+ result = '<ul>'
+ for method in s.split(','):
+ result += '<li>%s</li>' % (method)
+ result += '</ul>'
+ return result
+
+
+def make_html_page(lst, fname):
+ tf = lst[1].split()[0] # ID only, no name
+ target = lst[0].split()[0]
+
+ head = '<title>%s</title>\n' % ('Target is ' + lst[0] + ' and TF is ' + lst[1])
+ head += '<meta charset=\"utf-8\">\n'
+ s = '<html>\n<head>\n'
+ s += '%s\n' % (head)
+ s += '</head>\n<body>\n'
+ body = '<h2>TF %s and target %s</h2>\n' % (tf, target)
+ body += '<hr>\n'
+## if 'AT2G44304' in lst[0] and 'AT2G24700' in lst[1]:
+## print(lst)
+## sys.exit()
+
+ s += '%s\n' % (body)
+ s += '</body>\n</html>'
+ f = open(fname, 'w')
+ f.write(s)
+ f.close()
+
+
+def compute_time_difference_in_days(t1, t2):
+ ''' t1 and t2 have the format yyyymmdd. '''
+ if not (t1.isnumeric() and len(t1) == 8):
+ raise Exception('t1 format wrong in compute_time_difference_in_days.')
+ if not (t2.isnumeric() and len(t2) == 8):
+ raise Exception('t2 format wrong in compute_time_difference_in_days.')
+
+ t1 = datetime.date(int(t1[:4]), int(t1[4:6]), int(t1[6:]))
+ t2 = datetime.date(int(t2[:4]), int(t2[4:6]), int(t2[6:]))
+ return (t1 - t2).days
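+
+# Example: compute_time_difference_in_days('20190805', '20190803') returns 2.
+# In make_new_edge below the edge date comes first, so edges older than today
+# yield a negative time_diff and hence a decayed strength.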
+
+
+def make_new_edge(lst_tuple):
+ lst = sorted(lst_tuple, reverse=True, key = lambda x: abs(float(x[2]))) # sort according to absolute value of score
+ best_edge = list( lst[0] )
+
+ # see section 'Ranking edges using frecency' in the brain documentation
+ F = len(lst_tuple)
+
+ RN_lst = []
+ r_lst = []
+ most_recent_edge_date = '00000000'
+ method_or_tissue = []
+ cids = ''
+ for t in lst:
+ r_lst.append( abs(float(t[2])) )
+ rids = t[4]
+ if t[5] > cids: # keep the lexicographically largest ChIP-seq ID string
+ cids = t[5]
+ RN_lst.append( get_number_of_RNAseq_ids(rids) )
+ date = t[7]
+ if date > most_recent_edge_date:
+ most_recent_edge_date = date
+ method_or_tissue.append(t[9])
+ S = 365 * 10
+ curr_date = datetime.datetime.now().strftime('%Y%m%d')
+ #time_diff = int(most_recent_edge_date) - int(curr_date)
+ time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date)
+ strength = sum(r_lst)/len(r_lst) * math.log(sum(RN_lst)/len(RN_lst)+1, 10) * math.log(F+1, 2) * math.exp(time_diff/S)
+ best_edge[4] = '%d' % max(RN_lst)
+ best_edge[5] = cids
+ best_edge[8] = '%.2f' % strength
+ best_edge[9] = ','.join(sorted(list(set(method_or_tissue)))) # unique methods or tissues, in string format
+
+## if 'AT2G44304' in best_edge[0] and 'AT2G24700' in best_edge[1]:
+## print(strength)
+## print(best_edge)
+## sys.exit()
+
+ return best_edge
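+
+# A worked example of the strength formula above (illustrative numbers, not
+# real data): two duplicate edges (F=2) with scores 0.8 and 0.6 and RNA-seq
+# counts 99 and 1 give mean r = 0.7 and mean RN = 50, so strength =
+# 0.7 * log10(51) * log2(3) * exp(0), about 0.7 * 1.71 * 1.58 = 1.89, when the
+# most recent copy is dated today.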
+
+
+
+## main
+
+d = {}
+duniq = {}
+for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, '*.*'))):
+ print('[merge_edges.py]: including %s.' % (fname))
+ f = open(fname)
+
+ for line in f:
+ line = line.strip()
+ if len(line.split('\t')) == 10 and not line in duniq:
+ duniq[line] = 1
+ lst = line.split('\t')
+ target = lst[0]
+ tf = lst[1]
+ score = lst[2]
+ type_of_score = lst[3]
+ rids = lst[4]
+ cids = lst[5]
+ ll = lst[6]
+ date = lst[7]
+ strength = lst[8]
+ method_or_tissue = lst[9]
+
+ key = target + tf
+ t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue)
+
+ if not key in d:
+ d[key] = [t]
+ else:
+ d[key].append(t)
+
+ f.close()
+
+
+
+# make html pages
+folder_path = '../Data/temp/html_edges'
+if not os.path.isdir(folder_path):
+ os.mkdir(folder_path)
+
+
+print('[merge_edges.py]: Make text edge file...')
+fout = open(MERGED_EDGE_FILE, 'w')
+for k in d:
+ lst = make_new_edge(d[k])
+ fout.write('\t'.join(lst) + '\n')
+fout.close()
+
+
+print('[merge_edges.py]: Make html edge files. May take a while...')
+for k in d:
+ lst = make_new_edge(d[k])
+ pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html
+ make_html_page(lst, folder_path + '/' + pagename)
diff --git a/Code/param4net.py b/Code/param4net.py
new file mode 100644
index 0000000..a68e7e3
--- /dev/null
+++ b/Code/param4net.py
@@ -0,0 +1,25 @@
+# Purpose: refactoring.
+# Create on 10 Aug 2019 by Hui Lan
+
+GLB_PARAM_SYMBOL = '%%'
+
+def get_key_value(s):
+ lst = s.split('=')
+ k, v = lst[0], lst[1]
+ return (k.strip(), v.strip())
+
+
+def make_global_param_dict(fname):
+ f = open(fname)
+ d = {'GENE_LIST':'', 'HIGH_PRIORITY_GENE':'', 'BINDING_MATRIX':'', 'INPUT_MATRIX':'', 'EXPRESSION_MATRIX':'', 'BINDING_INFO':'', 'EXPRESSION_INFO':'', 'RESEARCH_KEYWORDS':'', 'USER_CONDITION_LIST':[], 'LOOK_FOR_POS_CORRELATION':'NO', 'LOOK_FOR_NEG_CORRELATION':'NO', 'MAKE_PLOT':'NO', 'TWO_WAY':'YES', 'THREE_WAY':'NO', 'TARGET_RANGE':'1000', 'FC':'2.0', 'PVALUE':'0.0001', 'QVALUE':'0.01', 'CHRINFO':{'1':30427671, '2':19698289, '3':23459830, '4':18585056, '5':26975502, 'Mt':366924, 'Pt':154478}, 'SELECT_POINTS_DIAGONAL_MAX_DIFF':0.25} # change
+ for line in f:
+ line = line.strip()
+ if line.startswith(GLB_PARAM_SYMBOL):
+ s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
+ lst = s.split('\t') # separate items by TAB
+ for x in lst:
+ if x != '':
+ k, v = get_key_value(x)
+ d[k] = v
+ f.close()
+ return d
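+
+# Example (assumed input format): a parameter line such as
+# %%GENE_LIST=../Data/information/gene-list.txt
+# sets d['GENE_LIST']; several key=value pairs may sit on one '%%' line,
+# separated by TABs, and keys absent from the file keep the defaults above.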
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
new file mode 100644
index 0000000..b9d6905
--- /dev/null
+++ b/Code/parse_ena_xml.py
@@ -0,0 +1,364 @@
+# Usage: python parse_ena_xml.py > rnaseq_info_database.txt
+#
+# Search in this script for 'd_run', 'd_sample', 'd_experiment' and
+# 'd_study', and set their input files. The input files are generated
+# by download_ena_metadata.py (except for d_sample). It also
+# generates a json file called info_database.json, for displaying
+# experimental information in the scatterplot. If the input files are
+# for RNA-seq data, rename info_database.json to
+# rnaseq_info_database.json and move it to Data/information. Also move
+# rnaseq_info_database.txt to Data/information. They are used by
+# html_network.py.
+#
+# Purpose: Get description for RNA-seq data, one for each SRA Run ID.
+# Make rnaseq_info_database.txt and rnaseq_info_database.json. Each
+# line in rnaseq_info_database.txt contains information for a run id.
+#
+# NOTE: you might encounter UnicodeEncodeError when running the
+# program. To avoid that, first type this command:
+# export PYTHONIOENCODING=UTF-8.
+#
+# 22 Feb 2017, slcu, hui
+# 12 Apr 2017, slcu, hui
+# 20 Apr 2017, slcu, hui
+# 30 May 2017, slcu, hui
+# 01 Jun 2017, slcu, hui [added a column sample_id]
+# 19 Jun 2017, slcu, hui [added SraRunTable_Ath_Tax3702.txt in d_run2. Search d_run2 for how to get SraRunTable_Ath_Tax3702.txt.]
+
+import os, json, re, operator
+import xml.etree.ElementTree
+import sys
+
+MAX_DESCRIPTION_LENGTH = 600 # max number of characters to keep in the json file
+
+def parse_SraRunTable(fname):
+ d = {}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if not line.startswith('#') and not line.startswith('Assay_Type_s') and line.lower().startswith('rna-seq'):
+ lst = line.split('\t')
+ acc = lst[17]
+ if not acc in d:
+ d[acc] = {}
+ d[acc]['experiment_id'] = lst[6] if lst[6] != '' else '.'
+ d[acc]['sample_id'] = (lst[4] + '...' + lst[18] + ' ' + lst[20]) if lst[4] != '' else '.'
+ d[acc]['study_id'] = lst[19] if lst[19] != '' else '.'
+ d[acc]['study_id_PRJ'] = lst[3] if lst[3] != '' else '.'
+ d[acc]['alias'] = lst[11] if lst[11] != '' else '.'
+ d[acc]['title'] = lst[20] if lst[20] != '' else '.'
+ return d
+
+def parse_run(fname):
+ d = {}
+
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+ for c in root.findall('RUN'):
+ acc = c.get('accession')
+ d[acc] = {}
+
+ alias = c.get('alias')
+ d[acc]['alias'] = alias
+
+ experiment = c.find('EXPERIMENT_REF').get('accession')
+ d[acc]['experiment_id'] = experiment
+
+ title = c.find('TITLE').text
+ d[acc]['title'] = title
+
+ d[acc]['study_id'] = '.'
+ for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+ s = i.text
+ #print(s)
+ if 'RP' in s: # run project
+ d[acc]['study_id'] = s
+ break
+ d[acc]['sample_id'] = '.'
+ for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+ s = i.text
+ if 'RS' in s: # run sample
+ d[acc]['sample_id'] = s
+ break
+
+ return d
+
+
+def parse_study(fname):
+ d = {}
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+
+ for c in root.findall('PROJECT'):
+ d2 = {}
+ acc = c.find('./IDENTIFIERS/SECONDARY_ID')
+ if acc != None:
+ d2['secondary_id'] = acc.text
+ else:
+ d2['secondary_id'] = '.'
+ d2['primary_id'] = c.get('accession')
+
+ desc = c.find('DESCRIPTION')
+ d2['description'] = 'None'
+ if desc != None:
+ d2['description'] = desc.text
+
+ title = c.find('TITLE')
+ d2['title'] = 'None'
+ if title != None:
+ d2['title'] = title.text
+
+ run_id = ''
+ for i in c.findall('./PROJECT_LINKS/PROJECT_LINK/XREF_LINK/ID'):
+ s = i.text
+ if 'RR' in s:
+ run_id = s
+ break
+ lst = run_id.split(',')
+ for x in lst:
+ lst2 = x.split('-')
+ if len(lst2) == 1 and lst2[0] != '':
+ k = lst2[0]
+ d[k] = d2 # k is run id, such as SRR, ERR or DRR
+ elif len(lst2) == 2:
+ ss = lst2[0]
+ ee = lst2[1]
+ first_three_letters = ss[0:3]
+ sz = len(ss) - 3
+ ss_t = int(ss[3:])
+ ee_t = int(ee[3:])
+ for j in range(ss_t, ee_t+1, 1):
+ k = first_three_letters + str(j).zfill(sz)
+ d[k] = d2
+ return d
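+
+# Example: an ID field 'SRR520490-SRR520491' (see also the comment in
+# parse_sample below) expands to the run ids SRR520490 and SRR520491, each
+# mapped to the same record d2; comma-separated fields may mix single ids and ranges.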
+
+
+def parse_sample(fname):
+ d = {}
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+
+ for c in root.findall('SAMPLE'):
+ d2 = {}
+ acc = c.find('./IDENTIFIERS/EXTERNAL_ID')
+ if acc != None:
+ d2['external_id'] = acc.text
+ else:
+ d2['external_id'] = '.'
+ d2['primary_id'] = c.get('accession')
+
+ desc = c.find('DESCRIPTION')
+ d2['description'] = 'None'
+ if desc != None and desc.text != None:
+ d2['description'] = desc.text
+
+ title = c.find('TITLE')
+ d2['title'] = 'None'
+ if title != None and title.text != None:
+ d2['title'] = title.text
+
+ tissue_type = ''
+ for i in c.findall('./SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE/VALUE'):
+ if i != None and i.text != None:
+ tissue_type += i.text + ' '
+ d2['tissue'] = tissue_type.strip()
+
+ run_id = ''
+ for i in c.findall('./SAMPLE_LINKS/SAMPLE_LINK/XREF_LINK/ID'):
+ s = i.text
+ if 'RR' in s:
+ run_id = s
+ break
+ lst = run_id.split(',')
+ for x in lst:
+ lst2 = x.split('-') # e.g., SRR520490-SRR520491
+ if len(lst2) == 1 and lst2[0] != '':
+ k = lst2[0]
+ d[k] = d2 # k is run id, such as SRR, ERR or DRR
+ elif len(lst2) == 2:
+ ss = lst2[0]
+ ee = lst2[1]
+ first_three_letters = ss[0:3]
+ sz = len(ss) - 3
+ ss_t = int(ss[3:])
+ ee_t = int(ee[3:])
+ for j in range(ss_t, ee_t+1, 1):
+ k = first_three_letters + str(j).zfill(sz)
+ d[k] = d2
+ return d
+
+
+def parse_experiment(fname):
+ d = {}
+
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+ for c in root.findall('EXPERIMENT'):
+ d2 = {}
+ d2['primary_id'] = c.get('accession')
+
+ title = c.find('TITLE')
+ d2['title'] = 'None'
+ if title != None and title.text != None:
+ d2['title'] = title.text
+
+ desc = c.find('./DESIGN/DESIGN_DESCRIPTION')
+ d2['description'] = 'None'
+ if desc != None and desc.text != None:
+ d2['description'] = desc.text
+
+ run_id = ''
+ for i in c.findall('./EXPERIMENT_LINKS/EXPERIMENT_LINK/XREF_LINK/ID'):
+ s = i.text
+ if 'RR' in s:
+                run_id = s
+ break
+ lst = run_id.split(',')
+ for x in lst:
+ lst2 = x.split('-') # e.g., SRR520490-SRR520491
+ if len(lst2) == 1 and lst2[0] != '':
+ k = lst2[0]
+ d[k] = d2 # k is run id, such as SRR, ERR or DRR
+ elif len(lst2) == 2:
+ ss = lst2[0]
+ ee = lst2[1]
+ first_three_letters = ss[0:3]
+ sz = len(ss) - 3
+ ss_t = int(ss[3:])
+ ee_t = int(ee[3:])
+ for j in range(ss_t, ee_t+1, 1):
+ k = first_three_letters + str(j).zfill(sz)
+ d[k] = d2
+ return d
+
+
+def get_singular_form(w):
+    d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petal', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovules':'ovule', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum'}
+ if w in d:
+ return d[w]
+ return w
+
+def get_tissue(s):
+ ''' Extract tissue name from s. s may contain several tissue names, return them ordered by frequency. '''
+
+
+ lst = ['seedling', 'seedlings', 'root', 'roots', 'leaves', 'leaf', 'flower', 'flowers', 'floral', 'shoot', 'shoots', 'apex', 'apices', 'stamen', 'stem', 'stems', 'seed', 'seeds', 'petal', 'petals', 'sepal', 'sepals', 'embryo', 'embryos', 'embryonic', 'cotyledon', 'cotyledons', 'xylem', 'hair', 'hairs', 'phloem', 'pericycle', 'primordia', 'columella', 'cortex', 'meristem', 'meristems', 'cambium', 'epidermis', 'epidermal', 'phloem', 'mesophyll', 'apical', 'lateral', 'intercalary', 'parenchyma', 'collenchyma', 'sclerenchyma', 'bud', 'buds', 'endosperm', 'colletotrichum', 'stele', 'vacuoles', 'vacuole', 'vacuolar', 'tip', 'tips', 'pollen', 'hypocotyl', 'hypocotyls', 'tube', 'tubes', 'basal', 'stomatal', 'stomata', 'surface', 'progeny', 'ovules', 'carpel', 'carpels', 'gynoecium', 'pistil', 'pistils', 'anthers', 'anther', 'endodermis', 'dicotyledonous', 'hyphae', 'adabaxial', 'axial', 'cauline', 'rosette', 'pedicle', 'pedicel', 'inflorescence', 'petiole', 'lamina', 'vascular', 'bundle', 'sheath'] # possible tissue names, lower case. refer to /home/hui/network/test/rnaseq.word.count.txt for distinct words in rna seq. rnaseq.word.count.txt is generated by /home/hui/network/test/count_word.py
+
+ # build a count dictionary, where key is a word
+ d = {}
+ s = s.lower()
+    wlst = re.sub(r"[^\w]", " ", s).split() # a list of words in s. http://stackoverflow.com/questions/6181763/converting-a-string-to-a-list-of-words
+ for w in wlst:
+ if w in lst:
+ w2 = get_singular_form(w)
+ if not w2 in d:
+ d[w2] = 1
+ else:
+ d[w2] += 1
+ if len(d) == 0:
+ return 'unknown'
+
+ tlst = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
+ result = ''
+ for t in tlst:
+ result += '%s(%d);' % (t[0], t[1])
+ return result.rstrip(';')
+
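+# Illustrative example (hypothetical description string): for
+# s = 'RNA was extracted from root tips; roots were harvested', get_tissue
+# counts 'root' and 'roots' under 'root' and 'tips' under 'tip', and
+# returns 'root(2);tip(1)'.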
+
+## main
+
+# ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, EXPERIMENT. So each
+# of the following parse functions handles one type of xml file.
+d_sample = parse_sample('../Data/information/ena_sample.xml') # SAMPLE. We can download RUN, STUDY, EXPERIMENT using download_ena_metadata.py, but not SAMPLE (weird). So we need to download ena_sample.xml manually. Enter http://www.ebi.ac.uk/ena/data/search?query=arabidopsis%20thaliana, click Sample (31,042) in the left panel of the displayed page, then click the XML link in the right panel. The XML link is very small, so take time to find it or search for XML.
+d_run = parse_run('../Data/information/ena_rnaseq_read_run.xml') # RUN
+d_run2 = parse_SraRunTable('../Data/information/SraRunTable_Ath_Tax3702.txt') # Go to https://www.ncbi.nlm.nih.gov/sra. Type (arabidopsis thaliana) AND "Arabidopsis thaliana"[orgn:__txid3702]. Click "Send results to Run selector". Click "RunInfo Table". Save SraRunTable.txt
+d_study = parse_study('../Data/information/ena_rnaseq_read_study.xml') # STUDY
+d_experiment = parse_experiment('../Data/information/ena_rnaseq_read_experiment.xml') # EXPERIMENT
+
+
+cmd = 'export PYTHONIOENCODING=UTF-8' # xml files contain non-ascii characters; note this export only affects the spawned subshell, so PYTHONIOENCODING=UTF-8 should also be set in the invoking shell (see header) to avoid encoding errors during redirection
+os.system(cmd)
+
+print('%s' % ('\t'.join(['run_id', 'sample_id', 'experiment_id', 'study_id', 'study_id_PRJ', 'title', 'alias', 'description']))) # description comes from three sources, STUDY, SAMPLE and EXPERIMENT
+d_run_keys = list(d_run.keys()) # list() needed so extend() also works under Python 3
+d_run_keys.extend(d_run2.keys())
+d_run_keys = list(set(d_run_keys))
+for k in sorted(d_run_keys):
+ lst = []
+ lst.append(k)
+ if k in d_run:
+ if k in d_sample:
+ if d_sample[k]['external_id'] != '.':
+ lst.append(d_sample[k]['external_id'] + '...' + d_sample[k]['tissue'])
+ else:
+ lst.append(d_sample[k]['primary_id'] + '...' + d_sample[k]['tissue'])
+ else:
+ lst.append('.')
+ lst.append( d_run[k]['experiment_id'])
+ lst.append( d_run[k]['study_id'] )
+ if k in d_study:
+ lst.append( d_study[k]['primary_id'] )
+ else:
+ lst.append( '.' )
+ lst.append( d_run[k]['title'] )
+ lst.append( d_run[k]['alias'] )
+
+ s = '' # description string
+
+ if k in d_study:
+            s += '<br>[Study title] ' + d_study[k]['title'] + '<br>[Study description] ' + d_study[k]['description'] # <br> is used for breaking lines in html
+
+ if k in d_sample:
+            s += '<br>[Sample title] ' + d_sample[k]['title'] + '<br>[Sample description] ' + d_sample[k]['description']
+
+ if k in d_experiment:
+            s += '<br>[Experiment description] ' + d_experiment[k]['description']
+
+ if s == '':
+ s = '.'
+
+ lst.append(s)
+ elif k in d_run2:
+ lst.append(d_run2[k]['sample_id'])
+ lst.append(d_run2[k]['experiment_id'])
+ lst.append(d_run2[k]['study_id'])
+ lst.append(d_run2[k]['study_id_PRJ'])
+ lst.append(d_run2[k]['title'])
+ lst.append(d_run2[k]['alias'])
+ lst.append('.')
+
+ print('%s' % ('\t'.join(lst)))
+
+# make a json file as well. this file is used to display rna-seq information in scatterplots.
+json_dict = {}
+for k in sorted(d_run_keys):
+ if k in d_run:
+ s = 'Title: ' + d_run[k]['title'] + '. Alias: ' + d_run[k]['alias'] + '. More info:'
+ if k in d_study:
+ s += ' ' + d_study[k]['title'] + ' ' + d_study[k]['description']
+ if k in d_sample:
+ s += ' ' + d_sample[k]['title'] + ' ' + d_sample[k]['description']
+ if k in d_experiment:
+ s += ' ' + d_experiment[k]['title'] + ' ' + d_experiment[k]['description']
+
+ s = s.strip()
+ d = {}
+ d['tissue'] = get_tissue(s)
+ d['detail'] = s[0:min(MAX_DESCRIPTION_LENGTH, len(s))] + ' ...'
+
+ elif k in d_run2:
+ s = d_run2[k]['title'] + ' ' + d_run2[k]['alias']
+ s = s.strip()
+ d = {}
+ d['tissue'] = get_tissue(s)
+ d['detail'] = s[0:min(MAX_DESCRIPTION_LENGTH, len(s))] + ' ...'
+
+ json_dict[k] = d
+
+fname = '../Data/information/rnaseq_info_database.json'
+with open(fname, 'w') as f:
+ json.dump(json_dict, f, indent=4)
+
+#sys.stderr.write('Check %s. Use this file to display RNA-seq information in the scatterplots. Copy it to Data/information and rename it to rnaseq_info_database.json.\n' % (fname))
diff --git a/Code/parse_ena_xml_test.py b/Code/parse_ena_xml_test.py
new file mode 100644
index 0000000..c12c580
--- /dev/null
+++ b/Code/parse_ena_xml_test.py
@@ -0,0 +1,307 @@
+# Usage: python parse_ena_xml_test.py > rnaseq_info_database.txt
+#
+# Search in this script for 'd_run', 'd_sample', 'd_experiment' and
+# 'd_study', and set their input files. The input files are generated
+# by download_ena_metadata.py (except for d_sample). It also
+# generates a json file called info_database.json, for displaying
+# experimental information in the scatterplot. If the input files are
+# for RNA-seq data, rename info_database.json to
+# rnaseq_info_database.json and move it to Data/information. Also move
+# rnaseq_info_database.txt to Data/information. They are used by
+# html_network.py.
+#
+# Purpose: Get description for RNA-seq data, one for each SRA Run ID.
+# Make rnaseq_info_database.txt and rnaseq_info_database.json. Each
+# line in rnaseq_info_database.txt contains information for a run id.
+#
+# NOTE: you might encounter UnicodeEncodeError when running the
+# program. To avoid that, first type this command:
+# export PYTHONIOENCODING=UTF-8.
+#
+# 22 Feb 2017, slcu, hui
+# 12 Apr 2017, slcu, hui
+# 20 Apr 2017, slcu, hui
+# 30 May 2017, slcu, hui
+# 01 Jun 2017, slcu, hui [added a column sample_id]
+# 19 Jun 2017, slcu, hui [added SraRunTable_Ath_Tax3702.txt in d_run2. Search d_run2 for how to get SraRunTable_Ath_Tax3702.txt.]
+
+import os, json, re, operator
+import xml.etree.ElementTree
+import sys
+
+MAX_DESCRIPTION_LENGTH = 600 # max number of characters to keep in json file
+
+def parse_SraRunTable(fname):
+ d = {}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if not line.startswith('#') and not line.startswith('Assay_Type_s') and line.lower().startswith('rna-seq'):
+ lst = line.split('\t')
+ acc = lst[17]
+ if not acc in d:
+ d[acc] = {}
+ d[acc]['experiment_id'] = lst[6] if lst[6] != '' else '.'
+ d[acc]['sample_id'] = (lst[4] + '...' + lst[18] + ' ' + lst[20]) if lst[4] != '' else '.'
+ d[acc]['study_id'] = lst[19] if lst[19] != '' else '.'
+ d[acc]['study_id_PRJ'] = lst[3] if lst[3] != '' else '.'
+ d[acc]['alias'] = lst[11] if lst[11] != '' else '.'
+ d[acc]['title'] = lst[20] if lst[20] != '' else '.'
+ return d
+
+def parse_run(fname):
+ d = {}
+
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+ for c in root.findall('RUN'):
+ acc = c.get('accession')
+ d[acc] = {}
+
+        alias = c.get('alias')
+        d[acc]['alias'] = alias if alias != None else '.'
+
+        experiment = c.find('EXPERIMENT_REF').get('accession')
+        d[acc]['experiment_id'] = experiment
+
+        title = c.find('TITLE')
+        d[acc]['title'] = title.text if title != None and title.text != None else '.'
+
+ d[acc]['study_id'] = '.'
+ for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+ s = i.text
+ #print(s)
+ if 'RP' in s: # run project
+ d[acc]['study_id'] = s
+ break
+ d[acc]['sample_id'] = '.'
+ for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+ s = i.text
+            if 'RS' in s: # run sample
+ d[acc]['sample_id'] = s
+ break
+
+ return d
+
+
+def parse_study(fname):
+ d = {}
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+
+ for c in root.findall('PROJECT'):
+ d2 = {}
+ acc = c.find('./IDENTIFIERS/SECONDARY_ID')
+ if acc != None:
+ d2['secondary_id'] = acc.text
+ else:
+ d2['secondary_id'] = '.'
+ d2['primary_id'] = c.get('accession')
+
+        desc = c.find('DESCRIPTION')
+        d2['description'] = 'None'
+        if desc != None and desc.text != None:
+            d2['description'] = desc.text
+
+        title = c.find('TITLE')
+        d2['title'] = 'None'
+        if title != None and title.text != None:
+            d2['title'] = title.text
+
+ run_id = ''
+ for i in c.findall('./PROJECT_LINKS/PROJECT_LINK/XREF_LINK/ID'):
+ s = i.text
+ if 'RR' in s:
+                run_id = s
+ break
+ lst = run_id.split(',')
+ for x in lst:
+ lst2 = x.split('-')
+ if len(lst2) == 1 and lst2[0] != '':
+ k = lst2[0]
+ d[k] = d2 # k is run id, such as SRR, ERR or DRR
+ elif len(lst2) == 2:
+ ss = lst2[0]
+ ee = lst2[1]
+ first_three_letters = ss[0:3]
+ sz = len(ss) - 3
+ ss_t = int(ss[3:])
+ ee_t = int(ee[3:])
+ for j in range(ss_t, ee_t+1, 1):
+ k = first_three_letters + str(j).zfill(sz)
+ d[k] = d2
+ return d
+
+
+def parse_sample(fname):
+ d = {}
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+
+ for c in root.findall('SAMPLE'):
+ d2 = {}
+ acc = c.find('./IDENTIFIERS/EXTERNAL_ID')
+ if acc != None:
+ d2['external_id'] = acc.text
+ else:
+ d2['external_id'] = '.'
+ d2['primary_id'] = c.get('accession')
+
+ desc = c.find('DESCRIPTION')
+ d2['description'] = 'None'
+ if desc != None and desc.text != None:
+ d2['description'] = desc.text
+
+ title = c.find('TITLE')
+ d2['title'] = 'None'
+ if title != None and title.text != None:
+ d2['title'] = title.text
+
+ tissue_type = ''
+ for i in c.findall('./SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE/VALUE'):
+ if i != None and i.text != None:
+ tissue_type += i.text + ' '
+ d2['tissue'] = tissue_type.strip()
+
+ run_id = ''
+ for i in c.findall('./SAMPLE_LINKS/SAMPLE_LINK/XREF_LINK/ID'):
+ s = i.text
+ if 'RR' in s:
+                run_id = s
+ break
+ lst = run_id.split(',')
+ for x in lst:
+ lst2 = x.split('-') # e.g., SRR520490-SRR520491
+ if len(lst2) == 1 and lst2[0] != '':
+ k = lst2[0]
+ d[k] = d2 # k is run id, such as SRR, ERR or DRR
+ elif len(lst2) == 2:
+ ss = lst2[0]
+ ee = lst2[1]
+ first_three_letters = ss[0:3]
+ sz = len(ss) - 3
+ ss_t = int(ss[3:])
+ ee_t = int(ee[3:])
+ for j in range(ss_t, ee_t+1, 1):
+ k = first_three_letters + str(j).zfill(sz)
+ d[k] = d2
+ return d
+
+
+def parse_experiment(fname):
+ d = {}
+
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+ for c in root.findall('EXPERIMENT'):
+ d2 = {}
+ d2['primary_id'] = c.get('accession')
+
+ title = c.find('TITLE')
+ d2['title'] = 'None'
+ if title != None and title.text != None:
+ d2['title'] = title.text
+
+ desc = c.find('./DESIGN/DESIGN_DESCRIPTION')
+ d2['description'] = 'None'
+ if desc != None and desc.text != None:
+ d2['description'] = desc.text
+
+ run_id = ''
+ for i in c.findall('./EXPERIMENT_LINKS/EXPERIMENT_LINK/XREF_LINK/ID'):
+ s = i.text
+ if 'RR' in s:
+                run_id = s
+ break
+ lst = run_id.split(',')
+ for x in lst:
+ lst2 = x.split('-') # e.g., SRR520490-SRR520491
+ if len(lst2) == 1 and lst2[0] != '':
+ k = lst2[0]
+ d[k] = d2 # k is run id, such as SRR, ERR or DRR
+ elif len(lst2) == 2:
+ ss = lst2[0]
+ ee = lst2[1]
+ first_three_letters = ss[0:3]
+ sz = len(ss) - 3
+ ss_t = int(ss[3:])
+ ee_t = int(ee[3:])
+ for j in range(ss_t, ee_t+1, 1):
+ k = first_three_letters + str(j).zfill(sz)
+ d[k] = d2
+ return d
+
+
+def get_singular_form(w):
+    d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petal', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovules':'ovule', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum'}
+ if w in d:
+ return d[w]
+ return w
+
+def get_tissue(s):
+ ''' Extract tissue name from s. s may contain several tissue names, return them ordered by frequency. '''
+
+
+ lst = ['seedling', 'seedlings', 'root', 'roots', 'leaves', 'leaf', 'flower', 'flowers', 'floral', 'shoot', 'shoots', 'apex', 'apices', 'stamen', 'stem', 'stems', 'seed', 'seeds', 'petal', 'petals', 'sepal', 'sepals', 'embryo', 'embryos', 'embryonic', 'cotyledon', 'cotyledons', 'xylem', 'hair', 'hairs', 'phloem', 'pericycle', 'primordia', 'columella', 'cortex', 'meristem', 'meristems', 'cambium', 'epidermis', 'epidermal', 'phloem', 'mesophyll', 'apical', 'lateral', 'intercalary', 'parenchyma', 'collenchyma', 'sclerenchyma', 'bud', 'buds', 'endosperm', 'colletotrichum', 'stele', 'vacuoles', 'vacuole', 'vacuolar', 'tip', 'tips', 'pollen', 'hypocotyl', 'hypocotyls', 'tube', 'tubes', 'basal', 'stomatal', 'stomata', 'surface', 'progeny', 'ovules', 'carpel', 'carpels', 'gynoecium', 'pistil', 'pistils', 'anthers', 'anther', 'endodermis', 'dicotyledonous', 'hyphae', 'adabaxial', 'axial', 'cauline', 'rosette', 'pedicle', 'pedicel', 'inflorescence', 'petiole', 'lamina', 'vascular', 'bundle', 'sheath'] # possible tissue names, lower case. refer to /home/hui/network/test/rnaseq.word.count.txt for distinct words in rna seq. rnaseq.word.count.txt is generated by /home/hui/network/test/count_word.py
+
+ # build a count dictionary, where key is a word
+ d = {}
+ s = s.lower()
+    wlst = re.sub(r"[^\w]", " ", s).split() # a list of words in s. http://stackoverflow.com/questions/6181763/converting-a-string-to-a-list-of-words
+ for w in wlst:
+ if w in lst:
+ w2 = get_singular_form(w)
+ if not w2 in d:
+ d[w2] = 1
+ else:
+ d[w2] += 1
+ if len(d) == 0:
+ return 'unknown'
+
+ tlst = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
+ result = ''
+ for t in tlst:
+ result += '%s(%d);' % (t[0], t[1])
+ return result.rstrip(';')
+
+
+## main
+
+# ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, EXPERIMENT. So each
+# of the following parse functions handles one type of xml file.
+d_run = parse_run('ena_rnaseq_read_run.xml') # RUN
+
+cmd = 'export PYTHONIOENCODING=UTF-8' # xml files contain non-ascii characters; note this export only affects the spawned subshell, so PYTHONIOENCODING=UTF-8 should also be set in the invoking shell (see header) to avoid encoding errors during redirection
+os.system(cmd)
+
+d_run_keys = d_run.keys()
+d_run_keys = list(set(d_run_keys))
+for k in sorted(d_run_keys):
+ lst = []
+ lst.append(k)
+ if k in d_run and 'illumina hiseq' in d_run[k]['title'].lower() and 'rna-seq' in d_run[k]['title'].lower():
+ lst.append( d_run[k]['experiment_id'])
+ lst.append( d_run[k]['study_id'] )
+ lst.append( d_run[k]['title'] )
+ lst.append( d_run[k]['alias'] )
+ print('\t'.join(lst))
+
+# make a json file as well. this file is used to display rna-seq information in scatterplots.
+json_dict = {}
+for k in sorted(d_run_keys):
+ if k in d_run and 'illumina hiseq' in d_run[k]['title'].lower() and 'rna-seq' in d_run[k]['title'].lower():
+ s = 'Title: ' + d_run[k]['title'] + '. Alias: ' + d_run[k]['alias'] + '. More info:'
+ s = s.strip()
+ d = {}
+ d['tissue'] = get_tissue(s)
+ d['detail'] = s[0:min(MAX_DESCRIPTION_LENGTH, len(s))] + ' ...'
+
+ json_dict[k] = d
+
+fname = 'rnaseq_info_database.json'
+with open(fname, 'w') as f:
+ json.dump(json_dict, f, indent=4)
+
+#sys.stderr.write('Check %s. Use this file to display RNA-seq information in the scatterplots. Copy it to Data/information and rename it to rnaseq_info_database.json.\n' % (fname))
diff --git a/Code/prepare_gene_file.py b/Code/prepare_gene_file.py
new file mode 100644
index 0000000..febcbef
--- /dev/null
+++ b/Code/prepare_gene_file.py
@@ -0,0 +1,79 @@
+# Usage: python prepare_gene_file.py all-ath-gene-position.txt > gene_file.txt
+# all-ath-gene-position contains all gene IDs in the genome, and their positions, orientation, etc, in BED format.
+# See ../Data/information/all-ath-gene-position.txt
+# Purpose: get gene name and gene annotation
+# 2 JAN 2017 hui SLCU
+
+import sys
+import os
+
+###################################################################################
+GENE_DESCRIPTION = '../Data/information/gene_description_20140101.txt'
+AGI_TO_GENE_NAMES = '../Data/information/AGI-to-gene-names.txt'
+###################################################################################
+
+
+def get_description(x, d):
+ result = ''
+ if x in d:
+ result = '\t' + d[x]
+ else:
+ result = '\tNot Found'
+ return result
+
+
+def make_AGI_to_gene_name_dict(fname):
+ d = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ agi = lst[0]
+ name = lst[1]
+ if not agi in d:
+ d[agi] = name
+ else:
+ d[agi] += ';' + name
+ f.close()
+ return d
+
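+# Illustrative input/output (hypothetical file contents): a line
+# 'AT1G01010<TAB>NAC001' gives d['AT1G01010'] == 'NAC001'; a second line
+# 'AT1G01010<TAB>ANAC001' would extend the value to 'NAC001;ANAC001'.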
+
+## main
+
+# make a dictionary of gene description
+f0 = open(GENE_DESCRIPTION)
+d = {}
+for line in f0:
+ line = line.strip()
+ lst = line.split('\t')
+ id = lst[0]
+ id = id[0:9] # AGI id, omit .1, .2, .3, etc
+ s = '\t'.join(lst[1:])
+ if not id in d:
+ d[id] = s
+ else:
+ d[id] += '\t' + s
+
+f0.close()
+
+agi2genename_dict = make_AGI_to_gene_name_dict(AGI_TO_GENE_NAMES)
+
+locus_file = sys.argv[1] # location of genes
+f = open(locus_file) # see all-ath-gene-position.txt
+for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ x = lst[3]
+ c = lst[0]
+ ss = lst[1]
+ ee = lst[2]
+ strand = lst[5]
+ result = [x]
+ if x in agi2genename_dict and not x == agi2genename_dict[x]:
+ result.append(agi2genename_dict[x])
+ else:
+ result.append(' ') # if no gene name, use a space
+ result.extend([c, ss, ee, strand, get_description(x, d)])
+ print('\t'.join(result))
+
+f.close()
diff --git a/Code/process_3way_interaction.py b/Code/process_3way_interaction.py
new file mode 100644
index 0000000..0be9b59
--- /dev/null
+++ b/Code/process_3way_interaction.py
@@ -0,0 +1,48 @@
+# Purpose: convert results from three-way interaction analysis to edge format.
+# Usage: python process_3way_interaction.py output20160911.txt > edges.txt.interaction.wiggelab.timecourse
+# Created on 9 Aug 2019 by Hui Lan
+
+from geneid2name import make_gene_name_AGI_map_dict, get_gene_name
+
+def get_2tf_1target_1score(s):
+ '''
+ s looks like 'AT1G73870_AT1G73870, AT5G10570_AT5G10570, AT2G05100_LHCB2.1 19.287 | 0.843 0.998 0.155 | -0.915 0.924 1.839 | 0.918 -0.419'
+ '''
+ lst = s.split()
+ tf1 = lst[0].split('_')[0]
+ tf2 = lst[1].split('_')[0]
+ target = lst[2].split('_')[0]
+ score = lst[3]
+ return (tf1, tf2, target, score)
+
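+# For the docstring example above, get_2tf_1target_1score returns
+# ('AT1G73870', 'AT5G10570', 'AT2G05100', '19.287'); note the score is kept
+# as a string.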
+
+## main
+import sys
+from datetime import datetime
+
+f = open(sys.argv[1])
+lines = f.readlines()
+f.close()
+
+
+agi2name_dict = make_gene_name_AGI_map_dict('../Data/information/AGI-to-gene-names_v2.txt')
+
+result = ''
+for line in lines:
+ line = line.strip()
+ tf1, tf2, target, interaction_score_str = get_2tf_1target_1score(line)
+ target_str = target + ' ' + get_gene_name(target, agi2name_dict)
+ tf1_str = tf1 + ' ' + get_gene_name(tf1, agi2name_dict)
+ tf2_str = tf2 + ' ' + get_gene_name(tf2, agi2name_dict)
+ score_str = '0.5'
+ cond_str = '.'
+ curr_date = datetime.now().strftime('%Y%m%d')
+ method_or_tissue = 'interact.with.%s' % (tf2 + '(' + get_gene_name(tf2, agi2name_dict) + ')')
+ s = '\t'.join([target_str, tf1_str, score_str, 'mix', '15', cond_str, '.', curr_date, interaction_score_str.replace('-',''), method_or_tissue])
+ result += s + '\n'
+ method_or_tissue = 'interact.with.%s' % (tf1 + '(' + get_gene_name(tf1, agi2name_dict) + ')')
+ s = '\t'.join([target_str, tf2_str, score_str, 'mix', '15', cond_str, '.', curr_date, interaction_score_str.replace('-',''), method_or_tissue])
+ result += s + '\n'
+
+
+print(result)
diff --git a/Code/process_3way_interaction2.py b/Code/process_3way_interaction2.py
new file mode 100644
index 0000000..7ce26b3
--- /dev/null
+++ b/Code/process_3way_interaction2.py
@@ -0,0 +1,58 @@
+# Purpose: convert results from three-way interaction analysis to edge format.
+# Usage: python process_3way_interaction2.py ../Data/information/summary.txt > edges.txt.interaction.seeddata
+# Created on 9 Aug 2019 by Hui Lan
+
+from geneid2name import make_gene_name_AGI_map_dict, get_gene_name
+
+def get_2tf_1target_1score(s):
+ '''
+ s looks like '398: ( AT3G10490;AT3G10480, AT1G03970, AT5G20910 ) 10.41 2'
+ '''
+
+ lst = s.split()
+ tf1_all = lst[2]
+ tf2_all = lst[3]
+ target = lst[4]
+ score = lst[6]
+ result = []
+ for tf1 in tf1_all.split(';'):
+ tf1 = tf1.replace(',', '')
+ for tf2 in tf2_all.split(';'):
+ tf2 = tf2.replace(',', '')
+ result.append((tf1, tf2, target, score))
+
+ return result
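+
+# For the docstring example above, the returned list is
+# [('AT3G10490', 'AT1G03970', 'AT5G20910', '10.41'),
+#  ('AT3G10480', 'AT1G03970', 'AT5G20910', '10.41')].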
+
+## main
+import sys
+from datetime import datetime
+
+f = open(sys.argv[1])
+lines = f.readlines()
+f.close()
+
+
+agi2name_dict = make_gene_name_AGI_map_dict('../Data/information/AGI-to-gene-names_v2.txt')
+
+result = ''
+for line in lines[2:]:
+ line = line.strip()
+ lst = get_2tf_1target_1score(line)
+ for t in lst:
+ tf1, tf2, target, interaction_score_str = t
+ if tf1.startswith('AT') and tf2.startswith('AT') and target.startswith('AT'):
+ target_str = target + ' ' + get_gene_name(target, agi2name_dict)
+ tf1_str = tf1 + ' ' + get_gene_name(tf1, agi2name_dict)
+ tf2_str = tf2 + ' ' + get_gene_name(tf2, agi2name_dict)
+ score_str = '0.6'
+ cond_str = '.'
+ curr_date = datetime.now().strftime('%Y%m%d')
+ method_or_tissue = 'interact.with.%s' % (tf2 + '(' + get_gene_name(tf2, agi2name_dict) + ')')
+ s = '\t'.join([target_str, tf1_str, score_str, 'mix', '138', cond_str, '.', curr_date, interaction_score_str.replace('-',''), method_or_tissue])
+ result += s + '\n'
+ method_or_tissue = 'interact.with.%s' % (tf1 + '(' + get_gene_name(tf1, agi2name_dict) + ')')
+ s = '\t'.join([target_str, tf2_str, score_str, 'mix', '138', cond_str, '.', curr_date, interaction_score_str.replace('-',''), method_or_tissue])
+ result += s + '\n'
+
+
+print(result)
diff --git a/Code/refine_tissue.py b/Code/refine_tissue.py
new file mode 100644
index 0000000..8bc111c
--- /dev/null
+++ b/Code/refine_tissue.py
@@ -0,0 +1,302 @@
+# Usage: python refine_tissue.py > ../Data/information/experiment.and.tissue.2.txt
+# Input: ../Data/temp/experiment.and.tissue.1.txt (generated by
+# assign_tissue.py); see the cmd variable in main below.
+# Purpose: for each RNA-seq in experiment.and.tissue.1.txt, add a column suggested.tissue as its tissue annotation.
+#
+# 2 June 2017, slcu, hui
+# Last modified 19 June 2017, slcu, hui
+
+import os, sys, operator
+import string
+
+
+def get_singular_form(w):
+    d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petal', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovules':'ovule', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum', 'whole plant':'seedling', 'inflorescence':'flower.inflorescence', 'inflorescences':'flower.inflorescence', 'whole seedling':'seedling', 'whole rosette':'leaf.rosette', 'whole aerial seedling':'seedling.aerial', 'vegatative shoot apical meristem':'shoot.apical.meristem', 'inflorescence containing stage 8 and younger flowers':'flower.inflorescence', 'plant roots':'root', 'entire vegetative rosette':'leaf.rosette', 'fungal-colonized plant roots':'root.fungal.colonized', 'rosettes - 5 leaves stage':'leaf.rosette', '2-week old seedlings without roots':'seedling.no.roots', 'immature inflorescence':'inflorescence.immature', 'rosette leaves':'leaf.rosette', 'plant seedling':'seedling', 'entire aerial part':'aerial.tissue', '14-d-old entire seedlings':'seedling', 'rosette leaf':'leaf.rosette', 'whole seedlings':'seedling', 'etiolated 5d-old seedlings':'seedling.etiolated', 'root structure':'root', 'mature leaves':'leaf.mature', 'root tip':'root.tip', '10d-old seedling':'seedling','primary root tip':'root.tip',
+ 'epidermis including guard cells':'epidermis',
+ 'root tip tissue':'root.tip',
+ 'anther stage 4-7':'flower.anther',
+ 'anther':'flower.anther',
+ 'embryo':'seed.embryo',
+ 'etiolated seedlings':'seedling.etiolated',
+ '21 days-old seedlings':'seedling',
+ 'aerial tissue':'aerial.tissue',
+ 'endosperm':'seed.endosperm',
+ 'whole seed':'seed',
+ 'pistils pollinated for 8 hours':'flower.pistil.pollinated',
+ 'primary root':'root',
+ 'whole floral bud':'flower.bud',
+ 'whole seedling root':'seedling.root',
+ 'whole plants':'seedling',
+ 'aerial shoots':'shoot',
+ 'flower bud':'flower.bud',
+ 'aerial seedling':'seedling.aerial',
+ 'anthers at stage 4-7':'flower.anther',
+ 'carpels (collected manually from 15 developing inflorescences)':'flower.carpel',
+ 'ath_shoot_meristem_1':'shoot.meristem',
+ 'ath_whole_plant_1':'seedling',
+ 'ath_whole_plant_2':'seedling',
+ 'whole seeds':'seed',
+ '3-day-old root':'root',
+ 'unopened flower buds':'flower.bud',
+ 'first true leaf':'leaf',
+ '7 dag seedlings':'seedling',
+ 'facs-sorted protoplasts from aerial tissue of 10-day old seedlings':'seedling.protoplasts',
+ 'inflorescences and siliques':'inflorescences.and.siliques',
+ 'Epidermis including guard cells epidermis including guard cells':'leaf.stomata.epidermis',
+ 'base stem':'stem',
+ 'siliques':'silique',
+ 'whole organism':'seedling',
+ 'seedling shoot':'seedling.shoot',
+ '10-day-old seedlings and inflorescences from 25-day-old plants':'seedling.and.inflorescence',
+ 'shoot apical meristem':'shoot.apical.meristem',
+ 'expanded mature leaves from 28 day old plants':'leaf',
+ 'aerial tissues of 15 day seedlings': 'aerial.tissue',
+ 'whole parts':'seedling',
+ 'aerial organs':'aerial.tissue',
+ 'lower stem':'stem',
+ 'upper stem':'stem',
+ 'rosette':'leaf.rosette',
+ 'root and shoot':'root.and.shoot',
+ 'cell culture':'cell.culture',
+ 'aerial part':'aerial.tissue',
+ 'aerial':'aerial.tissue',
+ 'whole plantlet without root':'seedling',
+ 'sorted endodermis (facs)':'endodermis.facs-sorted',
+ 'whole root':'root',
+ 'siluge without seeds':'seed',
+ 'first internode':'stem',
+ 'rosettes':'leaf.rosette',
+ 'hypocotyl':'seedling.hypocotyl',
+ 'somatic embryo':'seed.embryo'
+ }
+ if w in d:
+ return d[w]
+ return w
+
+def remove_parenthese(s):
+ if '(' in s:
+ return s[:s.find('(')]
+ return s
+
+
+
+def make_singular(lst):
+ result = []
+ # map plural to singular
+ d = {'roots':'root', 'shoots':'shoot',
+ 'leaves':'leaf', 'flowers':'flower',
+ 'anthers':'anther', 'hairs':'hair',
+ 'seedlings':'seedling', 'apices':'apex',
+ 'buds':'bud', 'siliques':'silique',
+ 'rosettes':'rosette', 'meristems':'meristem',
+ 'sepals':'sepal', 'petals':'petal',
+ 'inflorescences':'inflorescence', 'carpels':'carpel',
+ 'seeds':'seed', 'pistils':'pistil',
+ 'stamens':'stamen', 'ovules':'ovule',
+ 'tissues':'tissue', 'ovaries':'ovary',
+ 'veins':'vein', 'nodes':'node',
+ 'internodes':'internode', 'fibres':'fibre',
+ 'hypocotyls':'hypocotyl', 'cotyledons':'cotyledon',
+ 'plants':'plant', 'embryos':'embryo'}
+
+ for x in lst:
+ if x in d:
+ result.append(d[x])
+ else:
+ result.append(x)
+ return result
+
+def map_tissue(s):
+ ''' given a string s, if all words in a key of d are in s, then the corresponding value is a likely tissue. '''
+ d = {
+ 'hypocotyl':'seedling.hypocotyl',
+ 'hypocotyl seedling':'seedling.hypocotyl',
+ 'leaf':'leaf',
+ 'leaf petiole':'leaf.petiole',
+ 'petiole':'leaf.petiole',
+ 'leaf blade':'leaf.blade',
+ 'leaf first true':'leaf',
+ 'leaf stomata':'leaf.stomata',
+ 'stomata':'leaf.stomata',
+ 'chlorophyll':'leaf.chlorophyll',
+ 'vein':'leaf.vein',
+ 'leaf vein':'leaf.vein',
+ 'leaf lamina':'leaf.lamina',
+        'leaf rosette':'leaf.rosette',
+ 'rosette':'leaf.rosette',
+ 'rosette leaf':'leaf.rosette',
+ 'shoot':'shoot',
+ 'aerial shoot':'aerial.shoot',
+ 'shoot apex':'shoot.apex',
+ 'shoot tip':'shoot.apex',
+ 'flower':'flower',
+ 'flower petal':'flower.petal',
+ 'flower sepal':'flower.sepal',
+ 'flower stamen':'flower.stamen',
+ 'flower anther':'flower.anther',
+ 'flower carpel':'flower.carpel',
+ 'flower pistil':'flower.pistil',
+ 'flower inflorescence':'flower.inflorescence',
+ 'stigma':'flower.stigma',
+ 'filament':'flower.filament',
+ 'style':'flower.style',
+ 'anther':'flower.anther',
+ 'petal':'flower.petal',
+ 'sepal':'flower.sepal',
+ 'stamen':'flower.stamen',
+ 'carpel':'flower.carpel',
+ 'pistil':'flower.pistil',
+ 'ovary':'flower.ovary',
+ 'pedicel':'flower.pedicel',
+ 'ovule':'flower.ovule',
+ 'inflorescence':'flower.inflorescence',
+ 'seed':'seed',
+ 'epicotyl':'seed.epicotyl',
+ 'radicle':'seed.radicle',
+ 'embryo':'seed.embryo',
+ 'endosperm':'seed.endosperm',
+ 'endodermis':'endodermis',
+ 'stem':'stem',
+ 'pith':'pith',
+ 'protoxylem':'protoxylem',
+ 'xylem':'xylem',
+ 'phloem':'phloem',
+ 'sclerenchyma':'sclerenchyma',
+ 'bast fibre':'bast.fibre',
+ 'cortex':'cortex',
+ 'parenchyma':'parenchyma',
+ 'mesophyll':'leaf.mesophyll',
+ 'shoot apical meristem':'meristem.shoot.apical',
+        'root apical meristem':'meristem.root.apical',
+ 'apical meristem':'meristem.apical',
+ 'floral meristem':'meristem.floral',
+ 'inflorescence meristem':'meristem.inflorescence',
+ 'meristem':'meristem',
+ 'meristem shoot':'meristem.shoot',
+ 'cotyledon':'cotyledon',
+ 'apical':'apical',
+ 'basal':'basal',
+ 'root':'root',
+ 'root apex':'root.apex',
+ 'root tip':'root.tip',
+ 'root primary tip':'root.primary.tip',
+ 'root cap':'root.cap',
+ 'root lateral':'root.lateral',
+ 'root primary':'root.primary',
+ 'root hair':'root.hairs',
+ 'bud':'bud',
+ 'bud axillary':'bud.axillary',
+ 'bud lateral':'bud.axillary',
+ 'bud apical':'bud.apical',
+ 'bud floral':'bud.flower',
+ 'bud flower':'bud.flower',
+ 'bud meristem':'bud.meristem',
+ 'internode':'stem.internode',
+ 'node':'stem.node',
+ 'vascular':'vasculum',
+ 'epidermis':'epidermis',
+ 'seedling':'seedling',
+ 'plant':'seedling',
+ 'whole plant':'seedling',
+ 'whole':'seedling',
+ 'whole parts':'seedling',
+ 'whole root':'root',
+ 'seedling root':'seedling.root',
+ 'seedling shoot':'seedling.shoot',
+ 'seedling etiolated':'seedling.etiolated',
+ 'aerial':'aerial',
+ 'aerial tissue':'aerial.tissue',
+ 'aerial seedling':'seedling.aerial',
+ 'silique':'silique',
+ 'unknown':'unknown',
+ 'siluge':'seed',
+ 'bundle sheath':'leaf'
+ }
+ result = [] # a list of tuples, (tissue, word count)
+ s = s.lower()
+ slst = s.split()
+ slst2 = make_singular(slst)
+ for k in d: # search each key in d
+ klst = k.split()
+ count = 0
+ exact_count = 0
+ for x in klst:
+ count += slst2.count(x)
+ if x in slst2:
+ exact_count += 1
+ if count >= len(klst) and exact_count == len(klst):
+ result.append((d[k], count))
+ if result == []:
+ return 'unknown'
+ else:
+ sresult = sorted(result, key=operator.itemgetter(1), reverse=True)
+ return sresult[0][0]
+
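+# Illustrative example: map_tissue('shoot apical meristem') matches the keys
+# 'shoot apical meristem' (count 3), 'apical meristem' and 'meristem shoot'
+# (count 2 each), plus the single words 'shoot', 'apical' and 'meristem';
+# the highest count wins, so the function returns 'meristem.shoot.apical'.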
+
+def repeat_words(s):
+ ''' s in the form of meristem(2) '''
+ s = s.strip()
+ index = s.find('(')
+ if index < 0:
+ return s
+ index2 = s.find(')')
+ word = s[:index]
+ n = s[(index+1):index2]
+ n = int(n)
+ return ' '.join(n*[word])
+
+def get_words(s):
+ ''' s in the form meristem(2);leaf(2);bud(1) or shoot.meristem '''
+ lst = s.split(';')
+ result = []
+ for x in lst:
+ index = x.find('(')
+ if index >= 0:
+ t = repeat_words(x)
+ result.append(t)
+ else:
+ t = x
+ if '.' in t:
+ for y in t.split('.'):
+ result.append(y)
+ return ' '.join(result)
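+
+# Illustrative examples: get_words('meristem(2);leaf(1)') returns
+# 'meristem meristem leaf'; get_words('shoot.meristem') returns 'shoot meristem'.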
+
+def remove_punctuation(s):
+ return s.replace('_', ' ')
+
+# main
+
+if os.path.exists('../Data/temp/experiment.and.tissue.1.txt'):
+ cmd = 'cut -f 1-4 ../Data/temp/experiment.and.tissue.1.txt > ../Data/temp/a.txt' # generated by python assign_tissue.py
+ os.system(cmd)
+else:
+ print('Run python assign_tissue.py > ../Data/temp/experiment.and.tissue.1.txt first.')
+ sys.exit()
+
+f = open('../Data/temp/a.txt')
+print('run.id\tinferred.tissue\tbiosample.tissue\tbiosample.id\tsuggested.tissue')
+for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+
+ if lst[2] != 'part_unknown' and lst[2] != '.':
+ s = lst[2].lower()
+ s = map_tissue(remove_punctuation(s))
+ # elif lst[2] == '.':
+ # s = lst[1]
+ # s = remove_parenthese(s)
+ else:
+ s = lst[1]
+ if not ';' in s:
+ s = remove_parenthese(s)
+ else:
+ s = get_words(s)
+
+ s = map_tissue(remove_punctuation(s))
+
+ print(line + '\t' + s)
+f.close()
diff --git a/Code/rnaseq_or_chipseq.py b/Code/rnaseq_or_chipseq.py
new file mode 100644
index 0000000..7d955eb
--- /dev/null
+++ b/Code/rnaseq_or_chipseq.py
@@ -0,0 +1,92 @@
+# Usage: python rnaseq_or_chipseq.py all.short.reads.txt > table.txt
+#
+# Purpose: check if a RUN id in all.short.reads.txt is RNA-seq or ChIP-seq or other. Adapted from parse_ena_xml.py.
+#
+# Note: to avoid encoding error while writing to output.txt, first type this command: export PYTHONIOENCODING=UTF-8.
+#
+# 13 Apr 2017, slcu, hui
+
+import os, json, re, operator, sys
+import xml.etree.ElementTree
+
+MAX_DESCRIPTION_LENGTH = 600 # max number of characters to keep in json file
+
+def parse_run(fname):
+ d = {}
+
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+ for c in root.findall('RUN'):
+ acc = c.get('accession')
+ d[acc] = {}
+
+        alias = c.get('alias')
+        d[acc]['alias'] = alias if alias != None else '.'
+
+        experiment = c.find('EXPERIMENT_REF').get('accession')
+        d[acc]['experiment_id'] = experiment
+
+        title = c.find('TITLE')
+        d[acc]['title'] = title.text if title != None and title.text != None else '.'
+
+ d[acc]['study_id'] = '.'
+ for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+ s = i.text
+ #print(s)
+ if 'RP' in s: # run project
+ d[acc]['study_id'] = s
+ break
+ d[acc]['sample_id'] = '.'
+ for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+ s = i.text
+            if 'RS' in s: # run sample
+ d[acc]['sample_id'] = s
+ break
+
+ return d
+
+def get_key(s):
+ if '_' in s:
+ return s[:s.find('_')]
+ else:
+ return s[:s.find('.')]
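+
+# Illustrative examples (hypothetical file names):
+# get_key('SRR1234567_1.fastq.gz') -> 'SRR1234567' (cut at the first '_');
+# get_key('SRR1234567.fastq.gz')   -> 'SRR1234567' (cut at the first '.').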
+
+def make_downloaded_dict(fname):
+ f = open(fname)
+ d = {}
+ for line in f:
+ line = line.strip()
+ if line != '' and not line.startswith('#'):
+ if line.startswith('-'):
+ fn = line.split(' ')[-1]
+ else:
+ fn = os.path.basename(line)
+ k = get_key(fn)
+ if not k in d:
+ d[k] = [fn]
+ else:
+ d[k].append(fn)
+ d[k] = sorted(list(set(d[k])))
+ return d
+
+## main
+
+cmd = 'export PYTHONIOENCODING=UTF-8' # xml files contain non-ascii characters; note this export only affects the spawned subshell, so PYTHONIOENCODING=UTF-8 should also be set in the invoking shell (see header) to avoid encoding errors during redirection
+os.system(cmd)
+
+d = make_downloaded_dict(sys.argv[1])
+
+# ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, EXPERIMENT. So each
+# of the following parse functions handles one type of xml file.
+d_rnaseq_run = parse_run('../Data/information/ena_rnaseq_read_run.xml') # RUN
+d_chipseq_run = parse_run('../Data/information/ena_ChIP-Seq_read_run.xml') # RUN
+
+for k in sorted(d.keys()):
+ s = k + '\t' + ' '.join(d[k])
+ if k in d_rnaseq_run:
+ s += '\t' + 'RNA-seq'
+ elif k in d_chipseq_run:
+ s += '\t' + 'ChIP-seq'
+ else:
+ s += '\t' + '.'
+ print(s)
diff --git a/Code/slice_TPM_to_JSON.py b/Code/slice_TPM_to_JSON.py
new file mode 100644
index 0000000..e597b78
--- /dev/null
+++ b/Code/slice_TPM_to_JSON.py
@@ -0,0 +1,164 @@
+# Usage: python slice_TPM_to_JSON.py parameter_for_net.txt
+#
+# Purpose: Given the matrix TPM.txt, make logarithmised gene
+# expression in json format for each gene. Put the results in
+# JSON_DIR. The results are used for displaying scatterplots in
+# Webapp.
+#
+# Last modified 24 Apr 2017, slcu, hui [use r to do the job, faster]
+
+import sys, os, operator, itertools
+import numpy as np
+import json
+
+JSON_DIR = '../Data/history/expr/json' # contain json for all genes, one json file for each gene. Each json file has the following format {"R0ERR046550XXX": 2.8148097376737438, "R0ERR031542XXX": 2.5193080765053328, ...}
+
+GLB_PARAM_SYMBOL = '%%'
+DATA_SYMBOL = '@'
+
+# read expression TPM
+def read_matrix_data(fname):
+ '''
+ fname - a file, first line is head, first column is row name.
+ '''
+
+ lineno = 0
+ colid = []
+ rowid = []
+ d = {} # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
+ d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
+ d3 = {} # {gene1: [], gene2: [], ...}
+ d4 = {} # {cond1:[], cond2:[], ...}
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+
+ head_line = lines[0].strip()
+ lst = head_line.split()
+ colid = lst[1:]
+
+ for c in colid:
+ d2[c] = {}
+ d4[c] = []
+
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split()
+ g = lst[0]
+ rowid.append(g)
+ d[g] = {}
+ levels = lst[1:]
+ if len(levels) != len(colid):
+ print('Incomplete columns at row %s' % (g))
+ sys.exit()
+
+ d3[g] = []
+ for i in range(len(colid)):
+ c = colid[i]
+ d[g][c] = float(levels[i])
+ d2[c][g] = float(levels[i])
+ d3[g].append(float(levels[i]))
+ d4[c].append(float(levels[i]))
+ lineno += 1
+
+ d_return = {}
+ d_return['xy'] = d # first gene, then condition
+ d_return['yx'] = d2 # first condition, then gene
+ d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row
+ d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column
+ d_return['nrow'] = lineno - 1
+ d_return['ncol'] = len(colid)
+ d_return['rowid'] = rowid
+ d_return['colid'] = colid
+
+ # d4_sorted = {}
+ # for k in d4:
+ # d4_sorted[k] = sorted(d4[k], reverse=True)
+ # d_return['yy_sorted'] = d4_sorted
+
+ return d_return
+
+
+def get_key_value(s):
+ lst = s.split('=')
+ k, v = lst[0], lst[1]
+ return (k.strip(), v.strip())
+
+
+def make_global_param_dict(fname):
+ f = open(fname)
+ d = {}
+ for line in f:
+ line = line.strip()
+ if line.startswith(GLB_PARAM_SYMBOL):
+ s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
+ lst = s.split('\t') # separate items by TAB
+ for x in lst:
+ if x != '':
+ k, v = get_key_value(x)
+ d[k] = v
+ f.close()
+ return d
+
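+# Illustrative parameter line (hypothetical path): a line such as
+# '%%EXPRESSION_MATRIX=../Data/history/expr/TPM.txt<TAB>LOGRITHMIZE=YES'
+# yields d['EXPRESSION_MATRIX'] == '../Data/history/expr/TPM.txt' and
+# d['LOGRITHMIZE'] == 'YES'; items on one line are separated by TABs.
+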
+
+def take_log(x):
+ return np.log(x+1)
+
+
+def make_json_file(expr_dict, dir_name, glb_param_dict):
+ if not os.path.isdir(dir_name): # create the directory if not exist
+ os.makedirs(dir_name)
+
+ d = expr_dict['xy']
+ col_name_lst = expr_dict['colid']
+ row_name_lst = expr_dict['rowid']
+ for g in row_name_lst:
+ #print(g)
+ d2 = d[g]
+ if glb_param_dict['LOGRITHMIZE'].upper() == 'YES':
+ d3 = {k: take_log(v) for k, v in d2.items()}
+ else:
+ d3 = d2
+ filename = os.path.join(dir_name, g + '.json')
+ with open(filename, 'w') as f:
+ json.dump(d3, f)
+
+
+def make_json_file_using_r(dir_name, glb_param_dict): # use r script to make it faster
+ r_code = '''
+ library(rjson)
+ dir.name <- '%s'
+ tpm.file <- '%s'
+ take.log <- '%s'
+ X <- read.table(tpm.file, header=T, check.names=FALSE, sep="\\t")
+ gene.id <- as.vector(X[,1])
+ X[,1] <- NULL # remove first column
+ if (take.log == 'YES') {
+ X <- log(X+1)
+ }
+ if (!dir.exists(dir.name)) {
+ dir.create(dir.name)
+ }
+ for (i in 1:dim(X)[1]) {
+ y <- toJSON(X[i,])
+ file.name = paste(dir.name, paste(gene.id[i], 'json', sep='.'), sep='/')
+ cat(y, file=file.name)
+ }
+ ''' % (
+ dir_name,
+ glb_param_dict['EXPRESSION_MATRIX'],
+ glb_param_dict['LOGRITHMIZE'].upper())
+ f = open('slice_TPM_to_JSON.R', 'w') # make a R script
+ f.write('\n'.join([line.lstrip('\t') for line in r_code.split('\n')]))
+ f.close()
+ os.system('Rscript slice_TPM_to_JSON.R')
+ os.system('rm -f slice_TPM_to_JSON.R')
+
+
+## main
+param_file = sys.argv[1] # a single parameter file
+glb_param_dict = make_global_param_dict(param_file)
+#expr_dict = read_matrix_data(glb_param_dict['EXPRESSION_MATRIX'])
+#make_json_file(expr_dict, JSON_DIR, glb_param_dict) # slower version
+make_json_file_using_r(JSON_DIR, glb_param_dict) # faster version
diff --git a/Code/slice_binding_to_JSON.py b/Code/slice_binding_to_JSON.py
new file mode 100644
index 0000000..6421fed
--- /dev/null
+++ b/Code/slice_binding_to_JSON.py
@@ -0,0 +1,172 @@
+# Usage: python slice_binding_to_JSON.py parameter_for_net.txt
+import sys, os, operator, itertools
+import numpy as np
+import json
+
+JSON_DIR = '../Data/history/bind/json2' # contains json for all genes
+
+GLB_PARAM_SYMBOL = '%%'
+DATA_SYMBOL = '@'
+
+def read_matrix_data(fname):
+ '''
+ fname - a file, first line is head, first column is row name.
+ '''
+
+ lineno = 0
+ colid = []
+ rowid = []
+ d = {} # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
+ d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
+ d3 = {} # {gene1: [], gene2: [], ...}
+ d4 = {} # {cond1:[], cond2:[], ...}
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+
+ head_line = lines[0].strip()
+ lst = head_line.split()
+ colid = lst[1:]
+
+ for c in colid:
+ d2[c] = {}
+ d4[c] = []
+
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split()
+ g = lst[0]
+ rowid.append(g)
+ d[g] = {}
+ levels = lst[1:]
+ if len(levels) != len(colid):
+ print('Incomplete columns at row %s' % (g))
+ sys.exit()
+
+ d3[g] = []
+ for i in range(len(colid)):
+ c = colid[i]
+ d[g][c] = float(levels[i])
+ d2[c][g] = float(levels[i])
+ d3[g].append(float(levels[i]))
+ d4[c].append(float(levels[i]))
+ lineno += 1
+
+ d_return = {}
+ d_return['xy'] = d # first gene, then condition
+ d_return['yx'] = d2 # first condition, then gene
+ d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row
+ d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column
+ d_return['nrow'] = lineno - 1
+ d_return['ncol'] = len(colid)
+ d_return['rowid'] = rowid
+ d_return['colid'] = colid
+
+ d4_sorted = {}
+ for k in d4:
+ d4_sorted[k] = sorted(d4[k], reverse=True)
+ d_return['yy_sorted'] = d4_sorted
+
+ return d_return
+
+# read paramters
+
+
+def get_key_value(s):
+ lst = s.split('=')
+ k, v = lst[0], lst[1]
+ return (k.strip(), v.strip())
+
+
+def get_value(s, delimit):
+ lst = s.split(delimit)
+ return lst[1].strip()
+
+def read_info_data(fname):
+ ''' Read chip-seq data information '''
+
+ if not os.path.exists(fname):
+        print('%s does not exist.' % (fname))
+ sys.exit()
+
+ d = {'ID_LIST':[]}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#') or line.startswith('%'):
+ continue
+ if line.startswith(DATA_SYMBOL):
+ s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ s = s.strip()
+ if s in d:
+ print('ID %s duplicate' % (s))
+ sys.exit()
+ d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ d['ID_LIST'].append(s)
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+ elif line.startswith('PROTEN_NAME:'):
+ d[s]['PROTEN_NAME'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_ID:'):
+ d[s]['PROTEIN_ID'] = get_value(line, ':')
+ elif line.startswith('DATA_NAME:'):
+ d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+
+ return d
+
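+# Illustrative record (hypothetical ID and values) in the BINDING_INFO file:
+# a block starting '@C0001' followed by lines like 'DATA_FORMAT: narrowPeak'
+# and 'DESCRIPTION: ...' yields d['C0001']['DATA_FORMAT'] == 'narrowPeak'
+# and appends 'C0001' to d['ID_LIST'].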
+
+def make_global_param_dict(fname):
+ f = open(fname)
+ d = {}
+ for line in f:
+ line = line.strip()
+ if line.startswith(GLB_PARAM_SYMBOL):
+ s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
+ lst = s.split('\t') # separate items by TAB
+ for x in lst:
+ if x != '':
+ k, v = get_key_value(x)
+ d[k] = v
+ f.close()
+ return d
+
+
+def make_json_file(bind_dict, bind_info_dict, dir_name, glb_param_dict):
+ if not os.path.isdir(dir_name): # create the directory if not exist
+ os.makedirs(dir_name)
+
+ d = bind_dict['xy']
+ col_name_lst = bind_dict['colid']
+ row_name_lst = bind_dict['rowid']
+ for g in row_name_lst:
+ #print(g)
+ d2 = d[g]
+ d3 = {}
+ for k in sorted(d2.keys()):
+ data_type = bind_info_dict[k]['DATA_FORMAT'].upper()
+ if data_type == 'NARROWPEAK':
+ data_type = 'NP' # short name for narrowPeak
+ value = d2[k]
+ d3[k] = {'v':value, 't':data_type}
+ filename = os.path.join(dir_name, g + '.json')
+ with open(filename, 'w') as f:
+ json.dump(d3, f)
+
+
+### main
+param_file = sys.argv[1] # a single parameter file
+glb_param_dict = make_global_param_dict(param_file)
+#print('Read binding matrix ...')
+binding_dict = read_matrix_data(glb_param_dict['BINDING_MATRIX'])
+bind_info_dict = read_info_data(glb_param_dict['BINDING_INFO'])
+#print('Make json files ...')
+make_json_file(binding_dict, bind_info_dict, JSON_DIR, glb_param_dict)
diff --git a/Code/test_network4.py b/Code/test_network4.py
new file mode 100644
index 0000000..44ce492
--- /dev/null
+++ b/Code/test_network4.py
@@ -0,0 +1,205 @@
+# Make tissue specific networks
+
+import os, sys
+from geneid2name import make_gene_name_AGI_map_dict
+
+def get_tfs(fname_lst):
+ d = {}
+ for fname in fname_lst:
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ tf = lst[1].split()[0]
+ if not tf in d:
+ d[tf] = 1
+ else:
+ d[tf] += 1
+ f.close()
+ return d
+
+def get_tissue_from_fname(fname):
+ tissue_lst = [
+ 'seedling',
+ 'meristem',
+ 'flower',
+ 'aerial',
+ 'shoot',
+ 'seed',
+ 'leaf',
+ 'root',
+ 'stem']
+ for x in tissue_lst:
+ if x in fname:
+ return x
+ return 'unknown'
+
+def get_edges_consisting_of_tfs(fname_lst, tf_dict):
+ d = {}
+ for fname in fname_lst:
+ kt = get_tissue_from_fname(fname)
+ d[kt] = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0].split()[0].strip()
+ tf = lst[1].split()[0].strip()
+ k = target + '_' + tf
+ score = float(lst[2])
+ if tf in tf_dict and target in tf_dict:
+ if not k in d[kt]:
+ d[kt][k] = [(lst[0], lst[1], score)]
+ else:
+ d[kt][k].append((lst[0], lst[1], score))
+ f.close()
+ return d
+
+def get_degree(fname_lst, tf_dict):
+ d_out = {}
+ d_in = {}
+ d_all = {}
+ for fname in fname_lst:
+ kt = get_tissue_from_fname(fname)
+ d_out[kt] = {}
+ d_in[kt] = {}
+ d_all[kt] = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0].split()[0].strip()
+ tf = lst[1].split()[0].strip()
+            if True: # the TF-only filter (tf in tf_dict and target in tf_dict) was disabled by the leading 'True or', so degrees are counted over all nodes
+ if not tf in d_out[kt]:
+ d_out[kt][tf] = 1
+ else:
+ d_out[kt][tf] += 1
+
+ if not target in d_in[kt]:
+ d_in[kt][target] = 1
+ else:
+ d_in[kt][target] += 1
+
+ if not target in d_all[kt]:
+ d_all[kt][target] = 1
+ else:
+ d_all[kt][target] += 1
+
+ if not tf in d_all[kt]:
+ d_all[kt][tf] = 1
+ else:
+ d_all[kt][tf] += 1
+
+ f.close()
+ return d_all, d_out, d_in
+
+
+def simplify(s):
+ result = ''
+ lst = s.split('\t')
+ a = (lst[0].split()[1]).split(';')[0]
+ if a == '.':
+ a = lst[0].split()[0]
+ else:
+ a = lst[0].split()[0] + '_' + (lst[0].split()[1]).split(';')[0]
+ b = (lst[1].split()[1]).split(';')[0]
+ if b == '.':
+ b = lst[1].split()[0]
+ else:
+ b = lst[1].split()[0] + '_' + (lst[1].split()[1]).split(';')[0]
+ return '%s\t%s\t%s' % (a, b, lst[2])
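+
+# Illustrative example (hypothetical edge columns): for the line
+# "AT1G01010 NAC001\tAT5G60910 AGL8;FUL\t0.85", simplify returns
+# "AT1G01010_NAC001\tAT5G60910_AGL8\t0.85"; a '.' in the name field leaves
+# the bare AGI identifier.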
+
+# main
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)
+
+edge_file_lst = [
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.seedling.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.meristem.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.flower.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.aerial.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.shoot.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.seed.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.leaf.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.root.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.stem.txt.20170629_203729'
+]
+
+tf_dict = get_tfs(edge_file_lst)
+
+f = open('result.skeleton.txt', 'w')
+print('Total number of TFs: %d' % (len(tf_dict)))
+d0 = get_edges_consisting_of_tfs(edge_file_lst, tf_dict)
+for kt in d0: # kt is tissue
+ f.write('##TF skeleton size in %s: %d.\n' % (kt, len(d0[kt])))
+ d = d0[kt]
+    for k in d: # {'shoot':{'target_tf':[], }, 'flower':{} }
+        lst = d[k]
+        max_score = -9
+        s = ''
+        for x in lst: # keep only the edge with the largest |score| for this target/TF pair
+            if abs(x[2]) > max_score:
+                s = '%s\t%s\t%4.2f' % (x[0], x[1], x[2])
+                max_score = abs(x[2])
+        f.write(simplify(s) + '\n')
+f.close()
+
+# for each TF, get its out-degree and in-degree in each tissue
+dd_all, dd_out, dd_in = get_degree(edge_file_lst, tf_dict)
+f = open('result.out.txt', 'w')
+head_lst = ['TF']
+for k in dd_out:
+ head_lst.append(k)
+f.write('%s\n' %('\t'.join(head_lst)))
+for tf in tf_dict:
+ s = tf
+ name = '.'
+ if tf in agi2name_dict and agi2name_dict[tf] != tf:
+ name = agi2name_dict[tf]
+ s += ' ' + name
+ for k in dd_out:
+ if tf in dd_out[k]:
+ s += '\t%d' % (dd_out[k][tf])
+ else:
+ s += '\t0'
+ f.write(s + '\n')
+f.close()
+
+f = open('result.in.txt', 'w')
+head_lst = ['TF']
+for k in dd_in:
+ head_lst.append(k)
+f.write('%s\n' %('\t'.join(head_lst)))
+for tf in tf_dict:
+ s = tf
+ name = '.'
+ if tf in agi2name_dict and agi2name_dict[tf] != tf:
+ name = agi2name_dict[tf]
+ s += ' ' + name
+ for k in dd_in:
+ if tf in dd_in[k]:
+ s += '\t%d' % (dd_in[k][tf])
+ else:
+ s += '\t0'
+ f.write(s + '\n')
+f.close()
+
+f = open('result.all.txt', 'w')
+head_lst = ['TF']
+for k in dd_all:
+ head_lst.append(k)
+f.write('%s\n' %('\t'.join(head_lst)))
+for tf in tf_dict:
+ s = tf
+ name = '.'
+ if tf in agi2name_dict and agi2name_dict[tf] != tf:
+ name = agi2name_dict[tf]
+ s += ' ' + name
+ for k in dd_all:
+ if tf in dd_all[k]:
+ s += '\t%d' % (dd_all[k][tf])
+ else:
+ s += '\t0'
+ f.write(s + '\n')
+f.close()
diff --git a/Code/text2json.py b/Code/text2json.py
new file mode 100644
index 0000000..dcfb699
--- /dev/null
+++ b/Code/text2json.py
@@ -0,0 +1,19 @@
+# Usage: python text2json.py AGI-to-gene-names_v2.txt > genes.json
+# Purpose: convert AGI-to-gene-names_v2.txt to genes.json for brain main page. Put genes.json under ../Webapp/static/json
+import sys
+
+f = open(sys.argv[1])
+
+count = 0
+s = '{'
+for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ s += '\"%s\":' % ('label'+str(count))
+ s += '\"%s\",' % (lst[0]+' '+lst[1])
+ count += 1
+
+f.close()
+s = s[:-1]
+s += '}'
+print(s)
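+
+# A safer variant (a sketch only, not used above) would let the json module
+# handle quoting and escaping instead of building the string by hand:
+#
+# import json, sys
+# d = {}
+# with open(sys.argv[1]) as f:
+#     for count, line in enumerate(f):
+#         lst = line.strip().split('\t')
+#         d['label' + str(count)] = lst[0] + ' ' + lst[1]
+# print(json.dumps(d))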
diff --git a/Code/update_network.py b/Code/update_network.py
new file mode 100755
index 0000000..e29eac1
--- /dev/null
+++ b/Code/update_network.py
@@ -0,0 +1,895 @@
+#! /usr/bin/python3
+# Usage: python3 update_network.py
+# Put this script under directory Code/.
+# IMPORTANT: Run this script under directory Code/.
+# Execute the above command regularly, or
+# Cron job this command to make it run everyday at 5am:
+#
+# 1. crontab -e.
+# 2. Add this line: 01 05 * * * cd /home/hui/network/v03/Code && python3 update_network.py
+#
+# IMPORTANT: Make sure to execute this script (update_network.py) under the directory Code.
+#
+# Purpose: periodically (e.g., per week) run this script to see if the network needs update. If yes, update it.
+#
+# Set HOLDON=NO in parameter_for_buildCmatrix.txt,
+# parameter_for_buildRmatrix.txt and parameter_for_net.txt to make
+# changes in these file effective.
+#
+# parameter_for_buildCmatrix.txt will be updated automatically (I
+# hope). However, we need to update the other parameter files
+# manually.
+#
+# Revision history:
+#
+# Last modified: 26 Feb 2017
+# Last modified: 17 Mar 2017
+# Last modified: 04 Apr 2017
+# Last modified: 05 Apr 2017
+# Last modified: 10 Apr 2017
+# Last modified: 19 Apr 2017
+# Last modified: 20 Apr 2017 [addded create_edges0B.py which calls correlation_per_tissue.R]
+# Last modified: 21 Jun 2017 [added correlation_per_group.R and wedge.R]
+# Last modified: 30 Jun 2017 [added get_sample_size so that we have sample size for correlations of type all, added in ll_dict ]
+# Last modified: 23 Jan 2018 [edited a few print-out messages]
+# Last modified: 25 Jan 2018 [updated function compute_metric(), set S=365.0 and modified return statement]
+# Last modified: 24 Aug 2018 [updated function from get_sample_size(d, sorted_keys, day) to get_sample_size(d, sorted_keys, day, rcond_string)]
+# Last modified: 03 Feb 2019
+# Last modified: 08 Aug 2019, hui
+# Last modified: 10 Aug 2019, hui
+# Last modified: 23 Aug 2019, hui [correlation_mixtools(num_component)]
+# Last modified: 10 Sep 2019, hui [correlation_mixtools, check the previous R session has finished before starting a new one.]
+
+import os, sys
+import numpy as np
+import glob
+import time
+import subprocess
+from datetime import datetime
+from param4net import make_global_param_dict, get_key_value
+from configure import HISTORY_DIR, HISTORY_DIR2, FILE_TIMESTAMP, SAMPLE_SIZE_FILE, TEMP_DIR, \
+ PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_BUILDRMATRIX, \
+ PARAMETER_FOR_NET, PARAMETER_FOR_NET_TRAVADB_STRESS, PARAMETER_FOR_NET_TRAVADB_MAP, PARAMETER_FOR_NET_MILD_DROUGHT, PARAMETER_FOR_NET_WIGGELAB_DIURNAL, \
+ BINDING_FILE, TPM_FILE, \
+ PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL, MIN_RNA_SEQ_INCREASE, UPDATE_NETWORK_LOG_FILE, NEW_OR_UPDATED_CHIP_FILE, \
+ RNA_SEQ_INFO_DATABASE, RNA_SEQ_INFO_DATABASE_JSON, GENE_ID_FIRST_TWO_LETTERS, MEMORY_STRENGTH, \
+ MAPPED_RDATA_DIR, MAPPED_CDATA_DIR, \
+ EDGE_POOL_DIR, MERGED_EDGE_FILE, \
+ TARGET_TF_FILE
+
+
+
+## Helper functions
+
+def get_value(s, delimit):
+ lst = s.split(delimit, 1) # only split at the first delimit
+ return lst[1].strip()
+
+
+def validate_webapp_dir(para_for_net):
+ ''' Make sure this function is executed under the directory Code. '''
+ glb_param_dict = make_global_param_dict(para_for_net)
+ # if genes.json is not present, create one
+ if not os.path.exists('../Webapp/static/json/genes.json'):
+ print('[update_network.py]: cannot find genes.json, make one ...')
+ cmd = 'python3 text2json.py %s > ../Webapp/static/json/genes.json' % (glb_param_dict['GENE_ID_AND_GENE_NAME'])
+ os.system(cmd)
+
+
+def make_paths(s):
+ if not os.path.isdir(s):
+ os.makedirs(s)
+
+
+def make_important_dirs():
+ make_paths('../Data/history/edges/many_targets')
+ make_paths('../Data/history/edges/one_target')
+ make_paths('../Data/log')
+ make_paths('../Data/information')
+ make_paths('../Data/temp')
+ make_paths('../Data/upload')
+ make_paths('../Data/parameter')
+ make_paths('../Data/R/Mapped')
+ make_paths('../Data/R/Mapped/public')
+ make_paths('../Data/R/Mapped/inhouse')
+ make_paths('../Data/R/Mapped/other')
+ make_paths('../Data/R/Raw')
+ make_paths('../Data/C/Mapped')
+ make_paths('../Data/C/Raw')
+ make_paths('../Data/history/edges')
+ make_paths('../Data/history/edge_pool')
+ make_paths('../Data/history/bind')
+ make_paths('../Data/history/expr')
+ make_paths('../Webapp/static/json')
+ make_paths('../Webapp/static/edges')
+ make_paths('../Webapp/templates')
+
+
+def num_line(fname):
+ ''' Return number of lines in file fname. '''
+ if not os.path.exists(fname):
+ return 0
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ return len(lines)
+
+
+def num_ids(fname):
+ ''' Return number of IDs in fname. '''
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ return len(lines[0].split('\t')) - 1
+
+
+def write_log_file(s, fname):
+ f = open(fname, 'a')
+ curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
+ s = '[' + curr_time + ']: ' + s
+ if not '\n' in s:
+ s += '\n'
+ f.write(s)
+ f.close()
+ print('Log: %s' % (s.strip()))
+
+
+def write_sample_size_file(sample_size_file, curr_date, tpm_sample_size):
+ if not os.path.exists(sample_size_file):
+ f = open(sample_size_file, 'w')
+ else:
+ f = open(sample_size_file, 'a')
+ f.write('%s\t%s\n' % (curr_date, tpm_sample_size))
+ f.close()
+
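+# Each line of SAMPLE_SIZE_FILE has the form 'YYYYMMDD<TAB>sample_size',
+# e.g., '20190801\t3130' (illustrative values; the date format comes from
+# the strftime('%Y%m%d') call near the end of this script).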
+
+def age_of_file_in_days(fname):
+ ''' Return age of fname in days. '''
+ st = os.stat(fname)
+ days = (time.time() - st.st_mtime)/(3600*24.0)
+ return days
+
+
+def age_of_file_in_seconds(fname):
+    ''' Return age of fname in seconds. '''
+ st = os.stat(fname)
+ seconds = time.time() - st.st_mtime
+ return seconds
+
+
+def hold_on(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines[:100]: # check the first 100 lines for HOLDON
+ line = line.strip()
+ if line.startswith('%%HOLDON=YES'):
+ return True
+ return False
+
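+# hold_on() scans the first 100 lines of a parameter file for a line
+# starting with '%%HOLDON=YES'; set '%%HOLDON=NO' (or remove the line)
+# to let updates to that parameter file take effect.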
+
+def all_files_present(lst):
+ missing_file_lst = []
+ for path in lst: # lst is a list of file names to check
+ if not os.path.exists(path):
+ if 'edges.txt' in path:
+ write_log_file('[update_network.py] WARNING: must have %s to update network. Call create_edges*.py to create edge files.' % (path), UPDATE_NETWORK_LOG_FILE)
+ missing_file_lst.append(path)
+ return missing_file_lst
+
+
+def record_file_time(lst, fname):
+ '''
+ lst - a list of files
+ fname - a recorder file
+ '''
+ f = open(fname, 'w')
+ s = ''
+ for x in lst:
+ if os.path.exists(x):
+ s += '%s\t%d\n' % (os.path.basename(x), int(os.stat(x).st_mtime))
+ else:
+ s += '%s\t%d\n' % (os.path.basename(x), 0)
+ f.write(s)
+ f.close()
+
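+# Each line of the timestamp file has the form 'basename<TAB>mtime',
+# e.g., 'binding.txt\t1567000000' (illustrative mtime value).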
+
+def read_file_timestamp(ftimestamp):
+ d = {}
+ f = open(ftimestamp)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ fname = lst[0]
+ t = lst[1]
+ d[fname] = int(t)
+
+ f.close()
+ return d
+
+
+def file_updated(fname, d):
+ ft = int(os.stat(fname).st_mtime)
+ k = os.path.basename(fname)
+ return ft > d[k]
+
+
+def get_updated_files(lst, d):
+ result = []
+ for x in lst:
+ if file_updated(x, d):
+ result.append(os.path.basename(x))
+ return result
+
+
+def get_sample_size(d, sorted_keys, day, rcond_string):
+
+ if rcond_string.isdigit():
+ return int(rcond_string)
+
+ if len(d) == 0:
+        return 1200 # default sample size; CHANGE this value if needed
+
+ for x in sorted_keys:
+ if x >= day:
+ return d[x]
+
+ k = sorted_keys[-1] # last key, latest date
+ return d[k]
+
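+# get_sample_size() usage note: if rcond_string is itself a number it is
+# returned directly; otherwise d is assumed to map date strings (as
+# written to SAMPLE_SIZE_FILE, e.g., '20190801') to TPM sample sizes,
+# with sorted_keys listing those dates in ascending order, and the size
+# recorded on the first date on or after the given day is returned.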
+
+def number_rnaseq_id(tpm_file):
+ f = open(tpm_file)
+ first_line = f.readlines()[0]
+ f.close()
+ first_line = first_line.strip()
+ return len(first_line.split()) - 1
+
+
+def number_rnaseq_diff(para_file, tpm_file):
+    ''' Count the number of lines starting with '@' in para_file and the number of ID columns in tpm_file, and return their difference. '''
+ a = 0
+ f = open(para_file)
+ for line in f:
+ line = line.strip()
+ if line.startswith('@'):
+ a += 1
+ f.close()
+
+ b = number_rnaseq_id(tpm_file)
+
+ return a - b
+
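+# Note: each RNA-seq sample in the parameter file is assumed to be
+# introduced by a line starting with '@' (e.g., '@SRR1234567', a
+# hypothetical ID), so counting '@' lines gives the number of samples
+# requested.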
+
+def validate_gene_file(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines: # check all lines
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) < 6:
+            print('[update_network.py]:Not enough fields: %s. Only %d are given. Each line must have gene_id, gene_name, chr, start, end, strand, description (optional). See prepare_gene_file.py in the documentation on how to prepare this file.' % (line, len(lst)))
+ sys.exit()
+
+
+def validate_parameter_for_buildcmatrix(fname):
+ # first the file must exist
+ if not os.path.exists(fname):
+ print('[update_network.py]:CANNOT FIND %s.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ location_count = 0
+ for line in lines:
+ line = line.strip()
+ if line.startswith('%%'):
+ k, v = get_key_value(line[2:])
+ d[k] = v
+            if k == 'GENE_FILE' or k == 'CHR_INFO':
+                if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+                    sys.exit()
+            if k == 'GENE_FILE':
+                validate_gene_file(v)
+            if k == 'DESTINATION':
+                if not os.path.isdir(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+                    sys.exit()
+            if k == 'TARGET_RANGE':
+                if int(v) <= 0:
+                    print('[update_network.py]:Target range (%s) must be greater than 0.' % (v))
+                    sys.exit()
+        if line.startswith('LOCATION:'):
+            v = get_value(line, ':')
+            location_count += 1
+            if not os.path.exists(v):
+                print('[Warning] update_network.py: Location %s does not exist.' % (v))
+ #sys.exit()
+
+ if not 'GENE_FILE' in d:
+ print('[update_network.py]:Must specify GENE_FILE.')
+ sys.exit()
+ if not 'DESTINATION' in d:
+ print('[update_network.py]:Must specify DESTINATION.')
+ sys.exit()
+ if not 'CHR_INFO' in d:
+ print('[update_network.py]:Must specify CHR_INFO.')
+ sys.exit()
+ if location_count == 0:
+ print('[update_network.py]:Must contain at least one ChIP-seq.')
+ sys.exit()
+
+
+def validate_parameter_for_buildrmatrix(fname):
+ # first the file must exist
+ if not os.path.exists(fname):
+ print('[update_network.py]:CANNOT FIND %s.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ location_count = 0
+ for line in lines:
+ line = line.strip()
+ if line.startswith('%%'):
+ k, v = get_key_value(line[2:])
+ d[k] = v
+            if k == 'GENE_LIST':
+                if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+                    sys.exit()
+        if line.startswith('LOCATION:'):
+            v = get_value(line, ':')
+            location_count += 1
+            if not os.path.exists(v):
+                print('[update_network.py]:Location %s does not exist.' % (v))
+ #sys.exit()
+
+ if not 'GENE_LIST' in d:
+ print('[update_network.py]:Must specify GENE_LIST.')
+ sys.exit()
+ if location_count == 0:
+ print('[update_network.py]:Must contain at least one RNA-seq.')
+ sys.exit()
+
+
+def validate_parameter_for_net(fname):
+ # first the file must exist
+ if not os.path.exists(fname):
+ print('[update_network.py]:CANNOT FIND %s.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ location_count = 0
+ for line in lines:
+ line = line.strip()
+ if line.startswith('%%'):
+ k, v = get_key_value(line[2:])
+ d[k] = v
+            if k == 'GENE_LIST':
+                if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+                    sys.exit()
+            if k == 'GENE_ID_AND_GENE_NAME':
+                if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+                    sys.exit()
+            if k == 'BINDING_INFO':
+                if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+                    sys.exit()
+            if k == 'EXPRESSION_INFO':
+                if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+                    sys.exit()
+            if k == 'BINDING_MATRIX':
+                if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+                    print('[update_network.py]:Use python3 buildCmatrix.py parameter_for_buildCmatrix.txt > binding.txt to create binding.txt.')
+            if k == 'EXPRESSION_MATRIX':
+                if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+                    print('[update_network.py]:Use python3 buildRmatrix.py parameter_for_buildRmatrix.txt to create TPM.txt.')
+
+    if not 'GENE_LIST' in d:
+        print('[update_network.py]:Must specify GENE_LIST.')
+        sys.exit()
+    if not 'GENE_ID_AND_GENE_NAME' in d:
+        print('[update_network.py]:Must specify GENE_ID_AND_GENE_NAME.')
+        sys.exit()
+    if not 'BINDING_INFO' in d:
+        print('[update_network.py]:Must specify BINDING_INFO.')
+        sys.exit()
+    if not 'EXPRESSION_INFO' in d:
+        print('[update_network.py]:Must specify EXPRESSION_INFO.')
+        sys.exit()
+    if not 'BINDING_MATRIX' in d:
+        print('[update_network.py]:Must specify BINDING_MATRIX.')
+        print('[update_network.py]:Use python3 buildCmatrix.py parameter_for_buildCmatrix.txt > binding.txt to create binding.txt.')
+    if not 'EXPRESSION_MATRIX' in d:
+        print('[update_network.py]:Must specify EXPRESSION_MATRIX.')
+        print('[update_network.py]:Use python3 buildRmatrix.py parameter_for_buildRmatrix.txt to create TPM.txt.')
+
+
+
+def need_update_parameter_file(param_file, dirs):
+ ''' Make sure param_file is consistent with dirs (a list of directories to check against). '''
+ result = []
+
+ files_in_parameter = {}
+ f = open(param_file)
+ for line in f:
+ line = line.strip()
+ if line.startswith('LOCATION:'):
+ lst = line.split(':')
+ k = os.path.abspath(lst[1])
+ files_in_parameter[k] = 1
+ f.close()
+ param_modification_time = os.path.getmtime(param_file)
+
+ files_in_dirs = {}
+ for directory in dirs:
+ for root, dirnames, filenames in os.walk(os.path.abspath(directory)):
+ for filename in filenames:
+ k = os.path.join(root, filename)
+ files_in_dirs[k] = 1
+ if 'narrowPeak' in k or '_quant' in k:
+ if not k in files_in_parameter and os.path.getmtime(k) > param_modification_time:
+ result.append('%s is not in %s' % (k, param_file))
+
+ return result
+
+
+
+def validate_binding_file(fname):
+    f = open(fname)
+    lines = f.readlines()
+    f.close()
+    for line in lines:
+        line = line.strip()
+        if 'buildCmatrix: ChIP-seq ID list is empty.' in line:
+            return False
+    return True
+
+
+def lines_with_10_fields(s):
+ result = []
+ for line in s.split('\n'):
+ line = line.strip()
+ if len(line.split('\t')) == 10:
+ result.append(line)
+ return result
+
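+# A valid edge line has exactly 10 tab-separated fields; judging from
+# the sprintf call in wedge.R these are: target, TF, score, method
+# ('mix'), sample size, condition IDs, log-likelihood, date, a second
+# score, and the edge type (e.g., 'wedge'). Lines with any other field
+# count are treated as partial writes and dropped.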
+
+def concatenate_edge_files(fname_lst, fname_out):
+ fout = open(fname_out, 'w')
+ for fname in fname_lst:
+ f = open(fname)
+ s = f.read()
+ f.close()
+ # Make sure each edge has 10 fields before writing.
+ lines = lines_with_10_fields(s)
+ if lines != []:
+ write_log_file('[update_network.py] In function concatenate_edge_files. File %s has %d rows with 10 columns.' % (fname, len(lines)), UPDATE_NETWORK_LOG_FILE)
+ fout.write('\n'.join(lines) + '\n')
+ else:
+ write_log_file('[update_network.py] In function concatenate_edge_files. Check file %s. It has no rows with 10 fields.' % (fname), UPDATE_NETWORK_LOG_FILE)
+ fout.close()
+
+
+def delete_edge_files(fname_lst):
+ for fname in fname_lst:
+ # Before we delete, we should make sure it is not being written. Make sure it is old enough. Otherwise, don't delete.
+        if age_of_file_in_seconds(fname) > 12*60*60: # 12 hours
+ os.remove(fname)
+ else:
+ write_log_file('[update_network.py] In function delete_edge_files. Check file %s. It is probably still being written. So I don\'t delete it.' % (fname), UPDATE_NETWORK_LOG_FILE)
+
+
+def create_edges0():
+ if os.path.exists(PARAMETER_FOR_NET):
+ write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET)
+ os.system(cmd)
+
+    # The following commands are optional. For example, a user running locally does not have to provide these TPM tables.
+ if os.path.exists(PARAMETER_FOR_NET_TRAVADB_STRESS):
+ write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_TRAVADB_STRESS), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_TRAVADB_STRESS)
+ #os.system(cmd)
+
+ if os.path.exists(PARAMETER_FOR_NET_TRAVADB_MAP):
+ write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_TRAVADB_MAP), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_TRAVADB_MAP)
+ #os.system(cmd)
+
+ if os.path.exists(PARAMETER_FOR_NET_MILD_DROUGHT):
+ write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_MILD_DROUGHT), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_MILD_DROUGHT)
+ #os.system(cmd)
+
+ if os.path.exists(PARAMETER_FOR_NET_WIGGELAB_DIURNAL):
+ write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_WIGGELAB_DIURNAL), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_WIGGELAB_DIURNAL)
+ #os.system(cmd)
+
+
+def create_edges0B():
+ if os.path.exists(PARAMETER_FOR_NET):
+ write_log_file('[update_network.py] Create tissue-specific edges.txt using new binding.txt (size=%d). create_edges0B.py' % (num_ids(BINDING_FILE)), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 create_edges0B.py %s' % (PARAMETER_FOR_NET) # call correlation_per_tissue.R
+ os.system(cmd)
+
+
+def wedge():
+ if os.path.exists(PARAMETER_FOR_NET):
+ write_log_file('[update_network.py] Create edges using wedge shapes. wedge.R', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'Rscript wedge.R'
+ os.system(cmd)
+
+
+def correlation_per_group():
+    # For 3,130 RNA-seq samples and 30,000 pairs, this needs at least 10 hours.
+ if os.path.exists(PARAMETER_FOR_NET):
+ write_log_file('[update_network.py] Create group-specific edges.txt using new TPM.txt (size=%d). correlation_per_group.R' % (number_rnaseq_id(TPM_FILE)), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'Rscript correlation_per_group.R'
+ os.system(cmd)
+
+
+def correlation_per_group_fixed_number():
+ if os.path.exists(PARAMETER_FOR_NET):
+ write_log_file('[update_network.py] Create group-specific (fixed) edges.txt using new TPM.txt (size=%d). correlation_per_group_fixed_number.R' % (number_rnaseq_id(TPM_FILE)), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'Rscript correlation_per_group_fixed_number.R'
+ os.system(cmd)
+
+
+def correlation_mixtools(num_component):
+    if os.system('pidof R') != 0: # create_edges_mixtool.R can take several days to run, so make sure the previous R session has finished before starting a new one. os.system('pidof R') returns 0 when an R process is running.
+ write_log_file('[update_network.py] Create edges.txt using TPM.txt (size=%d). create_edges_mixtool.R with %d components.' % (number_rnaseq_id(TPM_FILE), num_component), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'Rscript create_edges_mixtool.R %d' % (num_component)
+ os.system(cmd)
+
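+# Note: os.system returns the shell's exit status, and pidof exits
+# non-zero when no matching process exists, so a non-zero value above
+# is taken to mean that no R session is currently running.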
+
+def check_rnaseq_info():
+    # Check rnaseq_info_database.txt and rnaseq_info_database.json; if they are outdated, write a reminder to the log file.
+    if os.path.exists(RNA_SEQ_INFO_DATABASE):
+        if age_of_file_in_days(RNA_SEQ_INFO_DATABASE) > 90: # older than 90 days
+ write_log_file('[update_network.py] Need update %s. It is %d days old.' % (RNA_SEQ_INFO_DATABASE, age_of_file_in_days(RNA_SEQ_INFO_DATABASE)), UPDATE_NETWORK_LOG_FILE)
+ else:
+ write_log_file('[update_network.py] [MISSING] Must create %s.' % (RNA_SEQ_INFO_DATABASE), UPDATE_NETWORK_LOG_FILE)
+
+ if os.path.exists(RNA_SEQ_INFO_DATABASE_JSON):
+ if age_of_file_in_days(RNA_SEQ_INFO_DATABASE_JSON) > 90:
+ write_log_file('[update_network.py] Need update %s. It is %d days old.' % (RNA_SEQ_INFO_DATABASE_JSON, age_of_file_in_days(RNA_SEQ_INFO_DATABASE_JSON)), UPDATE_NETWORK_LOG_FILE)
+ else:
+ write_log_file('[update_network.py] [MISSING] Must create %s.' % (RNA_SEQ_INFO_DATABASE_JSON), UPDATE_NETWORK_LOG_FILE)
+
+
+# def check_process(name):
+# ''' If a process name exists, return 1; otherwise return 0.'''
+# os.system('ps -eF | grep \'%s\' > ../Data/running_processes.txt' % (name))
+# f = open('../Data/running_processes.txt')
+# lines = f.readlines()
+# f.close()
+# for line in lines:
+# line = line.strip()
+# lst = line.split()
+# if 'python' in lst[-2] and name in lst[-1]:
+# return 1
+# return 0
+
+
+
+## main
+
+# if check_process('update_network.py') == 1: # the old update_network.py is running
+# write_log_file('[update_network.py] update_network.py has not finished yet.', UPDATE_NETWORK_LOG_FILE)
+# sys.exit()
+
+
+FILE_LIST_TO_CHECK = [PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_BUILDRMATRIX, PARAMETER_FOR_NET, \
+ MERGED_EDGE_FILE, BINDING_FILE, TPM_FILE] # a list of very important files
+
+make_important_dirs() # make important directories (if non-existent) for holding various kinds of files; paths are relative, so this script must be run from the directory Code
+#validate_webapp_dir(PARAMETER_FOR_NET) # make sure the directory Webapp contains necessary files, e.g., genes.json.
+
+check_rnaseq_info() # RNA-seq information is useful for displaying scatterplots
+
+# Make sure all necessary files are present, if not, make them if possible
+miss_lst = all_files_present(FILE_LIST_TO_CHECK) # check if any of them are missing
+if miss_lst != []: # miss_lst is non-empty in the beginning.
+ print('These mandatory files are missing: %s.\nPrepare them first.' % (' '.join(miss_lst)))
+ write_log_file('[update_network.py] Cannot find these required files:%s' % (' '.join(miss_lst)), UPDATE_NETWORK_LOG_FILE)
+
+ # initially, we (at most) only have three parameter files, no binding.txt, TPM.txt or edges.txt ...
+ important_miss_number = 0
+ if PARAMETER_FOR_BUILDCMATRIX in miss_lst:
+ print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_BUILDCMATRIX))
+ important_miss_number += 1
+
+ if PARAMETER_FOR_BUILDRMATRIX in miss_lst:
+ print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_BUILDRMATRIX))
+ important_miss_number += 1
+
+ if PARAMETER_FOR_NET in miss_lst:
+ print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_NET))
+ important_miss_number += 1
+
+ if important_miss_number > 0:
+ sys.exit() # need to provide all the above three files; otherwise cannot proceed
+
+ if BINDING_FILE in miss_lst:
+ print('[update_network.py]: make initial binding.txt ... wait')
+ write_log_file('[update_network.py] Make initial binding.txt', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 get_binding.py %s' % (PARAMETER_FOR_BUILDCMATRIX)
+ #os.system(cmd)
+ cmd = 'python3 buildCmatrix.py %s > %s' % (PARAMETER_FOR_BUILDCMATRIX, BINDING_FILE)
+ #os.system(cmd)
+        print('[update_network.py]: IMPORTANT: make sure BINDING_MATRIX in %s is set to %s and rerun update_network.py.' % (PARAMETER_FOR_NET, BINDING_FILE))
+ sys.exit()
+
+ if TPM_FILE in miss_lst:
+ print('[update_network.py]: make initial TPM.txt ... wait')
+ write_log_file('[update_network.py] Make initial TPM.txt', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 buildRmatrix.py %s' % (PARAMETER_FOR_BUILDRMATRIX) # produce TPM.txt
+ #os.system(cmd)
+        print('[update_network.py]:IMPORTANT: make sure EXPRESSION_MATRIX in %s is set to %s and rerun update_network.py.' % (PARAMETER_FOR_NET, TPM_FILE))
+ sys.exit()
+
+ miss_lst2 = all_files_present(FILE_LIST_TO_CHECK) # check files again
+ if len(miss_lst2) == 1 and miss_lst2[0] == MERGED_EDGE_FILE: # all other files are ready except edges.txt, make one.
+ print('[update_network.py]: make initial edges.txt ... wait')
+        create_edges0()
+
+
+# Make json2 (sliced binding.txt) if it does not exist. Copy json2 to
+# the web application folder static/edges [manual] for displaying
+# binding strength plots.
+if not os.path.isdir('../Data/history/bind/json2') and os.path.exists(BINDING_FILE):
+ write_log_file('Make directory ../Data/history/bind/json2. Don\'t forget to copy json2 to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET)
+ os.system(cmd)
+
+
+# Make json (sliced TPM.txt) if it does not exist. Copy json to the
+# web application folder static/edges [manual] for displaying gene
+# expression scatterplots.
+if not os.path.isdir('../Data/history/expr/json') and os.path.exists(TPM_FILE):
+ write_log_file('Make directory ../Data/history/expr/json. Don\'t forget to copy json to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 slice_TPM_to_JSON.py %s' % (PARAMETER_FOR_NET)
+ os.system(cmd)
+
+
+# Make sure parameter files are present and valid (rudimentary check but important)
+validate_parameter_for_buildcmatrix(PARAMETER_FOR_BUILDCMATRIX)
+validate_parameter_for_buildrmatrix(PARAMETER_FOR_BUILDRMATRIX)
+validate_parameter_for_net(PARAMETER_FOR_NET)
+
+
+# If the file timestamp does not exist, create one
+if not os.path.exists(FILE_TIMESTAMP):
+ record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)
+
+# get update time of mandatory files
+timestamp_dict = read_file_timestamp(FILE_TIMESTAMP)
+
+
+
+################## binding.txt stuff #####################################
+# Check parameter_for_buildCmatrix.txt
+updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
+if 'parameter_for_buildCmatrix.txt' in updated_file_list and not hold_on(PARAMETER_FOR_BUILDCMATRIX):
+ write_log_file('[update_network.py] Parameter file %s has been updated.' % (PARAMETER_FOR_BUILDCMATRIX), UPDATE_NETWORK_LOG_FILE)
+ write_log_file('[update_network.py] Make binding column files', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 get_binding.py %s' % (PARAMETER_FOR_BUILDCMATRIX) # won't re-compute existing binding columns unless updated
+ os.system(cmd)
+
+
+ # # We will only consider ChIP-seq IDs that are less than 7 days
+ # # old. Make sure put 'update:yymmdd' in the 'NOTE:' field in
+ # # parameter_for_buildCmatrix.txt for each newly added ChIP-seq
+ # # data.
+ # write_log_file('[update_network.py] Build binding matrix from recently added/modified ChIP-seq data.', UPDATE_NETWORK_LOG_FILE)
+ # TEMP_BINDING_FILE = BINDING_FILE + '.temp'
+ # cmd = 'python3 buildCmatrix.py %s > %s' % (PARAMETER_FOR_BUILDCMATRIX, TEMP_BINDING_FILE)
+ # os.system(cmd)
+
+ # # If someone just touched prameter_for_buildCmatrix.txt without
+ # # adding any new ChIP-seq data, we should do nothing.
+ # if validate_binding_file(TEMP_BINDING_FILE):
+ # write_log_file('[update_network.py] Overwrite binding.txt.', UPDATE_NETWORK_LOG_FILE)
+ # cm = 'mv %s %s' (TEMP_BINDING_FILE, BINDING_FILE) # Overwrite binding.txt. Make it formal.
+ # os.system(cmd)
+ # write_log_file('[update_network.py] binding.txt is updated. Number of columns in %s = %d.' % (BINDING_FILE, num_ids(BINDING_FILE)), UPDATE_NETWORK_LOG_FILE)
+
+ # write_log_file('[update_network.py] Update target tf file %s.' % (TARGET_TF_FILE), UPDATE_NETWORK_LOG_FILE)
+ # cmd = 'python3 make_target_tf.py %s > %s' % (PARAMETER_FOR_NET, TARGET_TF_FILE)
+ # os.system(cmd)
+ # else:
+ # write_log_file('[update_network.py] [WARNING] Invalid binding matrix.', UPDATE_NETWORK_LOG_FILE)
+ # os.remove(TEMP_BINDING_FILE)
+
+
+updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
+if 'binding.txt' in updated_file_list:
+ write_log_file('[update_network.py] binding.txt has been updated. This update will take effect next time TPM.txt is updated.', UPDATE_NETWORK_LOG_FILE)
+ # create_edges0()
+ # create_edges0B()
+ # wedge()
+ # correlation_per_group()
+ # correlation_per_group_fixed_number()
+ # correlation_mixtools(2)
+ # correlation_mixtools(3)
+
+ ## TODO mixtool stuff, forget it for now.
+ #cmd = 'nohup python3 create_edges4.py %s &' % (temp_file_name)
+ #os.system(cmd)
+
+
+
+
+################## TPM.txt stuff #####################################
+
+# update parameter_for_buildRmatrix.txt periodically and automatically.
+if datetime.now().day % PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL == 0: # update parameter_for_buildRmatrix.txt on days of the month divisible by PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL
+ curr_time = datetime.now().strftime('%Y%m%d%H%M')
+ new_parameter_file = '../Data/temp/parameter_for_buildRmatrix.%s' % (curr_time)
+ cmd = 'python3 make_parameter_rnaseq.py > %s' % (new_parameter_file) # new_parameter_file will not be updated unless download_and_map.py has finished.
+ os.system(cmd)
+ num = number_rnaseq_diff(new_parameter_file, TPM_FILE)
+ if num >= MIN_RNA_SEQ_INCREASE: # sufficient number of RNA-seq samples have been added
+ write_log_file('[update_network.py] Update %s' % (PARAMETER_FOR_BUILDRMATRIX), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'cp %s %s' % (new_parameter_file, PARAMETER_FOR_BUILDRMATRIX)
+ os.system(cmd)
+
+ # Before we rewrite TPM.txt, we should backup the old TPM.txt
+ # write_log_file('[update_network.py] Backup %s' % (TPM_FILE), UPDATE_NETWORK_LOG_FILE)
+ # cmd = 'cp %s %s' % (TPM_FILE, TPM_FILE + '.backup.at.' + curr_time)
+ # os.system(cmd)
+
+ # write_log_file('[update_network.py] Rebuild %s' % (TPM_FILE), UPDATE_NETWORK_LOG_FILE)
+ # cmd = 'python3 buildRmatrix.py ../Data/parameter/parameter_for_buildRmatrix.txt'
+ # os.system(cmd)
+
+ else:
+        write_log_file('[update_network.py] %d new RNA-seq samples have been downloaded since the last build of TPM.txt. TPM.txt will be rebuilt once this number reaches %d.' % (num, MIN_RNA_SEQ_INCREASE), UPDATE_NETWORK_LOG_FILE)
+
+
+# Check if parameter_for_buildRmatrix.txt has been updated
+updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
+# TODO To simplify things, I will provide TPM.txt directly. So set the
+# HOLDON option to YES in parameter_for_buildRmatrix.txt to prevent
+# the following from being True.
+if 'parameter_for_buildRmatrix.txt' in updated_file_list and not hold_on(PARAMETER_FOR_BUILDRMATRIX):
+ write_log_file('[update_network.py] Parameter file %s has been updated.' % (PARAMETER_FOR_BUILDRMATRIX), UPDATE_NETWORK_LOG_FILE)
+ write_log_file('[update_network.py] Rebuild TPM.txt ...', UPDATE_NETWORK_LOG_FILE)
+ curr_time = datetime.now().strftime('%Y%m%d%H%M%S')
+ if os.path.exists(TPM_FILE):
+ backup_file_name = '../Data/history/expr/TPM.txt.backup.at.%s' % (curr_time)
+ cmd = 'cp %s %s' % (TPM_FILE, backup_file_name)
+ os.system(cmd)
+ cmd = 'gzip %s' % (backup_file_name)
+ os.system(cmd)
+
+    cmd = 'python3 buildRmatrix.py %s' % (PARAMETER_FOR_BUILDRMATRIX) # produce TPM.txt, the location of which is specified in TPM_TABLE in buildRmatrix.py
+ os.system(cmd)
+
+ curr_date = datetime.now().strftime('%Y%m%d')
+ tpm_sample_size = number_rnaseq_id(TPM_FILE)
+ write_sample_size_file(SAMPLE_SIZE_FILE, curr_date, tpm_sample_size)
+
+
+
+# Create edges using all RNA-seq experiments
+updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
+if 'TPM.txt' in updated_file_list: # we could touch TPM.txt to make it recent. We will recompute edges using the full binding.txt.
+ # Make a full binding.txt since we are going to use the new TPM.txt to recompute all edges
+ write_log_file('[update_network.py] Build full binding matrix for the new TPM.txt.', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 buildCmatrix.py %s include-all > %s' % (PARAMETER_FOR_BUILDCMATRIX, BINDING_FILE) # include all ChIP-seq IDs. Pay attention to include-all in the command-line argument.
+ os.system(cmd)
+
+ # target_tf.txt
+ write_log_file('[update_network.py] Make target_tf.txt.', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 make_target_tf.py %s > %s' % (PARAMETER_FOR_NET, TARGET_TF_FILE)
+ os.system(cmd)
+
+ write_log_file('[update_network.py] Update ../Data/history/expr/json using the new TPM.txt. Don\'t forget to update the static/edges/json folder in the web application.', UPDATE_NETWORK_LOG_FILE)
+ ## json -- make/renew json directory for displaying scatterplots
+ cmd = 'python3 slice_TPM_to_JSON.py %s' % (PARAMETER_FOR_NET)
+ ## os.system(cmd) # turn this on if we are going to use this TPM.txt for displaying scatterplots
+ write_log_file('[update_network.py] Update directory ../Data/history/bind/json2. Don\'t forget to copy json2 to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET)
+    #os.system(cmd) # turn this on if we are going to use this binding.txt for displaying bar charts of binding strengths
+ ## copy ../Data/history/bind/json2 and ../Data/history/expr/json to the web application folder 'static/edges' [manual]
+
+ if False: # TODO For now I will always use travadb's TPM.txt (138 columns) to display scatterplots. Simpler and faster.
+ write_log_file('Assign tissue, refine tissue and update rnaseq_info_database.json', UPDATE_NETWORK_LOG_FILE)
+ os.environ["PYTHONIOENCODING"] = "UTF-8" # for non-ascii letters in ENA RNA-sample description. If this statement does not work, try 'export PYTHONIOENCODING=UTF-8' in the command line instead. The export command can be put in crontab -e before running this script
+ cmd = 'python3 assign_tissue.py > ../Data/temp/experiment.and.tissue.1.txt'
+ os.system(cmd)
+ cmd = 'python3 refine_tissue.py > ../Data/information/experiment.and.tissue.2.txt'
+ os.system(cmd)
+ cmd = 'python3 update_rnaseq_info_json.py'
+ os.system(cmd)
+
+
+
+ # Compute edges. This could take a lot of time so update FILE_TIMESTAMP first.
+ record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)
+ create_edges0()
+ create_edges0B()
+ wedge()
+ correlation_per_group()
+ correlation_per_group_fixed_number()
+ #correlation_mixtools(2)
+ #correlation_mixtools(3)
+
+
+########## Merge edges #######################
+# update edges.txt, a merged file from two sources, HISTORY_DIR and HISTORY_DIR2. Some new edge files are being generated ...
+time.sleep(5)
+edge_file_lst = [] # collect edge files.
+most_recent_edge_modification_time = 0
+write_log_file('[update_network.py] Look at edge files in %s.' % (HISTORY_DIR), UPDATE_NETWORK_LOG_FILE)
+for fname in glob.glob(os.path.join(HISTORY_DIR, 'edges.txt.*')): # many small edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+ if os.path.getmtime(fname) > most_recent_edge_modification_time:
+ most_recent_edge_modification_time = os.path.getmtime(fname)
+
+write_log_file('[update_network.py] Look at edge files in %s.' % (HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
+for fname in glob.glob(os.path.join(HISTORY_DIR2, 'edges.txt.*')): # edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+ if os.path.getmtime(fname) > most_recent_edge_modification_time:
+ most_recent_edge_modification_time = os.path.getmtime(fname)
+
+
+if edge_file_lst == []:
+ write_log_file('[update_network.py] No edge files to merge in %s and %s.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
+elif os.path.getmtime(MERGED_EDGE_FILE) < most_recent_edge_modification_time: # update edges.txt only if there are newer edges to add.
+ # concatenate edge files into one
+ write_log_file('[update_network.py] Concatenate edge files in %s and %s into one file.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
+ curr_time = datetime.now().strftime('%Y%m%d_%H%M')
+ concatenate_edge_files(edge_file_lst, os.path.join(EDGE_POOL_DIR, 'edges.txt.many.one.targets.' + curr_time))
+ delete_edge_files(edge_file_lst)
+
+if os.path.getmtime(MERGED_EDGE_FILE) < os.path.getmtime(EDGE_POOL_DIR): # edge pool directory has been updated, create new edges.txt
+ write_log_file('[update_network.py] Make a new edges.txt from edge files in %s.' % (EDGE_POOL_DIR), UPDATE_NETWORK_LOG_FILE)
+ write_log_file('[update_network.py] Number of lines in the old edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 merge_edges.py'
+ os.system(cmd)
+ write_log_file('[update_network.py] Number of lines in the new edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
+    manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt ; sudo find /home/lanhui/brain/Data/temp/html_edges -name "*.html" -exec mv -t /var/www/brain/brain/static/edges {} +'
+ write_log_file('[update_network.py] %s' % (manual_copy_commands), UPDATE_NETWORK_LOG_FILE)
+
+
+# exclude edges as suggested by Phil Wigge.
+# write_log_file('Exclude edges (now ineffective)', UPDATE_NETWORK_LOG_FILE)
+# cmd = 'python3 exclude_edges.py %s' % (EDGE_FILE)
+#os.system(cmd)
+
+
+# # check if parameter_for_net.txt, or TPM.txt is updated, if yes, create edges.
+# updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
+# if ('parameter_for_net.txt' in updated_file_list or 'TPM.txt' in updated_file_list) and not hold_on(PARAMETER_FOR_NET):
+# write_log_file('Create edges.txt using new TPM.txt (size=%d) ...' % (number_rnaseq_id(TPM_FILE)), UPDATE_NETWORK_LOG_FILE)
+# time.sleep(7200) # wait one hour for the previous create_edges4.py (if any) to finish creating JSON_DIR and target_tf_fname
+# cmd = 'nohup python3 create_edges4.py %s &' % (PARAMETER_FOR_NET) # put process to background
+# os.system(cmd)
+# time.sleep(60)
+
+
+# remove .R files in ../Data/temp. Files older than 3 days will be removed
+cmd = 'find %s -mtime +2 -name \"*.R\" -delete' % (TEMP_DIR)
+os.system(cmd)
+
+# update time stamp file
+record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)
+
+write_log_file('[update_network.py] Update done at %s.\n\n' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S')), UPDATE_NETWORK_LOG_FILE)
+
diff --git a/Code/update_network_by_force.py b/Code/update_network_by_force.py
new file mode 100644
index 0000000..7ba8c87
--- /dev/null
+++ b/Code/update_network_by_force.py
@@ -0,0 +1,113 @@
+# Usage: python3 update_network_by_force.py
+# Purpose: update_network.py could take a few days to run. Run this script to harvest new edges every day.
+#
+# Revision history:
+# Last modified: 24 Nov 2019, hui
+
+import os, sys
+import glob
+import time
+from datetime import datetime
+from configure import HISTORY_DIR, HISTORY_DIR2, UPDATE_NETWORK_LOG_FILE, MERGED_EDGE_FILE, EDGE_POOL_DIR
+
+########## Helper functions #######################
+def write_log_file(s, fname):
+ f = open(fname, 'a')
+ curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
+ s = '[' + curr_time + ']: ' + s
+ if not '\n' in s:
+ s += '\n'
+ f.write(s)
+ f.close()
+ print('Log: %s' % (s.strip()))
+
+
+def num_line(fname):
+ ''' Return number of lines in file fname. '''
+ if not os.path.exists(fname):
+ return 0
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ return len(lines)
+
+
+def lines_with_10_fields(s):
+ result = []
+ for line in s.split('\n'):
+ line = line.strip()
+ if len(line.split('\t')) == 10:
+ result.append(line)
+ return result
+
+
+def age_of_file_in_seconds(fname):
+    ''' Return age of fname in seconds. '''
+ st = os.stat(fname)
+ seconds = time.time() - st.st_mtime
+ return seconds
+
+def concatenate_edge_files(fname_lst, fname_out):
+ fout = open(fname_out, 'w')
+ for fname in fname_lst:
+ f = open(fname)
+ s = f.read()
+ f.close()
+ # Make sure each edge has 10 fields before writing.
+ lines = lines_with_10_fields(s)
+ if lines != []:
+ write_log_file('[update_network_by_force.py] In function concatenate_edge_files. File %s has %d rows with 10 columns.' % (fname, len(lines)), UPDATE_NETWORK_LOG_FILE)
+ fout.write('\n'.join(lines) + '\n')
+ else:
+ write_log_file('[update_network_by_force.py] In function concatenate_edge_files. Check file %s. It has no rows with 10 fields.' % (fname), UPDATE_NETWORK_LOG_FILE)
+ fout.close()
+
+
+def delete_edge_files(fname_lst):
+ age_in_hours = 6
+ for fname in fname_lst:
+ # Before we delete, we should make sure it is not being written. Make sure it is old enough. Otherwise, don't delete.
+ if age_of_file_in_seconds(fname) > age_in_hours*60*60: # 6 hours
+ os.remove(fname)
+ else:
+ write_log_file('[update_network_by_force.py] In function delete_edge_files. Check file %s. It is probably still being written (age less than %d hours). So I don\'t delete it.' % (fname, age_in_hours), UPDATE_NETWORK_LOG_FILE)
+
+########## Merge edges #######################
+# update edges.txt, a merged file from two sources, HISTORY_DIR and HISTORY_DIR2. Some new edge files are being generated ...
+time.sleep(3)
+edge_file_lst = [] # collect edge files.
+most_recent_edge_modification_time = 0
+write_log_file('[update_network_by_force.py] Look at edge files in %s.' % (HISTORY_DIR), UPDATE_NETWORK_LOG_FILE)
+for fname in glob.glob(os.path.join(HISTORY_DIR, 'edges.txt.*')): # many small edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+ if os.path.getmtime(fname) > most_recent_edge_modification_time:
+ most_recent_edge_modification_time = os.path.getmtime(fname)
+
+write_log_file('[update_network_by_force.py] Look at edge files in %s.' % (HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
+for fname in glob.glob(os.path.join(HISTORY_DIR2, 'edges.txt.*')): # edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+ if os.path.getmtime(fname) > most_recent_edge_modification_time:
+ most_recent_edge_modification_time = os.path.getmtime(fname)
+
+
+if edge_file_lst == []:
+ write_log_file('[update_network_by_force.py] No edge files to merge in %s and %s.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
+elif os.path.getmtime(MERGED_EDGE_FILE) < most_recent_edge_modification_time: # update edges.txt only if there are newer edges to add.
+ # concatenate edge files into one
+ write_log_file('[update_network_by_force.py] Concatenate edge files in %s and %s into one file.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
+ curr_time = datetime.now().strftime('%Y%m%d_%H%M')
+ concatenate_edge_files(edge_file_lst, os.path.join(EDGE_POOL_DIR, 'edges.txt.many.one.targets.' + curr_time))
+ delete_edge_files(edge_file_lst)
+
+if os.path.getmtime(MERGED_EDGE_FILE) < os.path.getmtime(EDGE_POOL_DIR): # edge pool directory has been updated, create new edges.txt
+ write_log_file('[update_network_by_force.py] Make a new edges.txt from edge files in %s.' % (EDGE_POOL_DIR), UPDATE_NETWORK_LOG_FILE)
+ write_log_file('[update_network_by_force.py] Number of lines in the old edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 merge_edges.py'
+ os.system(cmd)
+ write_log_file('[update_network_by_force.py] Number of lines in the new edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
+    manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt ; sudo find /home/lanhui/brain/Data/temp/html_edges -name "*.html" -exec mv -t /var/www/brain/brain/static/edges {} +'
+ write_log_file('[update_network_by_force.py] %s' % (manual_copy_commands), UPDATE_NETWORK_LOG_FILE)
+
+
+
+write_log_file('[update_network_by_force.py] Update done at %s.\n\n' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S')), UPDATE_NETWORK_LOG_FILE)
diff --git a/Code/update_rnaseq_info_json.py b/Code/update_rnaseq_info_json.py
new file mode 100644
index 0000000..4d6b654
--- /dev/null
+++ b/Code/update_rnaseq_info_json.py
@@ -0,0 +1,89 @@
+# Usage: python update_rnaseq_info_json.py
+# Requires two files: old_json and tissue_file (paths are set in the main section below).
+#
+# Purpose: update the tissue field in rnaseq_info_database.json. Make
+# Data/information/experiment.and.tissue.txt in which rnaseq samples
+# with unknown tissues are predicted using knn_classify.R with K=1.
+#
+# 2 June 2017, slcu, hui
+# Last modified 19 June 2017, slcu, hui
+
+import json, os, sys
+
+def get_sra_id(x):
+ if 'RR' in x:
+ index1 = x.find('RR')
+ index2 = x.find('X')
+ if index2 == -1:
+ index2 = len(x)
+ return x[index1-1:index2]
+ return x
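+# Example (hypothetical ID): get_sra_id('GSM123_SRR1234567X2') returns
+# 'SRR1234567': the slice runs from the character before 'RR' to the
+# first 'X' (or to the end when there is no 'X'); strings without 'RR'
+# are returned unchanged.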
+
+def make_tissue_dict(fname):
+ f = open(fname)
+ lines = f.readlines()
+ d = {}
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split('\t')
+ x = lst[0]
+ y = get_sra_id(x)
+ d[y] = lst[4]
+ f.close()
+ return d
+
+def update_tissue_dict_and_tissue_file(d, fname, fname_pred):
+
+ f = open(fname_pred) # predicted file, columns are sample.name and predicted.tissue
+ lines = f.readlines()
+ f.close()
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split('\t')
+ if line != '' and len(lst) >= 2:
+ y = get_sra_id(lst[0])
+ d[y] = lst[1]
+
+ f = open(fname)
+ lines = f.readlines()
+ head_line = lines[0].strip()
+ f.close()
+ file_lines = [head_line]
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split('\t')
+ if line != '' and len(lst) >= 5:
+ k = get_sra_id(lst[0])
+ if lst[4] == 'unknown' and k in d:
+ lst[4] = d[k]
+ file_lines.append('\t'.join(lst))
+    outfile = '../Data/information/experiment.and.tissue.txt' # so that outfile does not contain 'unknown'
+ f = open(outfile, 'w')
+ f.write('\n'.join(file_lines) + '\n')
+ f.close()
+ return d
+
+
+# main
+RSCRIPT_FILE = 'knn_classify.R'
+old_json = '../Data/information/rnaseq_info_database.json' # generated by parse_xml.py
+tissue_file = '../Data/information/experiment.and.tissue.2.txt' # generated by refine_tissue.py
+tissue_dict = make_tissue_dict(tissue_file)
+if os.path.exists(RSCRIPT_FILE):
+ cmd = 'Rscript %s' % (RSCRIPT_FILE) # generate ../Data/temp/predicted.label.txt
+ os.system(cmd)
+ tissue_dict = update_tissue_dict_and_tissue_file(tissue_dict, tissue_file, '../Data/temp/predicted.label.txt')
+
+with open(old_json) as json_data:
+ json_dict = json.load(json_data)
+ for k in json_dict:
+ if k in tissue_dict:
+ json_dict[k]['tissue'] = tissue_dict[k]
+
+cmd = 'cp %s ../Data/information/rnaseq_info_database.json.old' % (old_json)
+os.system(cmd)
+fname = old_json
+with open(fname, 'w') as f:
+ json.dump(json_dict, f, indent=4)
+
+print('Check updated %s.' % (old_json))
diff --git a/Code/validate_parameter_for_buildCmatrix.py b/Code/validate_parameter_for_buildCmatrix.py
new file mode 100644
index 0000000..ced6062
--- /dev/null
+++ b/Code/validate_parameter_for_buildCmatrix.py
@@ -0,0 +1,85 @@
+# Usage: python validate_parameter_for_buildCmatrix.py
+# Purpose: make sure all files exist.
+# Hui 24 Jan 2018 Jinhua
+
+import os, sys
+import numpy as np
+import glob
+import time
+import subprocess
+from datetime import datetime
+
+def get_value(s, delimit):
+ lst = s.split(delimit, 1) # only split at the first delimit
+ return lst[1].strip()
+
+def get_key_value(s):
+ lst = s.split('=')
+ k, v = lst[0], lst[1]
+ return (k.strip(), v.strip())
+
+def validate_gene_file(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines: # check all lines
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) < 6:
+            print('Not enough fields: %s. Only %d are given. Each line must have gene_id, gene_name, chr, start, end, strand, description (optional). See prepare_gene_file.py in the documentation on how to prepare this file.' % (line, len(lst)))
+ sys.exit()
+
+def validate_parameter_for_buildcmatrix(fname):
+ # first the file must exist
+ if not os.path.exists(fname):
+ print('CANNOT FIND %s.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ location_count = 0
+ for line in lines:
+ line = line.strip()
+ if line.startswith('%%'):
+ k, v = get_key_value(line[2:])
+ d[k] = v
+            if k == 'GENE_FILE' or k == 'CHR_INFO':
+                if not os.path.exists(v):
+                    print('%s does not exist.' % (v))
+                    sys.exit()
+            if k == 'GENE_FILE':
+                validate_gene_file(v)
+            if k == 'DESTINATION':
+                if not os.path.isdir(v):
+                    print('%s does not exist.' % (v))
+                    sys.exit()
+            if k == 'TARGET_RANGE':
+                if int(v) <= 0:
+                    print('Target range (%s) must be greater than 0.' % (v))
+                    sys.exit()
+        if line.startswith('LOCATION:'):
+            v = get_value(line, ':')
+            location_count += 1
+            if not os.path.exists(v):
+                print('Location %s does not exist.' % (v))
+ #sys.exit()
+
+ if not 'GENE_FILE' in d:
+ print('Must specify GENE_FILE.')
+ sys.exit()
+ if not 'DESTINATION' in d:
+ print('Must specify DESTINATION.')
+ sys.exit()
+ if not 'CHR_INFO' in d:
+ print('Must specify CHR_INFO.')
+ sys.exit()
+ if location_count == 0:
+ print('Must contain at least one ChIP-seq.')
+ sys.exit()
+
+## main
+
+PARAMETER_FOR_BUILDCMATRIX = '../Data/parameter/parameter_for_buildCmatrix.txt'
+validate_parameter_for_buildcmatrix(PARAMETER_FOR_BUILDCMATRIX)
+
diff --git a/Code/wedge.R b/Code/wedge.R
new file mode 100644
index 0000000..50039eb
--- /dev/null
+++ b/Code/wedge.R
@@ -0,0 +1,138 @@
+# Last modified on 7 Aug 2019 by Hui Lan @ Jinhua
+#DATA.FILE <- '../Data/history/expr/TPM.txt.3130'
+DATA.FILE <- '../Data/history/expr/TPM.txt'
+TARGET.TF.FILE <- '../Data/information/target_tf.txt'
+AGINAME.FILE <- '../Data/information/AGI-to-gene-names_v2.txt'
+ONE.TARGET.DIR <- '../Data/history/edges/one_target'
+
+# Make sure we have required files and directory
+if (! file.exists(DATA.FILE)) {
+ stop(sprintf('[wedge.R] Unable to find %s', DATA.FILE))
+}
+
+if (! file.exists(TARGET.TF.FILE)) {
+ stop(sprintf('[wedge.R] Unable to find %s', TARGET.TF.FILE))
+}
+
+if (! file.exists(AGINAME.FILE)) {
+ stop(sprintf('[wedge.R] Unable to find %s', AGINAME.FILE))
+}
+
+
+if (! dir.exists(ONE.TARGET.DIR)) {
+ stop(sprintf('[wedge.R] Unable to find directory %s', ONE.TARGET.DIR))
+}
+
+
+r.tau <- 0.60
+
+cat(sprintf('Read %s\n', DATA.FILE))
+X <- read.table(DATA.FILE, header=TRUE, check.names=FALSE)
+all.id <- X$gene_id
+X$gene_id <- NULL # remove column gene_id
+row.names(X) <- all.id # add row names
+all.genes <- rownames(X)
+
+cat(sprintf('Read %s\n', AGINAME.FILE))
+#agi <- read.table(AGINAME.FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+agi <- read.table(AGINAME.FILE, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+
+cat(sprintf('Read %s\n', TARGET.TF.FILE))
+target.tf <- read.table(TARGET.TF.FILE, header=FALSE, check.names=FALSE, sep='\t')
+total.pair <- dim(target.tf)[1]
+
+###########################################################################
+post.translation.4 <- function(x, y) {
+ mx = mean(x)
+ index = (x > mx - 0.5) & (x < mx + 0.5)
+ slope = max(y[index])/mx
+ v = c(-slope, 1)
+ xy = as.matrix(cbind(x,y))
+ z = xy %*% v
+ index0 = which(z <= 0) # points below the wedge
+ index1 = which(z > 0) # points above the wedge
+ index2 = which(x <= 0.1) # x has low value, then y is expected to have low value too
+ if (length(index2) > 0) {
+ q = quantile(y[index2], 0.9)
+ m = mean(y[index2])
+ } else {
+ q = 0.0
+ m = 0.0
+ }
+ index3 = which(x < 1)
+ if (length(index3) > 0) {
+ m = mean(y[index3])
+ } else {
+ m = 0.0
+ }
+    # For a scatterplot to be considered wedge-shaped, most points should
+    # fall below the wedge line (high percent), q and m should be small,
+    # and y should be more dispersed than x; the exact thresholds
+    # (percent > 0.95, q < 0.25, m < 1.0, disp.y > 1.2 * disp.x) are
+    # applied in the main loop below.
+ result <- list(below=index0, upper=index1, percent=length(index0)/length(x), q=q, m=m, slope=slope, disp.x=sd(x)/mean(x), disp.y=sd(y)/mean(y))
+}
+
+
+make.data <- function(slope, n) {
+ x=abs(3.0 + 1*rnorm(n))
+ y=abs(3.0 + 1*rnorm(n))
+ v = c(-slope, 1)
+ xy = as.matrix(cbind(x,y))
+ z = xy %*% v
+ index = which(z <= 0)
+ result <- list(x=x[index], y=y[index])
+}
+###########################################################################
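+# A minimal usage sketch (illustrative only): simulate points below a
+# wedge of a given slope with make.data(), then inspect what
+# post.translation.4() reports for them, e.g.
+#   d <- make.data(slope=1.5, n=500)
+#   r <- post.translation.4(d$x, d$y)
+#   cat(r$percent, r$q, r$m, r$slope, '\n')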
+
+cat(sprintf('Go through pairs looking for wedge shapes ...\n'))
+
+output.file <- paste('../Data/history/edges/one_target/edges.txt', 'wedge', format(Sys.time(), "%b.%d.%Y.%H%M%S"), sep='.')
+f <- file(output.file, 'w')
+
+for (i in 1:total.pair) {
+ id1 <- as.vector(target.tf[i,2]) # tf
+ id2 <- as.vector(target.tf[i,1]) # target
+
+ all.in <- id1 %in% all.genes & id2 %in% all.genes
+ if (!all.in) {
+ next
+ }
+
+ x <- X[id1,]
+ y <- X[id2,]
+ x <- log(x+1)
+ y <- log(y+1)
+ x <- t(x)
+ y <- t(y)
+ na.ratio <- max(sum(is.na(x))/length(x), sum(is.na(y))/length(y))
+ index <- x < 0.01 | y < 0.01 | na.ratio > 0.5 # make sure very small values are not included
+ x <- x[!index, 1, drop=FALSE]
+ y <- y[!index, 1, drop=FALSE]
+
+ if (dim(x)[1] < 50) {
+ next
+ }
+
+ # We will not consider wedge shape if the correlation coefficient is large enough.
+ if (abs(cor(x, y)) < r.tau) {
+ result <- post.translation.4(x, y)
+ if (result$percent > 0.95 & result$q < 0.25 & result$m < 1.0 & result$disp.y > 1.2 * result$disp.x) {
+ #name1 <- agi[id1,1]
+ #name2 <- agi[id2,1]
+ name1 <- agi$V2[which(agi$V1 == id1)]
+ name2 <- agi$V2[which(agi$V1 == id2)]
+ max.r <- max(r.tau, result$percent * exp(-max(result$q, result$m)))
+ curr.date <- gsub('-','',Sys.Date())
+ loglik <- '-1001.0'
+ rna.sample <- row.names(x)[result$below] # below the diagonal line
+ #rna.sample.size <- length(rna.sample)
+ #rna.sample.2 <- sample(rna.sample, ceiling(rna.sample.size^0.7)) # to save space, keep only a fraction of the rnaseq sample IDs
+ sub.cond <- paste(rna.sample, collapse=' ')
+ sub.cond.length <- length(rna.sample)
+ cond <- as.vector(target.tf[i,3])
+ result2 <- sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, max.r, 'mix', sub.cond.length, cond, loglik, curr.date, max.r, 'wedge')
+
+ cat(result2, file=f, sep='')
+ }
+ }
+}
+
+close(f)
--
cgit v1.2.1