-rw-r--r--  .gitignore  19
-rw-r--r--  Code/MixReg.R  173
-rw-r--r--  Code/TPM2JSON.py  115
-rw-r--r--  Code/assign_tissue.py  139
-rw-r--r--  Code/brain_number_genes_edges.py  48
-rw-r--r--  Code/buildCmatrix.py  234
-rw-r--r--  Code/buildRmatrix.py  234
-rw-r--r--  Code/common_peak.py  119
-rw-r--r--  Code/configure.py  56
-rw-r--r--  Code/correlation_per_group.R  142
-rw-r--r--  Code/correlation_per_group_fixed_number.R  217
-rw-r--r--  Code/correlation_per_tissue.R  101
-rw-r--r--  Code/count_word.py  36
-rw-r--r--  Code/create_edges.py  840
-rw-r--r--  Code/create_edges.r  99
-rw-r--r--  Code/create_edges0.py  215
-rw-r--r--  Code/create_edges0B.py  217
-rw-r--r--  Code/create_edges3.py  615
-rw-r--r--  Code/create_edges4.py  450
-rw-r--r--  Code/create_edges4_k2.py  253
-rw-r--r--  Code/create_edges4_k3.py  255
-rw-r--r--  Code/create_edges_k2.R  136
-rw-r--r--  Code/create_edges_mixtool.R  154
-rw-r--r--  Code/degree_of_separation.py  43
-rw-r--r--  Code/degree_of_separation2.py  45
-rw-r--r--  Code/delete_not_used_fastq.py  43
-rw-r--r--  Code/download_and_map.py  390
-rw-r--r--  Code/download_ena_metadata.py  43
-rw-r--r--  Code/exclude_edges.py  55
-rw-r--r--  Code/geneid2name.py  29
-rw-r--r--  Code/get_TPM_by_salmon.py  142
-rw-r--r--  Code/get_binding.py  384
-rw-r--r--  Code/html_network.py  942
-rw-r--r--  Code/json_test.py  8
-rw-r--r--  Code/knn_classify.R  79
-rw-r--r--  Code/local_network.py  1114
-rw-r--r--  Code/make_graphviz_file3B.py  236
-rw-r--r--  Code/make_graphviz_file3C.py  273
-rw-r--r--  Code/make_parameter_bw.py  127
-rw-r--r--  Code/make_parameter_dapseq2.py  57
-rw-r--r--  Code/make_parameter_dapseq3.py  75
-rw-r--r--  Code/make_parameter_rnaseq.py  163
-rw-r--r--  Code/make_target_tf.py  290
-rw-r--r--  Code/make_target_tf_agris.py  39
-rw-r--r--  Code/make_upload_chip_parameter.py  233
-rw-r--r--  Code/merge_edges.py  190
-rw-r--r--  Code/param4net.py  25
-rw-r--r--  Code/parse_ena_xml.py  364
-rw-r--r--  Code/parse_ena_xml_test.py  307
-rw-r--r--  Code/prepare_gene_file.py  79
-rw-r--r--  Code/process_3way_interaction.py  48
-rw-r--r--  Code/process_3way_interaction2.py  58
-rw-r--r--  Code/refine_tissue.py  302
-rw-r--r--  Code/rnaseq_or_chipseq.py  92
-rw-r--r--  Code/slice_TPM_to_JSON.py  164
-rw-r--r--  Code/slice_binding_to_JSON.py  172
-rw-r--r--  Code/test_network4.py  205
-rw-r--r--  Code/text2json.py  19
-rwxr-xr-x  Code/update_network.py  895
-rw-r--r--  Code/update_network_by_force.py  113
-rw-r--r--  Code/update_rnaseq_info_json.py  89
-rw-r--r--  Code/validate_parameter_for_buildCmatrix.py  85
-rw-r--r--  Code/wedge.R  138
63 files changed, 13022 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..52dc540
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,19 @@
+# Don't track these folders
+brain.code.download.20190202/
+brain.code.download.20180728/
+Salmon/
+Trash/
+Webapp-brain/
+Webapp/
+code_review_drawScatterplot/
+brain.documentation/
+Analysis/
+
+Data/*
+!Data/parameter/parameter_for_buildRmatrix.txt
+!Data/parameter/parameter_for_buildCmatrix.txt
+!Data/parameter/parameter_for_net*
+
+Code/*.old*
+Code/*.pyc*
+Code/__pycache__/
diff --git a/Code/MixReg.R b/Code/MixReg.R
new file mode 100644
index 0000000..a109430
--- /dev/null
+++ b/Code/MixReg.R
@@ -0,0 +1,173 @@
+k.lst <- c(3)
+target <- 'AT2G28507'
+id2 <- target
+tfs <- c('AT5G54230','AT5G65310','AT5G17300','AT1G14687','AT4G00730','AT3G13810','AT1G52880','AT1G74480','AT1G30490','AT4G26840','AT5G03150','AT5G66730','AT2G02450','AT1G49480','AT1G69780','AT3G04070','AT2G02070','AT1G03840','AT5G01380','AT3G61150','AT2G22430','AT2G25930','AT5G47370','AT4G28500','AT1G01060','AT5G03790','AT5G13180','AT1G28470','AT1G69490','AT1G55110','AT3G60580','AT4G36740','AT1G51220','AT1G19850','AT3G15500','AT2G02080','AT1G75240')
+conditions <- c('C0002000000374','C0002000000147 C0002000000148','C0002000000440','C0002000000220','C0002000000055','C0002000000221','C0002000000403 C0002000000404','C0002000000436','C0002000000421','C0001000011111 C0001000012119','C0002000000334','C0002000000217 C0002000000218','C0002000000019','C0002000000434','C0002000000126','C0002000000026','C0002000000332','C0002000000351','C0002000000313','C0002000000318','C0002000000149','C0001000007335','C0002000000316 C0002000000317','C0002000000448','C0002000000346','C0002000000348 C0002000000349','C0002000000044','C0002000000450','C0002000000405 C0002000000406','C0002000000333','C0002000000192','C0002000000143','C0002000000505','C0002000000352','C0002000000031','C0002000000331','C0002000000140')
+recent.edge <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
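+# Note: tfs, conditions and recent.edge are parallel vectors: conditions[i]
+# holds the condition IDs (ChIP experiment IDs such as C0002000000374) that
+# support the candidate edge tfs[i] -> target, and recent.edge[i] == 1 marks a
+# pair computed recently, which the main loop below skips.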
+jsonTPM.dir <- '/home/hui/network/v03/Data/history/expr/jsonTPM_20170424_154323'
+AGINAME_FILE <- '/home/hui/network/v03/Data/information/AGI-to-gene-names_v2.txt'
+
+post.translation <- function(x, y) {
+ mean.x <- mean(x)
+ sd.x <- sd(x)
+ index <- x > mean.x - sd.x & x < mean.x + sd.x
+ sd.y <- sd(y[index])
+ result <- list(value=ifelse(mean.x < 2.0, 0.0, (mean.x/max(x)) * sd.y * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+}
+
+post.translation.2 <- function(x, y) {
+  # x is constitutively high while y varies a lot
+  mean.x <- mean(x)
+  sd.x <- max(sd(x), 1) # a number of at least 1
+  index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
+  sd.y <- quantile(y[index],0.85)-quantile(y[index],0.15) # dispersion of y within the window
+  sd.y.2 <- quantile(y,0.85)-quantile(y,0.15) # dispersion of all y
+  v.disp <- sd.y/max(1, sd.y.2) # how dispersed y is within the window, a number between 0 and 1
+  # value measures the dispersion of y and the fraction of points within the window
+ result <- list(value=ifelse(mean.x < 2.0, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+}
+
+post.translation.3 <- function(x, y) {
+  # x is constitutively high while y varies a lot
+ mean.x <- mean(x)
+ upper.percentile <- 0.85 # used for computing vertical dispersion
+ lowest.n <- 3 # number of points with lowest x values
+ min.mean.x <- max(2.0, quantile(x, 0.25)) # mean of x must be greater than this value
+ sd.x <- min(sd(x), 1) # a number between 0 and 1
+ index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
+ sd.y <- quantile(y[index],upper.percentile)-quantile(y[index],1-upper.percentile) # dispersion of y within the window
+ sd.y.2 <- quantile(y,upper.percentile)-quantile(y,1-upper.percentile) # dispersion of all y
+ v.disp <- sd.y/max(1, sd.y.2) # how disperse y is within the window, a number between 0 and 1
+
+ rst <- sort(x, index.return=T)
+ top.n <- sum(rst$x < 1)
+ top.n <- max(1, min(top.n, lowest.n))
+ small.y <- min(mean(y[rst$ix[1:top.n]]), mean(y[x<1])) # use the smaller value
+ small.y <- ifelse(is.nan(small.y)==T, 999, small.y)
+  # value measures the dispersion of y and the fraction of points within the window
+ result <- list(valid=small.y, value=ifelse(mean.x < min.mean.x, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+}
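+# Hypothetical usage of post.translation.3 (not run): when the TF x is
+# constitutively high and the target y is widely dispersed, 'value' is large.
+#   x <- rnorm(200, mean=5, sd=0.5)                # constitutively high TF
+#   y <- c(rnorm(100, 1, 0.3), rnorm(100, 6, 0.3)) # target varies a lot
+#   t <- post.translation.3(x, y)
+#   t$value; t$percent                             # e.g., roughly 0.7 here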
+
+in.component <- function(posterior, k) {
+ # posterior is an Nxk matrix, each row is a data points, and each col is prob belonging to a component
+ p = posterior[,k]
+ n = length(p)
+ index <- rep(F,n)
+ for (i in 1:n) {
+ if (p[i] > runif(1)) {
+ index[i] = T
+ }
+ }
+ result <- index
+}
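+# The loop above is equivalent to the vectorized form
+#   index <- posterior[, k] > runif(nrow(posterior))
+# i.e., each point is assigned to component k with probability equal to its posterior.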
+
+####### Read data #########################################
+CORR_THRESHOLD <- 0.7
+agi <- read.table(AGINAME_FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+#######################################################
+library(mixtools)
+library(rjson)
+name2 <- agi[id2,1]
+result <- ''
+for (i in 1:length(tfs)) {
+ if (recent.edge[i] == 1) {
+ next
+ }
+ curr.date <- gsub('-','',Sys.Date())
+ id1 <- tfs[i]
+ name1 <- agi[id1,1]
+ cond <- conditions[i]
+
+ file.x <- paste(jsonTPM.dir, paste(id1, '.json', sep=''), sep='/')
+ if (!file.exists(file.x)) { next }
+ x <- as.data.frame(fromJSON(file = file.x))
+ x <- log(x+1)
+ rcond.x <- names(x)
+ x <- as.vector(t(x)) # convert it to a vector
+
+ file.y <- paste(jsonTPM.dir, paste(id2, '.json', sep=''), sep='/')
+  if (!file.exists(file.y)) { break } # the target's expression file is missing, so no TF pair can be computed; stop entirely
+ y <- as.data.frame(fromJSON(file = file.y))
+ y <- log(y+1)
+ rcond.y <- names(y)
+ y <- as.vector(t(y)) # convert it to a vector
+
+ rna.sample.id <- rcond.x
+ if (all(rcond.x == rcond.y) == FALSE | id1 == id2) { # if the IDs in two json files do not match, or target is the same as tf, then ignore
+ next
+ }
+
+ MIN_SIZE <- min(100, max(10, ceiling(0.5 * length(x))))
+
+ index <- x < 0.01 | y < 0.01 # don't include data that is too small
+ x <- x[!index]
+ y <- y[!index]
+
+ if (length(x) < MIN_SIZE) {
+ next
+ }
+ r <- cor(x, y)
+ if (abs(r) >= CORR_THRESHOLD) {
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2, id1, name1, r, 'all', '.', cond, '.', curr.date)
+ result <- paste(result, s, sep='')
+ next # a good correlation is found using all experiments, so not necessary to look further
+ }
+
+ rna.sample.id <- rna.sample.id[!index] # this step is important to make the following index work
+
+ pos_r_max <- -2
+ pos_r_N <- 0
+ pos_r_index <- c()
+ pos_r_loglik <- -100000000
+
+ neg_r_max <- 2
+ neg_r_N <- 0
+ neg_r_index <- c()
+ neg_r_loglik <- -100000000
+
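+  # regmixEM (from the mixtools package) fits a k-component mixture of linear
+  # regressions of y on x; each component j is then sampled stochastically via
+  # in.component() and kept only if its correlation and size pass the
+  # thresholds, tracking the best positive and negative components separately.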
+ for (k in k.lst) {
+ em.out <- regmixEM(y, x, maxit=150, epsilon=1e-04, k=k)
+ for (j in seq(1,k,1)) {
+ index <- in.component(em.out$posterior, j)
+ size <- sum(index)
+ r <- cor(em.out$x[index,2], em.out$y[index])
+ if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
+ pos_r_max <- r
+ pos_r_N <- size
+ pos_r_index <- index
+ pos_r_loglik <- em.out$loglik
+ }
+ if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
+ neg_r_max <- r
+ neg_r_N <- size
+ neg_r_index <- index
+ neg_r_loglik <- em.out$loglik
+ }
+ }
+ }
+ hit <- 0
+ if (pos_r_max > 0) { # has a good positive correlation
+ sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
+ result <- paste(result, s, sep='')
+ hit <- hit + 1
+ }
+ if (neg_r_max < 0) { # has a good negative correlation
+ sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
+ result <- paste(result, s, sep='')
+ hit <- hit + 1
+ }
+ if (hit == 0) {
+ t <- post.translation.3(x, y)
+ post.r <- t$percent
+ if (t$valid < quantile(y,0.25) & t$value > 0.69 & post.r >= 0.70 & length(t$index) > MIN_SIZE) {
+ sub.cond <- paste(rna.sample.id[t$index], collapse=' ')
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2, id1, name1, post.r, 'mix', sub.cond, cond, '.', curr.date)
+ result <- paste(result, s, sep='')
+ }
+ }
+}
+
+output.file <- paste('../Data/history/edges/one_target/edges.txt', id2, format(Sys.time(), '%b.%d.%Y.%X'), 'k3', sep='.')
+if (result != '') cat(result, file=output.file, sep='')
diff --git a/Code/TPM2JSON.py b/Code/TPM2JSON.py
new file mode 100644
index 0000000..6d5a423
--- /dev/null
+++ b/Code/TPM2JSON.py
@@ -0,0 +1,115 @@
+# Usage: python TPM2JSON.py parameter_for_net.txt
+# Purpose:
+#   For each gene in TPM.txt, make a JSON file under JSON_DIR so that we don't need to load the whole TPM.txt later (more memory efficient).
+# 4 APR 2017, hui, slcu
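+# Example (values hypothetical): a TPM.txt that starts with
+#   gene_id    R0001    R0002
+#   AT1G01010  3.21     0.00
+# yields ../Data/history/expr/jsonTPM/AT1G01010.json containing
+#   {"R0001": 3.21, "R0002": 0.0}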
+
+import sys, os, operator, itertools
+import numpy as np
+import json
+from param4net import make_global_param_dict
+
+JSON_DIR = '../Data/history/expr/jsonTPM' # Don't change this
+
+def read_matrix_data(fname):
+ '''
+ fname - a file, first line is head, first column is row name.
+ '''
+
+ lineno = 0
+ colid = []
+ rowid = []
+ d = {} # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
+ d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
+ d3 = {} # {gene1: [], gene2: [], ...}
+ d4 = {} # {cond1:[], cond2:[], ...}
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+
+ head_line = lines[0].strip()
+ lst = head_line.split()
+ colid = lst[1:]
+
+ for c in colid:
+ d2[c] = {}
+ d4[c] = []
+
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split()
+ g = lst[0]
+ rowid.append(g)
+ d[g] = {}
+ levels = lst[1:]
+ if len(levels) != len(colid):
+ print('Incomplete columns at row %s' % (g))
+ sys.exit()
+
+ d3[g] = []
+ for i in range(len(colid)):
+ c = colid[i]
+ d[g][c] = float(levels[i])
+ d2[c][g] = float(levels[i])
+ d3[g].append(float(levels[i]))
+ d4[c].append(float(levels[i]))
+ lineno += 1
+
+ d_return = {}
+ d_return['xy'] = d # first gene, then condition
+ d_return['yx'] = d2 # first condition, then gene
+ d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row
+ d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column
+    d_return['nrow'] = lineno # number of gene rows (header line excluded)
+ d_return['ncol'] = len(colid)
+ d_return['rowid'] = rowid
+ d_return['colid'] = colid
+
+ d4_sorted = {}
+ for k in d4:
+ d4_sorted[k] = sorted(d4[k], reverse=True)
+ d_return['yy_sorted'] = d4_sorted
+
+ return d_return
+
+def check_json_file(expr_dict, dir_name):
+ ''' Check if json files are good, return True if yes. '''
+
+ if not os.path.isdir(dir_name):
+ return False
+
+ d = expr_dict['xy']
+ col_name_lst = expr_dict['colid']
+ row_name_lst = expr_dict['rowid']
+    for g in row_name_lst[:10]: # spot-check the first 10 genes
+ d2 = d[g]
+ filename = os.path.join(dir_name, g + '.json')
+ if not os.path.exists(filename):
+ return False
+ with open(filename) as f:
+ d3 = json.load(f)
+ if len(d2) != len(d3):
+ return False
+
+ return True
+
+def make_json_file(expr_dict, dir_name):
+    if not os.path.isdir(dir_name): # create the directory if it does not exist
+ os.makedirs(dir_name)
+
+ d = expr_dict['xy']
+ col_name_lst = expr_dict['colid']
+ row_name_lst = expr_dict['rowid']
+ for g in row_name_lst:
+ d2 = d[g]
+ filename = os.path.join(dir_name, g + '.json')
+ with open(filename, 'w') as f:
+ json.dump(d2, f)
+
+
+## main
+param_file = sys.argv[1] # a single parameter file
+glb_param_dict = make_global_param_dict(param_file)
+expr_dict = read_matrix_data(glb_param_dict['EXPRESSION_MATRIX'])
+if not check_json_file(expr_dict, JSON_DIR):
+ make_json_file(expr_dict, JSON_DIR)
diff --git a/Code/assign_tissue.py b/Code/assign_tissue.py
new file mode 100644
index 0000000..782ab78
--- /dev/null
+++ b/Code/assign_tissue.py
@@ -0,0 +1,139 @@
+# Usage: python assign_tissue.py > ../Data/temp/experiment.and.tissue.1.txt
+# Set TPM_FILE, d and d2 below before running.
+#
+# Purpose: for each RNA-seq column in the TPM_FILE, get its tissue information.
+# Execute this command first to avoid encoding errors: export PYTHONIOENCODING=UTF-8
+#
+# 2 June 2017, slcu, hui
+# Last modified 19 June 2017, slcu, hui
+
+import os, sys, json
+import urllib2
+
+def make_tissue_dict(fname):
+ f = open(fname)
+ d = json.load(f)
+ f.close()
+ return d
+
+def get_experiment_id(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ result = []
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) >= 2:
+ result.append(lst[1])
+ return result
+
+def get_sra_id(x):
+ if 'RR' in x:
+ index1 = x.find('RR')
+ index2 = x.find('X')
+ if index2 == -1:
+ index2 = len(x)
+ return x[index1-1:index2]
+ return x
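+# Example (column name hypothetical): get_sra_id('R0003SRR1234567X1') returns
+# 'SRR1234567': the letter before 'RR' (S/E/D) is kept, and everything from the
+# first 'X' onwards is dropped.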
+
+def make_sample_dict(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) >= 2:
+ runid = lst[0]
+ sample = lst[1]
+ d[runid] = (sample, ';'.join(lst[2:]))
+ return d
+
+
+def stringfy_json(d):
+ s = ''
+
+ if "organismPart" in d["characteristics"]:
+ s = d["characteristics"]["organismPart"][0]["text"]
+ else:
+ s = 'part_unknown'
+
+ if "accession" in d:
+ s += '\t' + d["accession"]
+ else:
+ s += '\t' + '.'
+
+ if "name" in d:
+ s += '\t' + d["name"]
+ else:
+ s += '\t' + '.'
+
+ if "synonym" in d["characteristics"]:
+ s += '\t' + d["characteristics"]["synonym"][0]["text"]
+ else:
+ s += '\t' + '.'
+
+ if "ecotype" in d["characteristics"]:
+ s += '\t' + d["characteristics"]["ecotype"][0]["text"]
+ else:
+ s += '\t' + '.'
+
+ if "developmentStage" in d["characteristics"]:
+ s += '\t' + d["characteristics"]["developmentStage"][0]["text"]
+ else:
+ s += '\t' + '.'
+
+    if "description" in d:
+        s += '\t' + (d["description"] if d["description"] is not None else '.') # parentheses needed: without them the tab is dropped when description is None
+    else:
+        s += '\t' + '.'
+
+ return s
+
+def make_information(s, info_dir):
+ lst = s.split('...')
+ sample_id = lst[0]
+ filename = '%s/%s.json' % (info_dir, sample_id)
+ #url = 'https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession?accession=%s' % (sample_id)
+ if not os.path.exists(filename):
+ cmd = 'curl -s -H Content-Type:application/json https://www.ebi.ac.uk/biosamples/api/samples/search/findByAccession\?accession\=%s > %s' % (sample_id, filename)
+ os.system(cmd)
+
+ f = open(filename)
+ d = json.load(f)
+ f.close()
+ #d = json.load(urllib2.urlopen(url))
+ if len(d["_embedded"]["samples"]) > 0:
+ return stringfy_json(d["_embedded"]["samples"][0])
+ else:
+ return '.\t.'
+
+# main
+BIOSAMPLE_INFO_DIR = '/home/hui/network/v03/Data/information/BioSample' # put downloaded BioSample json files here
+if not os.path.isdir(BIOSAMPLE_INFO_DIR):
+ os.makedirs(BIOSAMPLE_INFO_DIR)
+
+# get first row in the TPM file
+TPM_FILE = '../Data/history/expr/TPM.txt'
+cmd = 'head -1 %s | perl -ne \'@words = split /\t/; $count = 1; for $x (@words) {print $count, "\t", $x, "\n"; $count++}\' > ../Data/temp/a.txt' % (TPM_FILE)
+os.system(cmd)
+
+lst = get_experiment_id('../Data/temp/a.txt')
+d = make_tissue_dict('../Data/information/rnaseq_info_database.json') # generated by parse_xml.py; mainly for getting tissue names (inferred from word frequency in the description)
+d2 = make_sample_dict('../Data/information/rnaseq_info_database.txt') # generated by parse_xml.py; mainly for getting the BioSample ID of each run
+head = ''
+for x in lst:
+ k = get_sra_id(x) # get rid of prefix R00... and suffix ..XX
+ s = x
+ if k in d:
+ s += '\t' + d[k]['tissue'] + '\t' + make_information(d2[k][0].decode('utf8'), BIOSAMPLE_INFO_DIR) + '\t' + d2[k][1].decode('utf8')
+ elif x.startswith('R0001') and ('146' in x or '147' in x): # inhouse data
+ s += '\t' + 'seedling\t' + '\t'.join(8*['.'])
+ elif x.startswith('R0002'): # pcubas (Spain) data
+ s += '\t' + 'meristem\t' + '\t'.join(8*['.'])
+ else:
+        s += '\t' + '\t'.join(9*['.']) # no match found; e.g., x is the leading 'gene_id' column
+ print(s)
+ head += s.replace('\t', '_') + '\t'
diff --git a/Code/brain_number_genes_edges.py b/Code/brain_number_genes_edges.py
new file mode 100644
index 0000000..d1ace68
--- /dev/null
+++ b/Code/brain_number_genes_edges.py
@@ -0,0 +1,48 @@
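+# Purpose: count the distinct TFs, targets, genes and edges in an edges.txt
+# file. Each line is assumed to start with two TAB-separated fields,
+# '<target id> <target name>' and '<tf id> <tf name>', matching the edge lines
+# written by MixReg.R.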
+import os, operator, sys
+
+def make_dict(fname):
+ f = open(fname)
+ gene_dict = {}
+ target_dict = {}
+ tf_dict = {}
+ edge_dict = {}
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0].split()[0].upper()
+ tf = lst[1].split()[0].upper()
+ if not tf in tf_dict:
+ tf_dict[tf] = 1
+ else:
+ tf_dict[tf] += 1
+
+ if not target in target_dict:
+ target_dict[target] = 1
+ else:
+ target_dict[target] += 1
+
+ if not tf in gene_dict:
+ gene_dict[tf] = 1
+ else:
+ gene_dict[tf] += 1
+ if not target in gene_dict:
+ gene_dict[target] = 1
+ else:
+ gene_dict[target] += 1
+ k = tf + '.' + target
+ if not k in edge_dict:
+ edge_dict[k] = 1
+ else:
+ edge_dict[k] += 1
+
+ f.close()
+ return tf_dict, target_dict, gene_dict, edge_dict
+
+
+# main
+edge_fname = '/home/hui/network/v03/Data/history/edges/edges.txt'
+tf_dict, target_dict, gene_dict, edge_dict = make_dict(edge_fname)
+print('Number of TFs: %d' % len(tf_dict))
+print('Number of targets: %d' % len(target_dict))
+print('Number of genes: %d' % len(gene_dict))
+print('Number of edges: %d' % len(edge_dict))
diff --git a/Code/buildCmatrix.py b/Code/buildCmatrix.py
new file mode 100644
index 0000000..5cbd30c
--- /dev/null
+++ b/Code/buildCmatrix.py
@@ -0,0 +1,234 @@
+# Usage: python buildCmatrix.py parameter_for_buildCmatrix.txt > binding.txt
+#
+# Purpose: combine all binding columns into a matrix, binding.txt.
+# Each ChIP-seq experiment has the following binding column file in
+# the following format
+#
+# gene_id C0003000001450
+# AT1G01010 0
+# AT1G01020 0
+# AT1G01030 0
+# AT1G01040 0
+# AT1G01046 0
+# AT1G01050 0
+# AT1G01060 0
+# ...
+#
+# These column files are stored under DESTINATION in parameter_for_buildCmatrix.txt
+# Created on 3 JAN 2017 hui SLCU
+# Last modified 5 APR 2017 SLCU hui
+# Last modified 6 Aug 2019 Hui Lan <lanhui@zjnu.edu.cn> [now accept a second command-line argument 'include-all']
+
+import sys, os
+from datetime import datetime
+
+####################################
+GLB_PARAM_SYMBOL = '%%'
+LCL_PARAM_SYMBOL = '%'
+DATA_SYMBOL = '@' # followed by data name, or condition name, should be unique
+CHIP_SYMBOL = 'ChIP' # followed by ChIP peak file path
+CHIP_GENE_SYMBOL = 'ChIP_GENE' # followed by ChIP protein name (in upper case, gene id or gene name)
+RNA_SYMBOL = 'RNA' # followed by RNA data, a TPM table, first column is gene id, second column is TPM value
+BIGWIG_SYMBOL = 'BIGWIG' # future work
+DESCRI_SYMBOL = 'DESCRIPTION' # followed by data description
+####################################
+
+
+def get_key_value(s):
+ lst = s.split('=', 1)
+ k, v = lst[0], lst[1]
+ return (k.strip(), v.strip())
+
+
+def get_value(s, delimit):
+ lst = s.split(delimit, 1)
+ return lst[1].strip()
+
+
+def make_global_param_dict(fname):
+ f = open(fname)
+ d = {'GENE_FILE':'', 'TARGET_RANGE':'3000', 'FC':'2.0', 'PVALUE':'0.0001', 'QVALUE':'0.01', 'DESTINATION':'', 'REBUILD_LIST':[] } # change
+ for line in f:
+ line = line.strip()
+ if line.startswith(GLB_PARAM_SYMBOL):
+ s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
+ lst = s.split('\t') # separate items by TAB
+ for x in lst:
+ if x != '':
+ k, v = get_key_value(x)
+ d[k] = v
+ if k == 'REBUILD_LIST' and v.lower() != 'all' and v != '':
+ d[k] = v.split() # make a list and rewrite d[k]
+ elif k == 'REBUILD_LIST':
+ d[k] = []
+ f.close()
+ return d
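+# A global parameter line in parameter_for_buildCmatrix.txt is assumed to look
+# like this (values hypothetical; key=value pairs separated by TABs):
+#   %%GENE_FILE=../Data/information/genes.txt<TAB>DESTINATION=../Data/C/Mapped<TAB>REBUILD_LIST=all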
+
+
+def make_data_dict(fname):
+ ''' Scan parameter_for_buildCmatrix.txt and get its information into a dictionary, where key is ChIP-seq experiment ID, and value is a dictionary containing information for that experiment'''
+
+ # d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100
+ # f = open(fname)
+ # lines = f.readlines()
+ # f.close()
+ # for line in lines:
+ # line = line.strip()
+ # if line == '' or line.startswith('#'):
+ # continue
+ # if line.startswith(DATA_SYMBOL):
+ # s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ # s = s.strip() # s is ChIP-seq ID
+ # if s in d:
+ # print('ID %s duplicate' % (s))
+ # sys.exit()
+ # d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ # d['ID_LIST'].append(s)
+ # if line.startswith('DESCRIPTION:'):
+ # d[s]['DESCRIPTION'] = get_value(line, ':')
+ # elif line.startswith('PROTEN_NAME:'):
+ # d[s]['PROTEN_NAME'] = get_value(line, ':')
+ # elif line.startswith('PROTEIN_ID:'):
+ # d[s]['PROTEIN_ID'] = get_value(line, ':')
+ # elif line.startswith('DATA_NAME:'):
+ # d[s]['DATA_NAME'] = get_value(line, ':')
+ # elif line.startswith('DATA_FORMAT:'):
+ # d[s]['DATA_FORMAT'] = get_value(line, ':')
+ # elif line.startswith('LOCATION:'):
+ # d[s]['LOCATION'] = get_value(line, ':')
+ # elif line.startswith('NOTE:'):
+ # d[s]['NOTE'] = get_value(line, ':')
+ # elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
+ # make_local_parameter(d[s]['PARAM'], line)
+
+ # return d
+
+ # Essentially the same as make_data_dict from get_binding.py
+ d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#'):
+ continue
+ if line.startswith(DATA_SYMBOL):
+ s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ s = s.strip()
+            if s in d: # duplicated ID; check parameter_for_buildCmatrix.txt
+                print('buildCmatrix: ID %s is duplicated.' % (s))
+                sys.exit()
+ d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+ elif line.startswith('PROTEN_NAME:'):
+ d[s]['PROTEN_NAME'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_ID:'):
+ d[s]['PROTEIN_ID'] = get_value(line, ':')
+ elif line.startswith('DATA_NAME:'):
+ d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+                if os.path.exists(d[s]['LOCATION']): # include this ID only when its file exists (the location may no longer have that file)
+ d['ID_LIST'].append(s)
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+ elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
+ make_local_parameter(d[s]['PARAM'], line)
+
+ d['ID_LIST'] = sorted(d['ID_LIST'])
+ return d
+
+
+def make_chip_data(fname):
+ ''' convert each binding column file into a dictionary '''
+ d = {}
+ if not os.path.exists(fname):
+ print('buildCmatrix: Cannot find file %s' % (fname))
+ sys.exit()
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+
+ for line in lines[1:]: # ignore first line, such as 'gene_id C0001100007100'
+ line = line.strip()
+ lst = line.split()
+ g = lst[0]
+ v = lst[1]
+ d[g] = v
+
+ return d
+
+
+def get_update_date(s):
+    index = s.find('update:')
+    if index < 0:
+        return '00000000'
+    result = s[s.rfind('update:')+7:].strip()
+    if result.isdigit() and len(result) == 8: # dates are in YYYYMMDD format, matching datetime.now().strftime('%Y%m%d') below
+        return result
+    else:
+        return '00000000'
+
+def make_table(gene_file, data_dict, glb_param_dict):
+ '''
+ Each line in gene file contains TAB-separated fields: gene_id, gene_name, chr, start, end, strand, description (optional)
+ '''
+
+ if glb_param_dict['REBUILD_LIST'] == []: # if not specified, use all
+ id_lst_all = data_dict['ID_LIST']
+ else:
+ id_lst_all = glb_param_dict['REBUILD_LIST']
+
+    # When we build binding.txt, we don't include ChIP data marked 'obsolete' in its NOTE field.
+ id_lst = []
+ for i in id_lst_all:
+ note = data_dict[i]['NOTE'].lower()
+ curr_date = datetime.now().strftime('%Y%m%d')
+ include_this_id = not 'obsolete' in note \
+                          and int(curr_date) - int(get_update_date(data_dict[i]['NOTE'])) < 7 # skip entries whose update date is more than 7 days old (crude integer difference of YYYYMMDD values)
+ if include_this_id or FORCE_INCLUDE_ALL: # don't include ChIP-seq marked with 'obsolete'
+ id_lst.append(i)
+
+ if id_lst == []:
+ print('buildCmatrix: ChIP-seq ID list is empty.')
+ sys.exit()
+
+ # head line of binding.txt
+ id_str = 'gene_id'
+ for myid in id_lst:
+ id_str += '\t' + myid
+ print(id_str)
+
+ chip_data_dict = {}
+ for myid in id_lst:
+ chip_file = os.path.join(glb_param_dict['DESTINATION'], myid + '.txt') # each binding column file has name such as C0001100007100.txt
+ chip_data = make_chip_data(chip_file)
+ chip_data_dict[myid] = chip_data
+
+ f = open(gene_file)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ gene_id = lst[0]
+ s0 = gene_id
+ for myid in id_lst:
+ if gene_id in chip_data_dict[myid]:
+ s0 += '\t' + chip_data_dict[myid][gene_id]
+ else:
+ s0 += '\t' + '-1'
+ print(s0)
+ f.close()
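+# The resulting binding.txt (redirected from stdout) looks like this (values
+# hypothetical):
+#   gene_id    C0001000001000  C0003000001450
+#   AT1G01010  0               -1
+# where -1 means the gene is absent from that experiment's binding column file.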
+
+
+
+## main
+param_file = sys.argv[1]
+FORCE_INCLUDE_ALL = False
+if len(sys.argv) > 2:
+ FORCE_INCLUDE_ALL = sys.argv[2].lower() == 'include-all'
+global_param_dict = make_global_param_dict(param_file)
+data_dict = make_data_dict(param_file)
+make_table(global_param_dict['GENE_FILE'], data_dict, global_param_dict)
diff --git a/Code/buildRmatrix.py b/Code/buildRmatrix.py
new file mode 100644
index 0000000..b775c77
--- /dev/null
+++ b/Code/buildRmatrix.py
@@ -0,0 +1,234 @@
+# Usage: python buildRmatrix.py parameter_for_buildRmatrix.txt
+# Edit the variable TPM_TABLE for a different output file name.
+# Watch out for NA values in TPM.txt: these genes don't have any gene expression information.
+#
+# Purpose: make a TPM table, where each row is a gene, and each column is an experiment. The column name is RNA-seq experiment ID.
+#
+# 23 Dec 2016, hui, slcu
+# Last modified 5 Apr 2017, hui, slcu
+# Last modified 25 Oct 2019, hui, zjnu [Comments; add a variable WARN_NA to turn on/off print NA warnings.]
+
+import os, sys, glob
+
+#TPM_TABLE = '../Data/history/expr/TPM.txt'
+TPM_TABLE = '../Data/history/expr/TPM.txt'
+WARN_NA = False
+
+####################################
+GLB_PARAM_SYMBOL = '%%'
+LCL_PARAM_SYMBOL = '%'
+DATA_SYMBOL = '@'
+####################################
+
+def common_part(s):
+ ''' s is expected to have this form: AT1G01020.1, remove .1 '''
+ s = s.strip()
+ index = s.find('.')
+ if index < 0: # not found, -1
+ return s
+ return s[0:index]
+
+
+def make_expression_dict(fname, myid):
+ '''
+ fname -- salmon file
+ myid -- RNA-seq experiment ID
+
+    The returned value is a dictionary which looks like
+
+ {
+ 'ID': RNA-seq experiment ID
+ 'isoform':
+ {
+ 'AT1G12345': [],
+ 'AT2G12345': [],
+ ...
+ }
+ }
+
+    Each gene ID (e.g., AT1G12345) may have several isoforms, each with its own expression level.
+ '''
+
+ ID_COL = 0 # Salmon's quant.sf file, first column is gene ID
+ TPM_COL = 3 # Salmon's quant.sf file, fourth column is TPM
+
+ if not os.path.exists(fname):
+        print('ERROR [buildRmatrix.py]: file %s does not exist.' % (fname))
+ sys.exit()
+
+ d = {'ID':myid, 'isoform':{}}
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines[1:]: # ignore head line, Name Length EffectiveLength TPM NumReads
+ line = line.strip()
+ lst = line.split()
+ gene_id = lst[ID_COL]
+ tpm = float(lst[TPM_COL])
+ common = common_part(gene_id) # gene id without .1, .2, etc.
+ if not common in d['isoform']:
+ d['isoform'][common] = [tpm]
+ else:
+ d['isoform'][common].append(tpm)
+
+ return d
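+# Example quant.sf content (Salmon output; numbers hypothetical):
+#   Name         Length  EffectiveLength  TPM   NumReads
+#   AT1G01010.1  1688    1490.0           3.21  120.0
+#   AT1G01010.2  1593    1395.0           0.45  15.0
+# make_expression_dict collapses both isoforms under the key 'AT1G01010', and
+# get_max_expressed_isoform later picks the larger TPM (3.21 here).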
+
+
+def get_max_expressed_isoform(g, d):
+ if not g in d['isoform']:
+ return -9
+ lst = d['isoform'][g]
+ return max(lst)
+
+
+def save_TPM_table(gene_lst, dict_lst, fname):
+    '''
+    gene_lst: a list of genes
+    dict_lst: a list of dictionaries, one per RNA-seq experiment, each with the structure returned by make_expression_dict()
+    fname: where the gene expression level matrix will be saved
+    '''
+
+ dir_name = os.path.dirname(fname)
+ if not os.path.isdir(dir_name):
+ os.makedirs(dir_name)
+
+ if len(dict_lst) == 0:
+ print('buildRmatrix.py: dict_lst is empty. Nothing to build.')
+ sys.exit()
+
+ f = open(fname, 'w')
+ head = 'gene_id'
+ #print('Merge %d tables.' % (len(dict_lst)))
+ for d in dict_lst:
+        head += '\t' + d['ID'] # d['ID'] is the RNA-seq sample's SRA ID
+ f.write('%s\n' % (head))
+ total_count = 0 # number of total gene expression levels
+ bad_count = 0 # number of NA gene expression levels. We wish this number to be far smaller than total_count.
+
+ missed_genes = {}
+ for g in gene_lst:
+ s = g
+ for d in dict_lst:
+ v = get_max_expressed_isoform(g, d)
+ total_count += 1
+ if v != -9:
+ s += '\t' + '%4.2f' % (v)
+ else:
+ if WARN_NA:
+ print('WARNING [buildRmatrix.py]: %s not in %s.' % (g, d['ID']))
+ s += '\t' + 'NA'
+ bad_count += 1
+ missed_genes[g] = 1
+ f.write('%s\n' % (s))
+ f.close()
+
+ if 1.0 * bad_count / total_count > 0.0:
+ print('WARNING [buildRmatrix.py]: %s contains NA values!\n%d out of %d gene expression levels (%4.1f percent) are NAs.\n%d gene IDs are in your gene list but not in the results output by Salmon.' % (fname, bad_count, total_count, 100.0* bad_count/total_count, len(missed_genes)))
+
+
+def get_dict_list(d):
+ ''' A list of dictionaries, each element for one RNA-seq data '''
+ dlst = []
+ for myid in d['ID_LIST']:
+ if myid in d:
+ fname = d[myid]['LOCATION']
+ d2 = make_expression_dict(fname, myid)
+ dlst.append(d2)
+ return dlst
+
+
+def get_gene_list(fname):
+ f = open(fname)
+ lst = []
+ for line in f:
+ line = line.strip()
+ if line != '':
+ l = line.split()[0]
+ lst.append(l)
+ f.close()
+ return lst
+
+
+def get_key_value(s):
+ lst = s.split('=')
+ k, v = lst[0], lst[1]
+ return (k, v)
+
+
+def get_value(s, delimit):
+ index = s.find(delimit)
+ if index < 0:
+ sys.exit()
+ return s[index+1:].strip()
+
+
+def make_data_dict(fname):
+ '''
+ fname - parameter_for_buildRmatrix.txt
+
+ Return a dictionary which looks like
+
+ {
+ 'ID_LIST': [],
+ 'SRR1':
+ {
+ 'LOCATION': path to the salmon quant file, e.g., /home/lanhui/brain/Data/R/Mapped/public/SRR953400_quant.txt
+ }
+ }
+
+ '''
+ d = {'ID_LIST':[]} # ID_LIST is a list of RNA-seq experiment IDs
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#'):
+ continue
+ if line.startswith(DATA_SYMBOL):
+ s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ s = s.strip()
+ if s in d:
+            print('ERROR [buildRmatrix.py]: ID %s is duplicated.' % (s))
+ sys.exit()
+ d[s] = {'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ d['ID_LIST'].append(s)
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+        elif line.startswith('DATA_NAME:'):
+            d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+ elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
+ make_local_parameter(d[s]['PARAM'], line)
+
+ return d
+
+
+def make_global_param_dict(fname):
+ f = open(fname)
+ d = {'GENE_LIST':''} # change
+ for line in f:
+ line = line.strip()
+ if line.startswith(GLB_PARAM_SYMBOL):
+ s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
+ lst = s.split('\t') # separate items by TAB
+ for x in lst:
+ if x != '':
+ k, v = get_key_value(x)
+ d[k] = v
+ f.close()
+ return d
+
+## main
+param_file = sys.argv[1]
+global_param_dict = make_global_param_dict(param_file)
+data_dict = make_data_dict(param_file)
+TPM_TABLE = os.path.abspath(TPM_TABLE)
+save_TPM_table(get_gene_list(global_param_dict['GENE_LIST']), get_dict_list(data_dict), TPM_TABLE)
+#print('Done. Check %s.' % (TPM_TABLE))
diff --git a/Code/common_peak.py b/Code/common_peak.py
new file mode 100644
index 0000000..c36899c
--- /dev/null
+++ b/Code/common_peak.py
@@ -0,0 +1,119 @@
+# Usage: python common_peak.py dir
+# Purpose: return the peaks shared by most (at least 75%) of the narrowPeak files under directory dir
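+# A narrowPeak line has 10 TAB-separated fields (example values hypothetical):
+#   chrom  start  end   name    score  strand  signalValue  pValue  qValue  peak
+#   chr1   3500   3800  peak_1  250    .       5.1          8.2     6.3     150
+# Only the chromosome, interval and score fields are used below; an interval is
+# reported if it intersects peaks in at least 75% of the input files.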
+
+import glob
+import sys, os, operator, bisect
+import numpy as np
+
+def check_valid_peak_line(s):
+ s = s.strip()
+ lst = s.split()
+ if lst[0].isdigit() or lst[0].lower().startswith('chr'):
+ return True
+ return False
+
+def make_chip_data(fname):
+    ''' Given a narrowPeak file, return a dictionary keyed by chromosome name; each value is a list of (start, end, score) tuples, sorted by position. '''
+
+ d = {}
+ if not os.path.exists(fname):
+ print('Cannot find file %s' % (fname))
+ sys.exit()
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ if not check_valid_peak_line(lines[0]):
+ return d
+
+ strength_lst = []
+ for line in lines:
+ line = line.strip()
+ lst = line.split()
+ strength = lst[4]
+ strength_lst.append(int(strength))
+ strength_lst = np.array(strength_lst)
+ tau = np.percentile(strength_lst, 25) # only include strong peaks
+
+ for line in lines:
+ line = line.strip()
+ lst = line.split()
+ c = lst[0]
+
+ ss = int(lst[1])
+ ee = int(lst[2])
+ strength = lst[4]
+ if int(strength) >= tau:
+ if not c in d:
+ d[c] = [(ss, ee, strength)]
+ else:
+ d[c].append((ss, ee, strength))
+
+ for k in d:
+ d[k] = sorted(d[k], key=operator.itemgetter(0, 1)) # sort by start position, then by end position
+ return d
+
+def make_all(dir):
+ d = {}
+ for fname in glob.glob(os.path.join(dir, '*/*/*.narrowPeak')):
+ d[fname] = make_chip_data(fname)
+ return d
+
+
+def get_interval_intersection(chromosome, start_pos, end_pos, chip_dict):
+
+ if len(chip_dict) == 0 or not chromosome in chip_dict:
+ return False
+
+ lst = chip_dict[chromosome] # get a list of intervals in that chromosome
+ n = len(lst)
+ slst, elst, strength_lst = zip(*lst) # make three sub-lists
+    index1 = max(0, bisect.bisect(elst, start_pos)-2) # widen the search window at the start
+    index2 = min(bisect.bisect(slst, end_pos)+2, n) # widen at the end; the slice bound below is exclusive
+ sublst = lst[index1:index2]
+ #print('\t\t\tDEBUG sublst length: %d (index1 %d, index2 %d)' % (len(sublst), index1, index2))
+ for t in sublst:
+ ss = t[0]
+ ee = t[1]
+ strength = t[2]
+ if start_pos <= ee and end_pos >= ss:
+ return True
+ #print('chromosome=%s start_pos=%d end_pos=%d c=%s ss=%d ee=%d' % (chromosome, start_pos, end_pos, c, ss, ee))
+ return False
+
+def get_frequent_peaks(d):
+
+ s = ''
+ num_files = len(d)
+ for c in ['chr1', 'chr2', 'chr3', 'chr4', 'chr5']:
+ total_lst = []
+ for k in d: # k is file name
+ if c in d[k]:
+ total_lst.extend(d[k][c])
+ total_lst = sorted(total_lst, key=operator.itemgetter(0, 1))
+ for t in total_lst: # for each interval
+ count = 0
+ for k in d:
+ chromosome = c
+ start_pos = t[0]
+ end_pos = t[1]
+ chip_dict = d[k]
+ result = get_interval_intersection(chromosome, start_pos, end_pos, chip_dict)
+ if result == True:
+ count += 1
+
+ if count >= int(num_files * 0.75):
+ line = '%s\n' % ('\t'.join([c, str(start_pos), str(end_pos), str(count), t[2], '.', '0', '0', '0', '0' ]))
+ s += line
+
+ return s
+
+
+## main
+d = make_all(sys.argv[1])
+fname = sys.argv[1].split('/')[-1] + '.merged.narrowPeak'
+#print(fname)
+s = get_frequent_peaks(d)
+f = open(fname, 'w')
+f.write(s)
+f.close()
diff --git a/Code/configure.py b/Code/configure.py
new file mode 100644
index 0000000..c740e98
--- /dev/null
+++ b/Code/configure.py
@@ -0,0 +1,56 @@
+# From get_TPM_by_salmon.py
+SALMON = '/home/lanhui/brain/Salmon/Salmon-0.7.2_linux_x86_64/bin/salmon' # salmon software path
+SALMON_INDEX = '/home/lanhui/brain/Salmon/salmon_index'
+TRANSCRIPTOME = '/home/lanhui/brain/Salmon/Arabidopsis_thaliana.TAIR10.cdna.all.fa'
+SALMON_MAP_RESULT_DIR = '../Data/temp/salmon_map_result'
+KMER = 31
+
+# From download_and_map.py
+DAILY_MAP_NUMBER = 5 # download this many samples each time. I have tested the values of 3, 4, 5, 8.
+MIN_FASTQ_FILE_SIZE = 200000000 # in bytes, approximately 200MB
+RNA_SEQ_INFO_FILE = '../Data/information/rnaseq_info_database.json' # some data downloaded from ENA are not RNA-seq (they are ChIP-seq). Use this file to tell whether the file is RNA-seq
+DOWNLOADED_SRA_ID_LOG_FILE = '../Data/log/download_log.txt' # a list of downloaded SRA IDs
+IGNORED_SRA_ID_LOG_FILE = '../Data/log/download_log_small_sized_ids.txt' # store SRA IDs with small file size.
+MAPPED_RDATA_DIR = '../Data/R/Mapped/public' # mapped RNA-seq (file names ended with _quant.txt) go here
+RAW_RDATA_DIR = '../Data/R/Raw' # downloaded files go here
+
+
+# From update_network.py
+# Don't change the following paths and names
+HISTORY_DIR = '../Data/history/edges/many_targets' # each edge file contains edges for many targets
+HISTORY_DIR2 = '../Data/history/edges/one_target' # edges.txt.* files are here, all edge files have the name edges.txt.*, the leading string 'edges.txt' must be present.
+FILE_TIMESTAMP = '../Data/log/file_timestamp.txt' # record last modified time of several important files
+SAMPLE_SIZE_FILE = '../Data/log/total.samples.txt' # each line contains a date and the number of samples on and after that date
+TEMP_DIR = '../Data/temp'
+
+PARAMETER_FOR_BUILDCMATRIX = '../Data/parameter/parameter_for_buildCmatrix.txt'
+PARAMETER_FOR_BUILDRMATRIX = '../Data/parameter/parameter_for_buildRmatrix.txt'
+PARAMETER_FOR_NET = '../Data/parameter/parameter_for_net.txt'
+PARAMETER_FOR_NET_TRAVADB_STRESS = '../Data/parameter/parameter_for_net_travadb_stress.txt'
+PARAMETER_FOR_NET_TRAVADB_MAP = '../Data/parameter/parameter_for_net_travadb_map.txt'
+PARAMETER_FOR_NET_MILD_DROUGHT = '../Data/parameter/parameter_for_net_mild_drought.txt'
+PARAMETER_FOR_NET_WIGGELAB_DIURNAL = '../Data/parameter/parameter_for_net_wiggelab_diurnal.txt'
+
+BINDING_FILE = '../Data/history/bind/binding.txt'
+TPM_FILE = '../Data/history/expr/TPM.txt' # gene expression data
+
+PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL = 1 # interval in days between checks for updating TPM.txt
+MIN_RNA_SEQ_INCREASE = 2 # minimum RNA-seq experiments needed when updating parameter_for_buildRmatrix.txt
+UPDATE_NETWORK_LOG_FILE = '../Data/log/update.network.log.txt' # network update log. We should check this file from time to time.
+NEW_OR_UPDATED_CHIP_FILE = '../Data/log/new.or.updated.chip.file.txt'
+
+RNA_SEQ_INFO_DATABASE = '../Data/information/rnaseq_info_database.txt'
+RNA_SEQ_INFO_DATABASE_JSON = '../Data/information/rnaseq_info_database.json' # same file as RNA_SEQ_INFO_FILE above
+
+GENE_ID_FIRST_TWO_LETTERS = 'AT'
+MEMORY_STRENGTH = 365 # strength of memory, larger value means better memory
+
+#
+MAPPED_CDATA_DIR = '../Data/C/Mapped' # mapped ChIP-seq data
+
+# Used in merge_edges.py
+EDGE_POOL_DIR = '../Data/history/edge_pool'
+MERGED_EDGE_FILE = '../Data/temp/edges.txt'
+
+
+TARGET_TF_FILE = '../Data/information/target_tf.txt'
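+
+# Other scripts are expected to import these constants, e.g. (assumed usage):
+#   from configure import TPM_FILE, BINDING_FILE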
diff --git a/Code/correlation_per_group.R b/Code/correlation_per_group.R
new file mode 100644
index 0000000..65f20c5
--- /dev/null
+++ b/Code/correlation_per_group.R
@@ -0,0 +1,142 @@
+# Last modified on 9 Aug 2019 by Hui Lan
+
+DATA.FILE <- '../Data/history/expr/TPM.txt'
+TARGET.TF.FILE <- '../Data/information/target_tf.txt'
+AGINAME.FILE <- '../Data/information/AGI-to-gene-names_v2.txt'
+r.tau <- 0.60
+min.cluster <- 3 # min number of clusters
+
+
+# Make sure we have required files
+if (! file.exists(DATA.FILE)) {
+ stop(sprintf('[correlation_per_group.R] Unable to find %s', DATA.FILE))
+}
+
+if (! file.exists(TARGET.TF.FILE)) {
+ stop(sprintf('[correlation_per_group.R] Unable to find %s', TARGET.TF.FILE))
+}
+
+if (! file.exists(AGINAME.FILE)) {
+ stop(sprintf('[correlation_per_group.R] Unable to find %s', AGINAME.FILE))
+}
+
+
+cat(sprintf('Read %s\n', DATA.FILE))
+X <- read.table(DATA.FILE, header=TRUE, check.names=FALSE)
+all.id <- X$gene_id
+X$gene_id <- NULL # remove column gene_id
+row.names(X) <- all.id # add row names
+all.genes <- rownames(X)
+
+min.sample <- max(50, ceiling(sqrt(dim(X)[2]))) # at least this many samples needed for computing a correlation coefficient
+max.cluster <- min(55, max(min.cluster + 1, ceiling(dim(X)[2]^0.50))) # max number of clusters, depending on total number of samples
+
+
+# Filter genes
+rowsum.tau <- dim(X)[2] # the gene's TPM value is at least 1 on average
+sd.val <- apply(X, 1, sd)
+lambda <- 0.3
+#sd.tau <- lambda * summary(sd.val)[3] + (1-lambda) * summary(sd.val)[5] # genes whose gene expression varies least are to be filtered
+sd.tau <- 1
+index.row <- rowSums(X) > rowsum.tau & sd.val > sd.tau & !is.na(sd.val)
+
+X <- log(X[index.row, ] + 1.0)
+
+# Normalize each row such that its mean is 0 and standard deviation is 1
+normalize <- function(X) {
+ d <- dim(X)
+ num_row <- d[1]
+ num_col <- d[2]
+
+ s <- apply(X, 1, sd)
+ S <- matrix(rep(s, num_col), nrow=num_row)
+ m <- apply(X, 1, mean)
+ M <- matrix(rep(m, num_col), nrow=num_row)
+ X <- (X - M)/S
+}
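+# normalize() standardizes the rows in one vectorized step: apply(X, 1, sd) and
+# apply(X, 1, mean) return one value per row, and S and M replicate these values
+# across columns so that (X - M)/S has row means 0 and row standard deviations 1.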
+
+X2 <- normalize(X)
+
+cat(sprintf('Read %s\n', AGINAME.FILE))
+agi <- read.table(AGINAME.FILE, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+
+cat(sprintf('Read %s\n', TARGET.TF.FILE))
+target.tf <- read.table(TARGET.TF.FILE, header=FALSE, check.names=FALSE, sep='\t')
+total.pair <- dim(target.tf)[1]
+
+cat(sprintf('min.cluster=%d, max.cluster=%d, min.sample=%d, r.tau=%4.2f\n', min.cluster, max.cluster, min.sample, r.tau))
+cat('Hclust ...\n')
+clusters <- hclust(dist(t(X2)), method = 'average')
+cat('Go through pairs..\n')
+output.file <- paste('../Data/history/edges/one_target/edges.txt', 'group', format(Sys.time(), '%b.%d.%Y.%H%M%S'), sep='.')
+f <- file(output.file, 'w')
+
+for (i in 1:total.pair) {
+
+ gene.tf <- as.vector(target.tf[i,2])
+ gene.target <- as.vector(target.tf[i,1])
+ all.in <- gene.tf %in% all.genes & gene.target %in% all.genes
+ if (!all.in) {
+ next
+ }
+ if (!gene.tf %in% rownames(X) || !gene.target %in% rownames(X)) { # make sure both gene.tf and gene.target are in X
+ next
+ }
+
+ # if too few rnaseq samples, or correlation on all rnaseq samples is good, don't look for group correlation
+ x <- as.vector(t(X[gene.tf, ]))
+ y <- as.vector(t(X[gene.target, ]))
+ index <- x < 0.01 | y < 0.01 # don't include data that is too small
+ x.1 <- x[!index]
+ y.1 <- y[!index]
+ if (length(x.1) < min.sample) {
+ next
+ } else if (cor(x.1, y.1) >= r.tau) {
+ next
+ }
+
+
+ name1 <- agi$V2[which(agi$V1 == gene.tf)]
+ name2 <- agi$V2[which(agi$V1 == gene.target)]
+
+ # initial values
+ max.r <- 0.0
+ max.n <- 0
+ max.samples <- c()
+
+ # cut tree into different number of clusters
+ for (cn in seq(min.cluster, max.cluster, 2)) { # cn is number of clusters
+ cut <- cutree(clusters, cn)
+ sample.names <- names(cut)
+ for (c in unique(cut)) { # each cluster
+ sample.index <- (cut == c)
+ x <- as.vector(t(X[gene.tf, sample.index]))
+ y <- as.vector(t(X[gene.target, sample.index]))
+ n <- length(x)
+ if (n > min.sample & sd(x) > 0.1 & sd(y) > 0.1) { # both x and y should vary
+ r <- cor(x, y)
+ } else {
+ r <- 0.0
+ }
+
+ if (n > min.sample & abs(r) > r.tau & n > max.n) {
+ max.r <- r
+ max.n <- n
+ max.samples <- sample.names[sample.index]
+ }
+ }
+ }
+
+ # save results
+ if (max.n > 0) {
+ curr.date <- gsub('-','',Sys.Date())
+ loglik <- '-991.0'
+ sub.cond <- paste(max.samples, collapse=' ')
+ num.sub.cond <- length(max.samples)
+ cond <- as.vector(target.tf[i,3])
+ result <- sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\t%4.2f\t%s\n', gene.target, name2, gene.tf, name1, max.r, 'mix', num.sub.cond, cond, loglik, curr.date, max.r, 'hclust.group')
+ cat(result, file=f, sep='')
+ }
+}
+
+close(f)
\ No newline at end of file
diff --git a/Code/correlation_per_group_fixed_number.R b/Code/correlation_per_group_fixed_number.R
new file mode 100644
index 0000000..3f48220
--- /dev/null
+++ b/Code/correlation_per_group_fixed_number.R
@@ -0,0 +1,217 @@
+# Last modified on 9 Aug 2019
+# Last modified on 11 Aug 2019
+
+# Purpose: divide the samples into a fixed number of groups and compute the
+# correlation within each group. The optimal number of groups is determined
+# using tissue labels, so as to maximize the agreement between the groups and
+# the tissue labels; more specifically, each group should contain as few
+# distinct tissues as possible.
+
+TISSUE.FILE <- '../Data/information/experiment.and.tissue.txt'
+DATA.FILE <- '../Data/history/expr/TPM.txt'
+TARGET.TF.FILE <- '../Data/information/target_tf.txt'
+AGINAME.FILE <- '../Data/information/AGI-to-gene-names_v2.txt'
+r.tau <- 0.50
+min.cluster <- 3 # min number of clusters
+
+
+if (!file.exists(TISSUE.FILE)) {
+    stop(sprintf('The file %s does not exist, so I cannot compute a fixed number of sample groups.', TISSUE.FILE))
+}
+
+if (! file.exists(DATA.FILE)) {
+ stop(sprintf('[correlation_per_group.R] Unable to find %s', DATA.FILE))
+}
+
+if (! file.exists(TARGET.TF.FILE)) {
+ stop(sprintf('[correlation_per_group.R] Unable to find %s', TARGET.TF.FILE))
+}
+
+if (! file.exists(AGINAME.FILE)) {
+ stop(sprintf('[correlation_per_group.R] Unable to find %s', AGINAME.FILE))
+}
+
+
+cat(sprintf('Read %s\n', DATA.FILE))
+X <- read.table(DATA.FILE, header=TRUE, check.names=FALSE)
+all.id <- X$gene_id
+X$gene_id <- NULL # remove column gene_id
+row.names(X) <- all.id # add row names
+all.genes <- rownames(X)
+
+
+min.sample <- max(50, ceiling(sqrt(dim(X)[2]))) # at least this many samples needed for computing a correlation coefficient. r=0.6 on 50 samples has two-tailed p-value 0.000004. http://vassarstats.net/tabs_r.html
+max.cluster <- min(100, max(min.cluster + 1, ceiling(dim(X)[2]^0.50))) # max number of clusters, depending on total number of samples
+
+
+# Filter genes
+rowsum.tau <- dim(X)[2] # the gene's TPM value is at least 1 on average
+sd.val <- apply(X, 1, sd)
+lambda <- 0.3
+#sd.tau <- lambda * summary(sd.val)[3] + (1-lambda) * summary(sd.val)[5] # genes whose gene expression varies least are to be filtered
+sd.tau <- 1
+index.row <- rowSums(X) > rowsum.tau & sd.val > sd.tau & !is.na(sd.val)
+
+X <- log(X[index.row, ] + 1.0)
+
+
+# Normalize each row such that its mean is 0 and standard deviation is 1
+normalize <- function(X) {
+ d <- dim(X)
+ num_row <- d[1]
+ num_col <- d[2]
+
+ s <- apply(X, 1, sd)
+ S <- matrix(rep(s, num_col), nrow=num_row)
+ m <- apply(X, 1, mean)
+ M <- matrix(rep(m, num_col), nrow=num_row)
+ X <- (X - M)/S
+}
+
+
+# Choose the optimal number of clusters such that they have best agreement with tissue labels
+# Added on 28 June 2017, slcu, hui
+get.optimal.number.of.clusters <- function(X, clusters, tissue.matrix, min.cluster, max.cluster) {
+ labels <- as.vector(tissue.matrix$suggested.tissue)
+ labels <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories
+ tissue.label <- c()
+ for (rseqid in colnames(X)) { # X is the gene expression matrix
+ i <- which(as.vector(tissue.matrix$run.id) == rseqid) # tissue.matrix contains tissue information for each RNA-seq ID
+ suggested.tissue.name <- labels[i]
+ tissue.label <- c(tissue.label, suggested.tissue.name)
+ }
+ best.cn <- min.cluster
+ best.mix.rate <- 0 # perfect mix rate is 1.0
+ for (cn in seq(min.cluster, max.cluster, 1)) { # cn is number of clusters
+ cut <- cutree(clusters, cn)
+ mix.sum <- 0
+ mix.count <- 0
+ for (c in unique(cut)) { # each cluster
+ sample.index <- (cut == c)
+ t <- tissue.label[sample.index]
+ mix.sum <- mix.sum + max(as.data.frame(table(t))$Freq)/sum(as.data.frame(table(t))$Freq)
+ mix.count <- mix.count + 1
+ }
+ mix.rate <- log10(length(tissue.label)/mix.count) * (mix.sum/mix.count)^8 # make sure high tissue homogeneity is much preferred. also make sure the cluster is not too small.
+ #cat(sprintf('get.optimal.number.of.clusters: %d\t%4.1f\t%4.2f\t%4.2f\n', cn, length(tissue.label)/mix.count, mix.sum/mix.count, mix.rate))
+ if (mix.rate > best.mix.rate) {
+ best.mix.rate <- mix.rate
+ best.cn <- cn
+ }
+ }
+ result <- list(cn=best.cn, mix.rate=best.mix.rate)
+}
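+# Worked example of the mix.rate score (numbers hypothetical): with 1000
+# samples cut into 10 clusters whose majority tissue covers 90% of each
+# cluster, mix.rate = log10(1000/10) * 0.9^8, i.e., about 2 * 0.43 = 0.86; the
+# 8th power strongly favours tissue-homogeneous clusterings.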
+
+
+cat(sprintf('Read %s\n', AGINAME.FILE))
+agi <- read.table(AGINAME.FILE, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+
+cat(sprintf('Read %s\n', TARGET.TF.FILE))
+target.tf <- read.table(TARGET.TF.FILE, header=FALSE, check.names=FALSE, sep='\t')
+total.pair <- dim(target.tf)[1]
+
+cat(sprintf('min.cluster=%d, max.cluster=%d, min.sample=%d, r.tau=%4.2f\n', min.cluster, max.cluster, min.sample, r.tau))
+cat('Hclust ...\n')
+X2 <- normalize(X) # each row of X2 has mean 0 and standard deviation 1.
+clusters <- hclust(dist(t(X2)), method = 'average')
+cat(sprintf('Determine optimal number of clusters ...\n'))
+tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t')
+cn.result <- get.optimal.number.of.clusters(X, clusters, tissue, min.cluster, max.cluster)
+cat(sprintf('Best number of clusters %d, best mix rate %4.2f..\n', cn.result$cn, cn.result$mix.rate))
+cut <- cutree(clusters, cn.result$cn)
+sample.names <- names(cut)
+
+output.file <- paste('../Data/history/edges/one_target/edges.txt', 'fixed.group', format(Sys.time(), '%b.%d.%Y.%H%M%S'), sep='.')
+f <- file(output.file, 'w')
+cat(sprintf('Go through %d pairs...\n', total.pair))
+
+for (i in 1:total.pair) {
+
+ gene.tf <- as.vector(target.tf[i,2])
+ gene.target <- as.vector(target.tf[i,1])
+ all.in <- gene.tf %in% all.genes & gene.target %in% all.genes
+ if (!all.in) {
+ next
+ }
+ if (!gene.tf %in% rownames(X) || !gene.target %in% rownames(X)) { # make sure both gene.tf and gene.target are in X
+ next
+ }
+
+ # if too few rnaseq samples, or correlation on all rnaseq samples is good, don't look for group correlation
+ x <- as.vector(t(X[gene.tf, ]))
+ y <- as.vector(t(X[gene.target, ]))
+ index <- x < 0.01 | y < 0.01 # don't include data that is too small
+ x.1 <- x[!index]
+ y.1 <- y[!index]
+ if (length(x.1) < min.sample) {
+ next
+ } else if (cor(x.1, y.1) >= r.tau) {
+ next
+ }
+
+ name1 <- agi$V2[which(agi$V1 == gene.tf)]
+ name2 <- agi$V2[which(agi$V1 == gene.target)]
+
+ # initial values
+ max.pos.r <- 0.0
+ max.pos.n <- 0
+ max.pos.samples <- c()
+ max.neg.r <- 0.0
+ max.neg.n <- 0
+ max.neg.samples <- c()
+
+ # cut tree into different number of clusters
+ for (c in unique(cut)) { # each cluster
+ sample.index <- (cut == c)
+ x <- as.vector(t(X[gene.tf, sample.index]))
+ y <- as.vector(t(X[gene.target, sample.index]))
+ n <- length(x)
+ if (n > min.sample & sd(x) > 0.1 & sd(y) > 0.1) { # both x and y should vary
+ r <- cor(x, y)
+ } else {
+ r <- 0.0
+ }
+
+ if (n > min.sample & r > r.tau & n > max.pos.n) {
+ max.pos.r <- r
+ max.pos.n <- n
+ max.pos.samples <- sample.names[sample.index]
+ }
+
+ if (n > min.sample & r < -r.tau & n > max.neg.n) {
+ max.neg.r <- r
+ max.neg.n <- n
+ max.neg.samples <- sample.names[sample.index]
+ }
+
+ }
+
+ # save results
+ curr.date <- gsub('-','',Sys.Date())
+ loglik <- '-991.1'
+ cond = as.vector(target.tf[i,3])
+ result.1 <- ''
+ result.2 <- ''
+ if (max.pos.n > 0) {
+ sub.cond <- paste(max.pos.samples, collapse=' ')
+ num.sub.cond <- length(max.pos.samples)
+ result.1 = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\t%4.2f\t%s\n', gene.target, name2, gene.tf, name1, max.pos.r, 'mix', num.sub.cond, cond, loglik, curr.date, max.pos.r, 'hclust.fixed.group')
+ }
+ if (max.neg.n > 0) {
+ sub.cond <- paste(max.neg.samples, collapse=' ')
+ num.sub.cond <- length(max.neg.samples)
+ result.2 = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\t%4.2f\t%s\n', gene.target, name2, gene.tf, name1, max.neg.r, 'mix', num.sub.cond, cond, loglik, curr.date, max.neg.r, 'hclust.fixed.group')
+ }
+ if (result.1 != '' | result.2 != '') {
+ if (result.1 != '' & result.2 != '') {
+ result <- paste(result.1, result.2, sep='')
+ } else if (result.1 != '') {
+ result <- result.1
+ } else if (result.2 != '') {
+ result <- result.2
+ }
+ cat(result, file=f, sep='')
+ }
+}
+
+close(f)
diff --git a/Code/correlation_per_tissue.R b/Code/correlation_per_tissue.R
new file mode 100644
index 0000000..d9aadf9
--- /dev/null
+++ b/Code/correlation_per_tissue.R
@@ -0,0 +1,101 @@
+# Last modified on 8 Aug 2019
+
+TISSUE.FILE <- '../Data/information/experiment.and.tissue.txt'
+DATA.FILE <- '../Data/history/expr/TPM.txt'
+TEMP.DIR <- '../Data/temp'
+TARGET.FILE <- '../Data/temp/all_targets.txt'
+TF.FILE <- '../Data/temp/all_tfs.txt'
+tau <- 0.60
+
+
+# Make sure we have required files
+if (! file.exists(TISSUE.FILE)) {
+ stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TISSUE.FILE))
+}
+
+if (! file.exists(DATA.FILE)) {
+ stop(sprintf('[correlation_per_tissue.R] Unable to find %s', DATA.FILE))
+}
+
+if (! dir.exists(TEMP.DIR)) {
+ stop(sprintf('[correlation_per_tissue.R] Unable to find directory %s', TEMP.DIR))
+}
+
+if (! file.exists(TARGET.FILE)) {
+ stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TARGET.FILE))
+}
+
+if (! file.exists(TF.FILE)) {
+ stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TF.FILE))
+}
+
+
+X0 <- read.table(DATA.FILE, header=TRUE, check.names=FALSE)
+all.id <- X0$gene_id
+X0$gene_id <- NULL # remove column gene_id
+row.names(X0) <- all.id # add row names
+all_genes <- rownames(X0)
+
+tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t')
+labels <- as.vector(tissue$suggested.tissue)
+labels <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories
+unique.label <- unique(labels)
+
+targets <- read.table(TARGET.FILE, header=FALSE)
+tfs <- read.table(TF.FILE, header=FALSE)
+targets <- as.vector(targets$V1)
+tfs <- as.vector(tfs$V1)
+
+
+##############################################################
+get.index <- function(X, tissue.matrix, tissue.name) {
+ labels <- as.vector(tissue.matrix$suggested.tissue)
+ labels <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories
+ index <- c()
+ count <- 1
+ for (rseqid in colnames(X)) {
+ i <- which(as.vector(tissue.matrix$run.id) == rseqid)
+ if (length(i) != 0) {
+ suggested.tissue.name <- labels[i]
+ if (tissue.name == suggested.tissue.name) {
+ index <- c(index, count)
+ }
+ }
+ count <- count + 1
+ }
+ index
+}
+##############################################################
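+# get.index returns the column positions in X whose RNA-seq run IDs are
+# annotated with the given tissue label, e.g. (label hypothetical):
+#   index.root <- get.index(X0, tissue, 'root')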
+
+
+# for each tissue type, get a correlation matrix
+for (ul in unique.label) {
+ index.rnaseq <- get.index(X0, tissue, ul)
+ if (length(index.rnaseq) >= 50) {
+ OUTPUT.FILE <- paste('../Data/temp/edges.txt.simple.correlation.tissue', ul, 'txt', sep='.')
+
+ X <- as.matrix(X0[, index.rnaseq])
+ sd.1 <- apply(X, 1, sd) # sd of each row
+ s0 <- apply(X, 1, function(c) sum(c==0)) # number of zeros in each row
+    sd.tau <- (quantile(sd.1)[1] + quantile(sd.1)[2]) / 2.0 # SD threshold: midpoint between the minimum and the first quartile
+ good <- sd.1 > max(sd.tau, 0.05)
+ tf_good <- which( good & (all_genes %in% tfs) == T )
+ target_good <- which( good & (all_genes %in% targets) == T )
+
+ # Compute correlation coefficient
+ X <- log(X + 1)
+ X[X<0.01] <- NA
+ if (length(tf_good) > 1) {
+ c <- cor(t(X[target_good,]), t(X[tf_good,]), use='pairwise.complete.obs')
+ } else {
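+ # only one good TF row: duplicate it so cor() still receives a matrix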
+ c <- cor(t(X[target_good,]), t(X[c(tf_good, tf_good), ]), use='pairwise.complete.obs')
+ }
+ index <- !is.na(c) & abs(c) >= tau & abs(c) <= 0.99
+ row_names <- rownames(c)
+ col_names <- colnames(c)
+ result <- data.frame(row = row_names[row(c)[index]], col = col_names[col(c)[index]], r = c[index], tissue=rep(ul, sum(index)), numrnaseqids=rep(length(index.rnaseq), sum(index)))
+
+ # write results
+ write.table(result, OUTPUT.FILE, col.names=F, row.names=F, sep='\t', quote=F)
+ }
+}
diff --git a/Code/count_word.py b/Code/count_word.py
new file mode 100644
index 0000000..cd1d3c6
--- /dev/null
+++ b/Code/count_word.py
@@ -0,0 +1,36 @@
+# Usage: python count_word.py /home/hui/network/v03/Data/information/rnaseq_info_database.txt
+#
+# Purpose: get all words in a file, order them by their frequencies.
+#
+# 20 Apr 2017, slcu, hui
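+#
+# Output: one "word<TAB><TAB>count" line per word, most frequent first.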
+
+from collections import Counter
+import operator
+import string
+import sys
+
+# Default input; overridden by the first command-line argument when given.
+FILE_NAME = '/home/hui/network/v03/Data/information/rnaseq_info_database.txt'
+if len(sys.argv) > 1:
+ FILE_NAME = sys.argv[1]
+
+def remove_punctuation(s):
+ # strip punctuation from each token (Python 3 str.translate signature)
+ table = str.maketrans('', '', string.punctuation)
+ result = ''
+ for x in s.split():
+ result += ' ' + x.translate(table)
+ return result.strip()
+
+f = open(FILE_NAME)
+lines = f.readlines()
+f.close()
+yourtext = ''
+for line in lines[1:]: # don't include header line
+ line = line.strip()
+ lst = line.split('\t')
+ for x in lst[4:]: # only consider fields from 5th column.
+ yourtext += ' ' + remove_punctuation(x.lower())
+
+d = Counter(yourtext.split())
+sd = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
+for t in sd:
+ k = t[0]
+ print('%s\t\t%s' % (k, d[k]))
diff --git a/Code/create_edges.py b/Code/create_edges.py
new file mode 100644
index 0000000..bacf179
--- /dev/null
+++ b/Code/create_edges.py
@@ -0,0 +1,840 @@
+# Usage: python create_edges.py parameter_for_net.txt > edges.txt.20170227_1618
+#
+# 01 DEC 2016, hui
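+#
+# Each printed edge is tab-separated (see print_dict below):
+# target_id name, tf_id name, r, type (all/user/pos/neg/mix),
+# signal_set, chip_ids, message, date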
+
+import sys, os, operator, itertools
+import numpy as np
+import matplotlib.pyplot as plt
+import scipy.stats as stat
+from datetime import datetime
+
+import rpy2.robjects as r
+from rpy2.robjects.packages import importr
+from rpy2.robjects import FloatVector
+
+import warnings
+from geneid2name import make_gene_name_AGI_map_dict, get_gene_name
+from param4net import make_global_param_dict
+
+####### Utility files #############
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
+
+####################################
+GLB_PARAM_SYMBOL = '%%'
+DATA_SYMBOL = '@'
+TOP_N_TF = 50
+MIN_NUM_CONDITION = 20
+SIGNAL_INPUT_RATIO_TAU = 1.5
+REMOVE_HORIZONTAL_STRIP_TAU = 0.05
+REMOVE_VERTICAL_STRIP_TAU = 0.05
+MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN = 30
+####################################
+
+def get_two_components(y, x):
+ K = 2
+ epsilon = 1e-4
+ lam = 0.1
+ iterations = 25
+ random_restarts = 2
+
+ # Remove NaNs or Infs
+ warn_msg = ''
+ sz = len(x)
+ if sz < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: # too few points, ignore.
+ return None, None, 'IGNORE'
+ # print('DEBUG')
+ # print(y)
+ # print(x)
+ # print(type(y))
+ # print(type(x))
+ index = np.isfinite(x) & np.isfinite(y)
+ if sum(index) < sz:
+ warn_msg = np.array_str(x) + ',' + np.array_str(y)
+ if sum(index) < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN:
+ return None, None, 'IGNORE'
+
+ x = x[index]
+ y = y[index]
+
+ # Train the model. NOTE: LinearRegressionsMixture is not imported above; it
+ # is assumed to be provided by a local mixture-of-regressions module.
+ model = LinearRegressionsMixture(np.expand_dims(x, axis=1), np.expand_dims(y, axis=1), K=K)
+ model.train(epsilon=epsilon, lam=lam, iterations=iterations, random_restarts=random_restarts, verbose=False)
+ idx1 = (model.gamma[:,0] > model.gamma[:,1]) # model.gamma is a vector of posterior probabilities
+ idx2 = (model.gamma[:,1] > model.gamma[:,0])
+ return idx1, idx2, warn_msg
+
+
+def get_three_components(y, x, cond_lst):
+ K = 3
+ epsilon = 1e-4
+ lam = 0.1
+ iterations = 50
+ random_restarts = 5
+
+ # Remove NaNs or Infs
+ warn_msg = ''
+ sz = len(x)
+ if sz < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: # too few points, ignore.
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+
+ index = np.isfinite(x) & np.isfinite(y)
+ if sum(index) < sz:
+ warn_msg = 'HAS_NAN_OR_INFINITE'
+ if sum(index) < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN:
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+
+ xx = np.array(x[index])
+ yy = np.array(y[index])
+ cond_lst2 = np.array(cond_lst)
+ cond_lst2 = cond_lst2[index]
+
+
+ # Train the model
+ model = LinearRegressionsMixture(np.expand_dims(xx, axis=1), np.expand_dims(yy, axis=1), K=K)
+ model.train(epsilon=epsilon, lam=lam, iterations=iterations, random_restarts=random_restarts, verbose=False)
+ idx1 = np.array(model.gamma[:,0] > model.gamma[:,1]) & np.array(model.gamma[:,0] > model.gamma[:,2]) # model.gamma is a vector of posterior probabilities
+ idx2 = np.array(model.gamma[:,1] > model.gamma[:,0]) & np.array(model.gamma[:,1] > model.gamma[:,2])
+ idx3 = np.array(model.gamma[:,2] > model.gamma[:,0]) & np.array(model.gamma[:,2] > model.gamma[:,1])
+ return xx[idx1], yy[idx1], xx[idx2], yy[idx2], xx[idx3], yy[idx3], list(cond_lst2[idx1]), list(cond_lst2[idx2]), list(cond_lst2[idx3]), warn_msg
+
+
+def get_three_components_and_evaluate(y, x, cond_lst):
+ K = 3
+ epsilon = 1e-4
+ lam = 0.1
+ iterations = 50
+ random_restarts = 5
+
+ # Remove NaNs or Infs
+ warn_msg = ''
+ sz = len(x)
+ if sz < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: # too few points, ignore.
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+
+ index = np.isfinite(x) & np.isfinite(y)
+ if sum(index) < sz:
+ warn_msg = 'HAS_NAN_OR_INFINITE'
+ if sum(index) < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN:
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+
+ xx = np.array(x[index])
+ yy = np.array(y[index])
+ cond_lst2 = np.array(cond_lst)
+ cond_lst2 = cond_lst2[index]
+
+ # Train the model
+ model = LinearRegressionsMixture(np.expand_dims(xx, axis=1), np.expand_dims(yy, axis=1), K=K)
+ model.train(epsilon=epsilon, lam=lam, iterations=iterations, random_restarts=random_restarts, verbose=False)
+ idx1 = np.array(model.gamma[:,0] > model.gamma[:,1]) & np.array(model.gamma[:,0] > model.gamma[:,2]) # model.gamma is a vector of posterior probabilities
+ idx2 = np.array(model.gamma[:,1] > model.gamma[:,0]) & np.array(model.gamma[:,1] > model.gamma[:,2])
+ idx3 = np.array(model.gamma[:,2] > model.gamma[:,0]) & np.array(model.gamma[:,2] > model.gamma[:,1])
+ rmse_avg, rmse_std = model.cross_validate(k_fold=10, verbose=False, silent=True)
+ warn_msg = 'rmse_avg=%4.2f,rmse_sd=%4.2f' % (rmse_avg, rmse_std)
+ return xx[idx1], yy[idx1], xx[idx2], yy[idx2], xx[idx3], yy[idx3], list(cond_lst2[idx1]), list(cond_lst2[idx2]), list(cond_lst2[idx3]), warn_msg
+
+
+def get_three_components_mixtools(y, x, cond_lst):
+
+ # Remove NaNs or Infs
+ warn_msg = ''
+ sz = len(x)
+ if sz < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: # too few points, ignore.
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+
+ index = np.isfinite(x) & np.isfinite(y)
+ if sum(index) < sz:
+ warn_msg = 'HAS_NAN_OR_INFINITE'
+ if sum(index) < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN:
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+
+ xx = np.array(x[index])
+ yy = np.array(y[index])
+ cond_lst2 = np.array(cond_lst)
+ cond_lst2 = cond_lst2[index]
+
+ # Train the model
+ mixtools = importr('mixtools')
+ try:
+ result = mixtools.regmixEM(FloatVector(yy), FloatVector(xx), epsilon = 1e-04, k=3, maxit=100)
+ except:
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+ posterior = result[result.names.index('posterior')]
+ posterior = np.array(posterior)
+ l = np.argmax(posterior, axis=1) # class information
+ idx1 = l == 0
+ idx2 = l == 1
+ idx3 = l == 2
+ warn_msg = 'loglik=%4.2f' % (np.array(result[result.names.index('loglik')])[0])
+
+ return xx[idx1], yy[idx1], xx[idx2], yy[idx2], xx[idx3], yy[idx3], list(cond_lst2[idx1]), list(cond_lst2[idx2]), list(cond_lst2[idx3]), warn_msg
+
+
+def read_matrix_data(fname):
+ '''
+ fname - a file, first line is head, first column is row name.
+ '''
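+ # Expected layout (whitespace separated; values illustrative):
+ # gene_id R0001 R0002
+ # AT1G01010 1.23 0.00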
+
+ lineno = 0
+ colid = []
+ rowid = []
+ d = {} # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
+ d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
+ d3 = {} # {gene1: [], gene2: [], ...}
+ d4 = {} # {cond1:[], cond2:[], ...}
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+
+ head_line = lines[0].strip()
+ lst = head_line.split()
+ colid = lst[1:]
+
+ for c in colid:
+ d2[c] = {}
+ d4[c] = []
+
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split()
+ g = lst[0]
+ rowid.append(g)
+ d[g] = {}
+ levels = lst[1:]
+ if len(levels) != len(colid):
+ print('Incomplete columns at row %s' % (g))
+ sys.exit()
+
+ d3[g] = []
+ for i in range(len(colid)):
+ c = colid[i]
+ d[g][c] = float(levels[i])
+ d2[c][g] = float(levels[i])
+ d3[g].append(float(levels[i]))
+ d4[c].append(float(levels[i]))
+ lineno += 1
+
+ d_return = {}
+ d_return['xy'] = d # first gene, then condition
+ d_return['yx'] = d2 # first condition, then gene
+ d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row
+ d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column
+ d_return['nrow'] = lineno - 1
+ d_return['ncol'] = len(colid)
+ d_return['rowid'] = rowid
+ d_return['colid'] = colid
+
+ d4_sorted = {}
+ for k in d4:
+ d4_sorted[k] = sorted(d4[k], reverse=True)
+ d_return['yy_sorted'] = d4_sorted
+
+ return d_return
+
+
+def get_value(s, delimit):
+ lst = s.split(delimit)
+ return lst[1].strip()
+
+def read_info_data(fname):
+ ''' Read chip-seq data information '''
+
+ if not os.path.exists(fname):
+ print('%s does not exist.' % (fname))
+ sys.exit()
+
+ d = {'ID_LIST':[]}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#') or line.startswith('%'):
+ continue
+ if line.startswith(DATA_SYMBOL):
+ s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ s = s.strip()
+ if s in d:
+ print('ID %s duplicate' % (s))
+ sys.exit()
+ d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ d['ID_LIST'].append(s)
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+ elif line.startswith('PROTEN_NAME:'):
+ d[s]['PROTEN_NAME'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_ID:'):
+ d[s]['PROTEIN_ID'] = get_value(line, ':')
+ elif line.startswith('DATA_NAME:'):
+ d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+
+ return d
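+
+# An entry in the info file parsed above looks like (IDs illustrative):
+# @C0001
+# PROTEIN_ID: AT1G65480
+# DATA_FORMAT: narrowPeak
+# DESCRIPTION: example ChIP experiment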
+
+
+def get_gene_list(fname):
+ result = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ result.append(lst[0])
+ f.close()
+ return result
+
+
+def get_related_condition(s, info_dict):
+ lst = s.split(';')
+ result = [] # a list of sample IDs
+ result = info_dict['ID_LIST'] # TBD
+ return result
+
+
+def update_global_param_dict(glb_param_dict, info_dict):
+ if glb_param_dict['RESEARCH_KEYWORDS'] == '':
+ glb_param_dict['USER_CONDITION_LIST'] = info_dict['ID_LIST']
+ glb_param_dict['USER_CONDITION_LIST'] = get_related_condition(glb_param_dict['RESEARCH_KEYWORDS'], info_dict)
+
+
+def get_threshold(lst):
+ x = np.array(lst)
+ x = x[x > 0]
+ return np.median(x)
+
+
+def get_threshold2(lst, glb_param_dict):
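+ # Assumes lst is sorted in descending order (callers pass
+ # bind_dict['yy_sorted'][c]); after dropping non-positives, x[-1] is the
+ # smallest positive signal and x[keep + max_num] the admission threshold.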
+ x = np.array(lst)
+ x = x[x > 0]
+ max_num = int(glb_param_dict['MAX_NUM_TARGETS'])
+ percent = float(glb_param_dict['OVERFLOW_TARGETS_PERCENTAGE'])
+ n = len(x)
+ if n < max_num:
+ return x[-1]
+ else: # include some overflowing targets, but not all
+ overflow = n - max_num
+ keep = int(overflow * percent)
+ index = keep + max_num
+ return x[index]
+
+def get_tf(g, bind_dict, info_dict, input_dict, glb_param_dict):
+ tf_dict = {}
+ d = bind_dict['xy']
+ input_d = input_dict['xy']
+ input_cond = input_dict['colid'][0] # use the first column as input (improve)
+ if g in d.keys():
+ for c in bind_dict['colid']:
+ bind_val = d[g][c]
+ if info_dict[c]['DATA_FORMAT'].upper() == 'BW':
+ input_val = input_d[g][input_cond]
+ if g == 'AT1G65480': # FT, target is FT
+ #print('DEBUG target:%s protein=%s bv=%g, iv=%g, ratio=%g' % (g, info_dict[c]['PROTEIN_ID'], bind_val, input_val, bind_val/input_val))
+ pass
+ if input_val > 0 and input_val < 10000 and (bind_val / input_val) > SIGNAL_INPUT_RATIO_TAU: # input_val should also be not too large
+ g2 = info_dict[c]['PROTEIN_ID']
+ if g2 != '':
+ if not g2 in tf_dict:
+ tf_dict[g2] = [c]
+ else:
+ tf_dict[g2].append(c)
+ elif info_dict[c]['DATA_FORMAT'].upper() == 'NARROWPEAK':
+ #tau = bind_dict['yy_sorted'][c][TOP_N_TF]
+ tau = get_threshold2(bind_dict['yy_sorted'][c], glb_param_dict)
+ #print('DEBUG target=%s %s %g >= %g' % (g, info_dict[c]['PROTEIN_ID'], bind_val, tau))
+ if bind_val >= tau: # change later
+ g2 = info_dict[c]['PROTEIN_ID']
+ if g2 != '':
+ if not g2 in tf_dict:
+ tf_dict[g2] = [c]
+ else:
+ tf_dict[g2].append(c)
+
+ return tf_dict
+
+
+
+def get_gene_expression(gene_id, cond_lst, expr_dict, takelog=False):
+
+ num_cond = len(cond_lst)
+ elst = [None]*num_cond
+ d = expr_dict['xy']
+ for i in range(num_cond):
+ c = cond_lst[i]
+ x = d[gene_id][c]
+ if takelog == True:
+ elst[i] = np.log(x+1)
+ else:
+ elst[i] = x
+ return np.array( elst )
+
+
+def float_equal(x, y):
+ return np.abs(x-y) < 0.001
+
+
+def get_gene_expression2(gene_id1, gene_id2, cond_lst, expr_dict, takelog=False):
+ ''' Get gene expression for two genes. Conditions in which both genes have zero TPM values are ignored. '''
+ num_cond = len(cond_lst)
+ elst1 = [None]*num_cond
+ elst2 = [None]*num_cond
+ clst = [None]*num_cond
+ d = expr_dict['xy']
+ j = 0
+ for i in range(num_cond):
+ c = cond_lst[i]
+ x = expr_dict['xy'][gene_id1][c]
+ y = expr_dict['xy'][gene_id2][c]
+ #print('DEBUG %s %s %g %g c=%s' % (gene_id1, gene_id2, x, y, c))
+ #print('DEBUG at2g07745 at R0000SRR1802166XX %g' % (expr_dict['xy']['AT2G07754']['R0000SRR1802166XX']))
+ if not float_equal(x,0.0) or not float_equal(y,0.0): # at least one is not zero
+ if takelog == True: # increase gene expression uniformly by 1 for taking logarithm
+ elst1[j] = np.log(x+1)
+ elst2[j] = np.log(y+1)
+ else:
+ elst1[j] = x
+ elst2[j] = y
+ clst[j] = c
+ j += 1
+ return ( np.array(elst1[0:j]), np.array(elst2[0:j]), clst[0:j] )
+
+
+
+def get_gene_expression3(gene_id1, gene_id2, cond_lst, expr_dict, takelog=False):
+ '''
+ Get gene expression for two genes. Conditions in which both genes have zero TPM values are ignored.
+ In addition, vertical and horizontal strips (as seen in the scatter plot) are removed.
+ '''
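+ # A "strip" is a band of points hugging one axis of the TF-target scatter
+ # plot (one gene near zero while the other varies); samples whose log value
+ # falls below the *_STRIP_TAU thresholds above are excluded.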
+ num_cond = len(cond_lst)
+ elst1 = [None]*num_cond
+ elst2 = [None]*num_cond
+ mark_cond = [True]*num_cond # indicate if a condition should be included
+ clst = []
+ d = expr_dict['xy']
+
+ for i in range(num_cond):
+ c = cond_lst[i]
+ x = expr_dict['xy'][gene_id1][c]
+ y = expr_dict['xy'][gene_id2][c]
+ if not float_equal(x,0.0) or not float_equal(y,0.0): # at least one is not zero
+ if takelog == True: # increase gene expression uniformly by 1 for taking logarithm
+ elst1[i] = np.log(x+1)
+ elst2[i] = np.log(y+1)
+ else:
+ elst1[i] = x
+ elst2[i] = y
+ if elst1[i] < REMOVE_VERTICAL_STRIP_TAU or elst2[i] < REMOVE_HORIZONTAL_STRIP_TAU: # don't include this condition if its values are in the strip
+ mark_cond[i] = False
+ else:
+ clst.append(c)
+ else:
+ mark_cond[i] = False
+
+ a = np.array(elst1)
+ a = a[mark_cond]
+ b = np.array(elst2)
+ b = b[mark_cond]
+ return (a.astype(np.float64), b.astype(np.float64), clst)
+
+
+#################### select condition stuff ###############
+
+def get_yhat(slope, intercept, x):
+ yhat = intercept + slope * x
+ return yhat
+
+def select_points_theil(x, y, max_diff):
+ theil_result = stat.mstats.theilslopes(y, x) # theilslopes expects (y, x); fit y ~ x
+ slope = theil_result[0]
+ intercept = theil_result[1]
+ yhat = get_yhat(slope, intercept, x)
+ d = y - yhat
+ d_abs = np.abs(d)
+ index = d_abs < max_diff
+ return (x[index], y[index], index)
+
+def common_elements(list1, list2):
+ return sorted(list(set(list1).intersection(list2)))
+
+def two_points(p1, p2):
+ '''Return slope and intercept '''
+ x1, y1 = p1
+ x2, y2 = p2
+ m = (y2 - y1) / (x2 - x1)
+ b = y1 - m * x1
+ return (m, b)
+
+def select_points_diagonal(x, y, max_diff, direction):
+ ''' Fit a line through the two extreme points along the given direction ('pos' or 'neg'), then keep points within max_diff of that line. '''
+
+ n = len(x)
+ if n < 3:
+ return (x, y)
+
+ if direction.lower() == 'pos':
+ index_x = np.argsort(x)
+ elif direction.lower() == 'neg':
+ index_x = np.argsort(-1*x)
+ else:
+ print('%s must be pos or neg' % (direction))
+ sys.exit()
+
+ index_y = np.argsort(y)
+
+ # get lower (or upper) end point
+ idx = None
+ for i in range(2,n+1): # get common index
+ s1 = index_x[0:i]
+ s2 = index_y[0:i]
+ s = common_elements(s1, s2)
+ if s != []:
+ idx = s[0]
+ break
+ p1 = (x[idx], y[idx])
+
+ # get upper (or lower) end point
+ index_x = list(reversed(index_x)) # reverse list
+ index_y = list(reversed(index_y))
+
+ idx = None
+ for i in range(2,n+1): # get common index
+ s1 = index_x[0:i]
+ s2 = index_y[0:i]
+ s = common_elements(s1, s2)
+ if s != []:
+ idx = s[0]
+ break
+ p2 = (x[idx], y[idx])
+
+ slope, intercept = two_points(p1, p2)
+
+ yhat = get_yhat(slope, intercept, x)
+ d = y - yhat
+ d_abs = np.abs(d)
+ try:
+ index = d_abs < max_diff
+ except RuntimeWarning:
+ print(d_abs)
+ print(max_diff)
+ sys.exit()
+
+ return (x[index], y[index], index)
+
+
+def correlation_is_significant(r, p, glb_param_dict):
+ return (not np.isnan(r)) and np.abs(r) > float(glb_param_dict['TWO_WAY_CORRELATION_CUTOFF']) and p < float(glb_param_dict['TWO_WAY_CORRELATION_PVALUE_CUTOFF'])
+
+
+def subset_list(lst, bool_index):
+ if len(lst) != len(bool_index):
+ print('subset_list: list sizes not equal (%d %d)' % (len(lst), len(bool_index)))
+ sys.exit()
+
+ n = len(lst)
+ result = []
+ for i in range(n):
+ if bool_index[i] == True:
+ result.append(lst[i])
+ return result
+
+
+def print_dict(d, glb_param_dict):
+ agi2name_dict = glb_param_dict['name_conversion_dict']
+ curr_date = datetime.now().strftime('%Y%m%d') # add date to the end of each line, for future reference or filtering
+ if d['type'] == 'two-way':
+ gene_id = d['target']
+ head = '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict))
+ gene_id = d['TF']
+ head += '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict))
+ d2 = d['significant']
+ if 'all' in d2:
+ s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['all']['score'], 'all', '.', ' '.join(d2['all']['chip_id']), '.')
+ s += '\t' + curr_date
+ print(head + s)
+ sys.stdout.flush() # flush stdout so we can see the results immediately
+ if 'user' in d2:
+ s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['user']['score'], 'user', ' '.join(d2['user']['signal_set']), ' '.join(d2['user']['chip_id']), '.')
+ print(head + s)
+ if 'pos' in d2:
+ s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['pos']['score'], 'pos', ' '.join(d2['pos']['signal_set']), ' '.join(d2['pos']['chip_id']), '.')
+ print(head + s)
+ if 'neg' in d2:
+ s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['neg']['score'], 'neg', ' '.join(d2['neg']['signal_set']), ' '.join(d2['neg']['chip_id']), '.')
+ print(head + s)
+ if 'mix' in d2:
+ n = len(d2['mix']['score'])
+ for i in range(n):
+ s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['mix']['score'][i], 'mix', ' '.join(d2['mix']['signal_set'][i]), ' '.join(d2['mix']['chip_id']), d2['mix']['message'][i])
+ s += '\t' + curr_date
+ print(head + s)
+ sys.stdout.flush() # flush stdout so we can see the results immediately
+
+
+def two_way(target, tf_dict, expr_dict, expr_info_dict, glb_param_dict):
+ '''
+
+ Check whether the target has a relationship with each of the TFs.
+
+ tf_dict: a dictionary of TFs, {tf_name: ChIP_ID_LIST}
+
+ Return a list of dictionaries. Each dictionary has the following format:
+
+ 'type' :'two-way'
+ 'target':''
+ 'TF' :''
+ 'significant': {
+ 'all': {'signal_set':[], score:.2, chip_id:''}
+ 'pos_direction':{'signal_set':[], score:.0, chip_id:''}
+ 'neg_direction':{'signal_set':[], score:.1, chip_id:''}
+ 'user_defined': {'signal_set':[], score:.3, chip_id:''}
+ 'mix': {'signal_set':[], score:[.7,-.5], chip_id:''}
+ }
+
+ '''
+
+ result_dict_lst = [] # a list of dictionaries, one for each TF, each dict contains info for a Target-TF pair
+
+ target_gene_id = target
+ all_cond_lst = expr_dict['colid'] # Use all RNA-seq samples. TBD, can be glb_param_dict['USER_CONDITION_LIST']
+ logrithmize = glb_param_dict['LOGRITHMIZE'].upper() == 'YES' # whether to take the logarithm of TPM values
+ #target_elst = get_gene_expression(target_gene_id, all_cond_lst, expr_dict, takelog=logrithmize) # a list of gene expression levels
+
+ if not target_gene_id in expr_dict['rowid']: # target gene not in expression table, cannot do anything
+ return result_dict_lst
+
+ for tf_gene_id in sorted(tf_dict.keys()): # iterate over TF gene ids
+
+ chip_id = tf_dict[tf_gene_id] # a list of chip experiment IDs, e.g., C00000000000
+
+ if not tf_gene_id in expr_dict['rowid']: # tf gene not in expression table, cannot do anything
+ continue
+
+ # get gene expression profiles for target and TF. If both target and TF are 0 in an RNA-seq sample, that sample is ignored.
+ target_elst, tf_elst, clist = get_gene_expression3(target_gene_id, tf_gene_id, all_cond_lst, expr_dict, takelog=logrithmize)
+
+ r, p = stat.pearsonr(target_elst, tf_elst)
+
+ d = {}
+ d['target'] = target_gene_id
+ d['TF'] = tf_gene_id
+ d['type'] = 'two-way'
+ d['significant'] = {}
+
+ all_good = False
+ if correlation_is_significant(r, p, glb_param_dict):
+ d['significant']['all'] = {}
+ d['significant']['all']['signal_set'] = clist # a list of sample IDs, returned by get_gene_expression3
+ d['significant']['all']['score'] = r
+ d['significant']['all']['chip_id'] = chip_id
+ all_good = True
+
+ user_cond_lst = glb_param_dict['USER_CONDITION_LIST']
+ if glb_param_dict['RESEARCH_KEYWORDS'] != '' and user_cond_lst != []:
+ target_elst_user, tf_elst_user, clist_user = get_gene_expression3(target_gene_id, tf_gene_id, user_cond_lst, expr_dict, takelog=logrithmize)
+
+ r, p = stat.pearsonr(target_elst_user, tf_elst_user)
+ if correlation_is_significant(r, p, glb_param_dict):
+ d['significant']['user'] = {}
+ d['significant']['user']['signal_set'] = user_cond_lst
+ d['significant']['user']['score'] = r
+ d['significant']['user']['chip_id'] = chip_id
+
+ # obsolete
+ max_diff = glb_param_dict['SELECT_POINTS_DIAGONAL_MAX_DIFF']
+ if glb_param_dict['LOOK_FOR_POS_CORRELATION'] == 'YES':
+ aa, bb, index_pos = select_points_diagonal(target_elst, tf_elst, max_diff, 'pos')
+ r_pos, p_pos = stat.pearsonr(aa, bb)
+ if correlation_is_significant(r_pos, p_pos, glb_param_dict) and sum(index_pos) >= MIN_NUM_CONDITION:
+ d['significant']['pos'] = {}
+ d['significant']['pos']['signal_set'] = subset_list(all_cond_lst, index_pos)
+ d['significant']['pos']['score'] = r_pos
+ d['significant']['pos']['chip_id'] = chip_id
+
+ # obsolete
+ if glb_param_dict['LOOK_FOR_NEG_CORRELATION'] == 'YES':
+ aa, bb, index_neg = select_points_diagonal(target_elst, tf_elst, max_diff, 'neg')
+ r_neg, p_neg = stat.pearsonr(aa, bb)
+ if correlation_is_significant(r_neg, p_neg, glb_param_dict) and sum(index_neg) >= MIN_NUM_CONDITION:
+ d['significant']['neg'] = {}
+ d['significant']['neg']['signal_set'] = subset_list(all_cond_lst, index_neg)
+ d['significant']['neg']['score'] = r_neg
+ d['significant']['neg']['chip_id'] = chip_id
+
+ K = int(glb_param_dict['NUMBER_OF_COMPONENTS'])
+ if glb_param_dict['MIXTURE_OF_REGRESSION'] == 'YES' and not all_good: # only look harder when correlation over all RNA-seq samples is not significant
+ if K == 2: # for now consider two components
+ #print('DEBUG len1=%d, len=%d' % (len(target_elst), len(tf_elst)))
+ #print('DEBUG %s, %s, %s' % (target_gene_id, tf_gene_id, ' '.join(clist)))
+ index1, index2, msg = get_two_components(target_elst, tf_elst) # get two Gaussian Mixture Model components
+ if msg != 'IGNORE':
+ aa = target_elst[index1]
+ bb = tf_elst[index1]
+ r_mix1, p_mix1 = stat.pearsonr(aa, bb)
+ aa = target_elst[index2]
+ bb = tf_elst[index2]
+ r_mix2, p_mix2 = stat.pearsonr(aa, bb)
+ #print('DEBUG %s %s r_mix1:%g r_mix2:%g' % (target_gene_id, tf_gene_id, r_mix1, r_mix2))
+ flag1 = correlation_is_significant(r_mix1, p_mix1, glb_param_dict)
+ flag2 = correlation_is_significant(r_mix2, p_mix2, glb_param_dict)
+ if flag1 or flag2:
+ d['significant']['mix'] = {}
+ d['significant']['mix']['signal_set'] = []
+ d['significant']['mix']['score'] = []
+ d['significant']['mix']['chip_id'] = chip_id
+ if flag1:
+ d['significant']['mix']['signal_set'].append(subset_list(clist, index1))
+ d['significant']['mix']['score'].append(r_mix1)
+ if flag2:
+ d['significant']['mix']['signal_set'].append(subset_list(clist, index2))
+ d['significant']['mix']['score'].append(r_mix2)
+
+ if K == 3: # three components
+ aa1, bb1, aa2, bb2, aa3, bb3, cond1, cond2, cond3, msg = get_three_components_mixtools(target_elst, tf_elst, clist) # get three mixture-of-regressions components
+ if msg != 'IGNORE':
+ r_mix1, p_mix1 = stat.pearsonr(aa1, bb1)
+ r_mix2, p_mix2 = stat.pearsonr(aa2, bb2)
+ r_mix3, p_mix3 = stat.pearsonr(aa3, bb3)
+ #print('DEBUG %s, %s' % (target_gene_id, tf_gene_id))
+ #print('DEBUG rmix1=%g, pmix1=%g' % (r_mix1, p_mix1))
+ #print('DEBUG rmix2=%g, pmix2=%g' % (r_mix2, p_mix2))
+ #print('DEBUG rmix3=%g, pmix3=%g' % (r_mix3, p_mix3))
+ #print('DEBUG %d %d %d' %(len(aa1), len(aa2), len(aa3)))
+ min_num_points = int(glb_param_dict['CORRELATION_BASED_ON_AT_LEAST_N_POINTS'])
+ flag1 = correlation_is_significant(r_mix1, p_mix1, glb_param_dict) and len(aa1) > min_num_points
+ flag2 = correlation_is_significant(r_mix2, p_mix2, glb_param_dict) and len(aa2) > min_num_points
+ flag3 = correlation_is_significant(r_mix3, p_mix3, glb_param_dict) and len(aa3) > min_num_points
+ if flag1 or flag2 or flag3:
+ d['significant']['mix'] = {}
+ d['significant']['mix']['signal_set'] = []
+ d['significant']['mix']['score'] = []
+ d['significant']['mix']['chip_id'] = chip_id
+ d['significant']['mix']['message'] = []
+ if flag1:
+ d['significant']['mix']['signal_set'].append(cond1)
+ d['significant']['mix']['score'].append(r_mix1)
+ d['significant']['mix']['message'].append(msg)
+ if flag2:
+ d['significant']['mix']['signal_set'].append(cond2)
+ d['significant']['mix']['score'].append(r_mix2)
+ d['significant']['mix']['message'].append(msg)
+ if flag3:
+ d['significant']['mix']['signal_set'].append(cond3)
+ d['significant']['mix']['score'].append(r_mix3)
+ d['significant']['mix']['message'].append(msg)
+
+ if len(d['significant']) > 0: # significant edges exist
+ print_dict(d, glb_param_dict)
+ #result_dict_lst.append(d)
+
+ return result_dict_lst
+
+
+def three_way(target, tf_lst, expr_dict, expr_info_dict, glb_param_dict):
+ ''' TBD '''
+ return []
+
+
+def establish_edges(expr_dict, expr_info_dict, bind_dict, bind_info_dict, input_dict, glb_param_dict):
+ high_gene_lst = glb_param_dict['HIGH_PRIORITY_GENE'].split()
+ gene_lst = get_gene_list(glb_param_dict['GENE_LIST'])
+ final_gene_lst = list(set(high_gene_lst)) # unique genes
+ for x in gene_lst:
+ if not x in high_gene_lst:
+ final_gene_lst.append(x)
+
+ update_global_param_dict(glb_param_dict, expr_info_dict)
+ result_d = {'two_way_edges':{}, 'three_way_edges':{}}
+ for g in final_gene_lst:
+ tf_dict = get_tf(g, bind_dict, bind_info_dict, input_dict, glb_param_dict)
+ if len(tf_dict) > 0:
+ key = g
+ if glb_param_dict['TWO_WAY'] == 'YES':
+ two_dict = two_way(g, tf_dict, expr_dict, expr_info_dict, glb_param_dict)
+ result_d['two_way_edges'][key] = two_dict
+ if glb_param_dict['THREE_WAY'] == 'YES':
+ three_dict = three_way(g, tf_dict, expr_dict, expr_info_dict, glb_param_dict)
+ result_d['three_way_edges'][key] = three_dict
+
+ return result_d
+
+
+def dumpclean(obj):
+ '''
+ show dictionary content, recursively
+ '''
+ if type(obj) == dict:
+ for k, v in obj.items():
+ if hasattr(v, '__iter__'):
+ print(k)
+ dumpclean(v)
+ else:
+ print('%s : %s' % (k, v))
+ elif type(obj) == list:
+ for v in obj:
+ if hasattr(v, '__iter__'):
+ dumpclean(v)
+ else:
+ print(v)
+ else:
+ print(obj)
+
+
+# obsolete
+def print_dict_list(dict_lst, agi2name_dict):
+ for d in dict_lst:
+ #dumpclean(d)
+ if d['type'] == 'two-way':
+ gene_id = d['target']
+ head = '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict))
+ gene_id = d['TF']
+ head += '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict))
+ d2 = d['significant']
+ if 'all' in d2:
+ s = '%4.2f\t%s\t%s\t%s' % (d2['all']['score'], 'all', '.', ' '.join(d2['all']['chip_id']))
+ print(head + s)
+ if 'user' in d2:
+ s = '%4.2f\t%s\t%s\t%s' % (d2['user']['score'], 'user', ' '.join(d2['user']['signal_set']), ' '.join(d2['user']['chip_id']))
+ print(head + s)
+ if 'pos' in d2:
+ s = '%4.2f\t%s\t%s\t%s' % (d2['pos']['score'], 'pos', ' '.join(d2['pos']['signal_set']), ' '.join(d2['pos']['chip_id']))
+ print(head + s)
+ if 'neg' in d2:
+ s = '%4.2f\t%s\t%s\t%s' % (d2['neg']['score'], 'neg', ' '.join(d2['neg']['signal_set']), ' '.join(d2['neg']['chip_id']))
+ print(head + s)
+ if 'mix' in d2:
+ n = len(d2['mix']['score'])
+ for i in range(n):
+ s = '%4.2f\t%s\t%s\t%s' % (d2['mix']['score'][i], 'mix', ' '.join(d2['mix']['signal_set'][i]), ' '.join(d2['mix']['chip_id']))
+ print(head + s)
+
+def print_result(d, agi2name_dict):
+ for k in d:
+ print(k) # two-way or three-way
+ d2 = d[k]
+ for k2 in d2: # k2 is a gene
+ dlst = d2[k2]
+ print_dict_list(dlst, agi2name_dict)
+
+
+
+########## main ##################################################
+r.r['options'](warn=-1) # suppress warning messages from rpy2
+warnings.filterwarnings("ignore")
+param_file = sys.argv[1] # a single parameter file
+glb_param_dict = make_global_param_dict(param_file)
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)
+glb_param_dict['name_conversion_dict'] = agi2name_dict
+#print('Read expression data')
+expr_dict = read_matrix_data(glb_param_dict['EXPRESSION_MATRIX'])
+#print('DEBUG at2g07754 at R0000SRR1802166XX %g' % (expr_dict['xy']['AT2G07754']['R0000SRR1802166XX']))
+expr_info_dict = read_info_data(glb_param_dict['EXPRESSION_INFO'])
+#print('Read binding data')
+bind_dict = read_matrix_data(glb_param_dict['BINDING_MATRIX'])
+bind_info_dict = read_info_data(glb_param_dict['BINDING_INFO'])
+input_dict = read_matrix_data(glb_param_dict['INPUT_MATRIX']) # newly added, for comparing with bw files
+#print('Establish edges')
+edge_d = establish_edges(expr_dict, expr_info_dict, bind_dict, bind_info_dict, input_dict, glb_param_dict)
+#print_result(edge_d, agi2name_dict)
diff --git a/Code/create_edges.r b/Code/create_edges.r
new file mode 100644
index 0000000..1e80447
--- /dev/null
+++ b/Code/create_edges.r
@@ -0,0 +1,99 @@
+TARGET_TF_FILE <- "../Data/information/target_tf.txt"
+DATA_FILE <- "../Data/history/expr/TPM.txt" # A TPM table
+AGINAME_FILE <- "../Data/information/AGI-to-gene-names_v2.txt"
+CORR_THRESHOLD <- 0.6
+MIN_SIZE <- 100
+
+####### Read data #########################################
+X <- read.table(DATA_FILE, header=TRUE, check.names=FALSE)
+gene_id <- X$gene_id
+X$gene_id <- NULL
+row.names(X) <- gene_id
+X <- as.matrix(X)
+rna.sample.id <- colnames(X)
+
+target_tf <- read.table(TARGET_TF_FILE, sep='\t', header=FALSE)
+target_tf <- as.matrix(target_tf)
+targets <- target_tf[,1]
+tfs <- target_tf[,2]
+conditions <- target_tf[,3]
+
+agi <- read.table(AGINAME_FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+#######################################################
+
+library(mixtools)
+result.file <- '../Data/history/edges/many_targets/edges.txt.20170306_1015'
+for (i in 1:length(targets)) {
+ out <- file(result.file, 'a')
+ curr.date <- gsub('-','',Sys.Date())
+ id1 <- tfs[i]
+ id2 <- targets[i]
+ if (id1 %in% gene_id == F || id2 %in% gene_id == F) {
+ next
+ }
+ name1 <- agi[id1,1]
+ name2 <- agi[id2,1]
+ cond <- conditions[i]
+ x <- X[id1,]
+ y <- X[id2,]
+ x <- log(x+1)
+ y <- log(y+1)
+ index <- x < 0.01 | y < 0.01
+ x <- x[!index]
+ y <- y[!index]
+ r <- cor(x, y)
+ if (abs(r) >= CORR_THRESHOLD) {
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s', id2, name2,id1,name1, r, 'all', '.', cond, '.', curr.date)
+ #cat(s, file=result.file, sep='\n', append=T)
+ writeLines(s, con=out)
+ next
+ }
+
+ k <- 3
+ N <- length(x)
+ em.out <- regmixEM(y,x,maxit=100, epsilon=1e-04, k=k)
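+ # em.out$posterior is an N x k matrix of component responsibilities;
+ # max.col() below assigns each sample to its most likely component, and
+ # em.out$x holds the design matrix (intercept in column 1, predictor in column 2)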
+
+ pos_r_max <- -2
+ pos_r_N <- 0
+ pos_r_index <- c()
+ pos_r_loglik <- -100000000
+
+ neg_r_max <- 2
+ neg_r_N <- 0
+ neg_r_index <- c()
+ neg_r_loglik <- -100000000
+
+ for (j in seq(1,k,1)) {
+
+ index <- which(max.col(em.out$posterior) == j)
+ size <- length(index)
+ r <- cor(em.out$x[index,2], em.out$y[index])
+
+ if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
+ pos_r_max <- r
+ pos_r_N <- size
+ pos_r_index <- index
+ pos_r_loglik <- em.out$loglik
+ }
+ if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
+ neg_r_max <- r
+ neg_r_N <- size
+ neg_r_index <- index
+ neg_r_loglik <- em.out$loglik
+ }
+ }
+
+ if (pos_r_max > 0) {
+ sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\tloglik=%4.2f\t%s', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
+ #cat(s, file=result.file, sep='\n', append=T)
+ writeLines(s, con=out)
+ }
+ if (neg_r_max < 0) {
+ sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\tloglik=%4.2f\t%s', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
+ #cat(s, file=result.file, sep='\n', append=T)
+ writeLines(s, con=out)
+ }
+ close(out)
+}
diff --git a/Code/create_edges0.py b/Code/create_edges0.py
new file mode 100644
index 0000000..75cb1e1
--- /dev/null
+++ b/Code/create_edges0.py
@@ -0,0 +1,215 @@
+# Usage: python create_edges0.py parameter_for_net.txt
+#
+# Make it faster by spawning subprocesses.
+#
+# Quickly create edges using all samples in TPM.txt. TF and targets
+# are from target_tf.txt. Results will be written to
+# ../Data/history/edge_pool/edges.txt.simple.correlation.all.conditions.date
+# target_tf.txt is produced by make_target_tf.py.
+#
+#
+# 26 JAN 2017, hui, slcu
+# Last modified 5 APR 2017, hui, slcu
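+#
+# Each output line is tab-separated (see establish_edges below):
+# target_id name, tf_id name, r, 'all', number of RNA-seq samples,
+# chip ids, '.', date, r, method_or_tissue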
+
+import sys, os, operator, itertools
+from datetime import datetime
+from geneid2name import make_gene_name_AGI_map_dict, get_gene_name
+from param4net import make_global_param_dict
+
+TARGET_FILE = '../Data/temp/all_targets.txt'
+TF_FILE = '../Data/temp/all_tfs.txt'
+RESULT_FILE = '../Data/temp/corr_all.txt'
+R_SCRIPT_FILE = '../Data/temp/compute_simple_correlation.r'
+
+HISTORY_DIR = '../Data/history/edge_pool' # edges.txt.* files are here
+
+
+def get_value(s, delimit):
+ lst = s.split(delimit)
+ return lst[1].strip()
+
+
+def get_gene_list(fname):
+ result = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ result.append(lst[0])
+ f.close()
+ return result
+
+
+def make_tf_dict(fname):
+ d = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0]
+ tf = lst[1]
+ cond = lst[2].split()
+ if not target in d:
+ d[target] = {tf:cond}
+ else:
+ d[target][tf] = cond
+ f.close()
+ return d
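+
+# target_tf.txt lines are tab-separated: target, TF, then a space-separated
+# list of chip condition IDs, e.g. (IDs illustrative):
+# AT1G65480<TAB>AT1G01010<TAB>C0001 C0002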
+
+
+def get_targets_and_tfs(fname):
+ f = open(fname)
+ target_lst = []
+ tf_lst = []
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0]
+ tf = lst[1]
+ target_lst.append(target)
+ tf_lst.append(tf)
+ f.close()
+ return sorted(list(set(target_lst))), sorted(list(set(tf_lst)))
+
+
+def write_lst_to_file(lst, fname):
+ f = open(fname, 'w')
+ for x in lst:
+ f.write(x + '\n')
+ f.close()
+
+
+def make_r_script(fname, result_file, data_file, target_file, tf_file, r_tau=0.75):
+
+ head = 'OUTPUT_FILE <- \'%s\'\n DATA_FILE <- \'%s\'\n TARGET_FILE <- \'%s\'\n TF_FILE <- \'%s\'\n tau <- %0.2f\n' % (result_file, data_file, target_file, tf_file, r_tau)
+
+ body = '''
+ targets <- read.table(TARGET_FILE, header=FALSE)
+ tfs <- read.table(TF_FILE, header=FALSE)
+ X <- read.table(DATA_FILE, header=TRUE, check.names=FALSE)
+ targets <- as.vector(targets$V1)
+ tfs <- as.vector(tfs$V1)
+ all_genes <- rownames(X)
+
+ X <- as.matrix(X)
+ sd.1 <- apply(X, 1, sd) # sd of each row
+ s0 <- apply(X, 1, function(c) sum(c==0)) # number of zeros in each row
+ sd.tau <- (quantile(sd.1,na.rm=TRUE)[1] + quantile(sd.1,na.rm=TRUE)[2]) / 2.0 # SD threshold: midpoint of the minimum and the 25th percentile
+ good <- sd.1 > max(sd.tau, 0.05)
+ tf_good <- which( good & (all_genes %in% tfs) == T )
+ target_good <- which( good & (all_genes %in% targets) == T )
+
+ # compute correlation coefficient
+ X <- log(X + 1)
+ X[X<0.01] <- NA
+ if (length(tf_good) < 2) {
+ c <- cor(t(X[target_good,]), t(X[c(tf_good, tf_good),]), use='pairwise.complete.obs')
+ } else {
+ c <- cor(t(X[target_good,]), t(X[tf_good,]), use='pairwise.complete.obs')
+ }
+ index <- !is.na(c) & abs(c) >= tau & abs(c) <= 0.99
+ row_names <- rownames(c)
+ col_names <- colnames(c)
+ result <- data.frame(row = row_names[row(c)[index]], col = col_names[col(c)[index]], r = c[index])
+
+ # write results
+ write.table(result, OUTPUT_FILE, col.names=F, row.names=F, sep='\\t', quote=F)
+ '''
+
+ f = open(fname, 'w')
+ content = head + body
+ lst = [x.strip() for x in content.split('\n')]
+ f.write('\n'.join(lst))
+ f.close()
+
+
+def edit_headline(fname):
+ ''' Remove gene_id from first line. For easier R matrix reading. '''
+ new_fname = fname + '.copy'
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ f = open(new_fname, 'w')
+ head = lines[0].strip()
+ head_lst = head.split('\t')[1:]
+ num_rnaseq = len(head.split('\t')) - 1
+ f.write('\t'.join(head_lst) + '\n')
+ for line in lines[1:]:
+ f.write(line)
+ f.close()
+ return new_fname, num_rnaseq
+
+
+def establish_edges(corr_fname, target_tf_fname, result_fname, agi2name_dict, num_rnaseq, glb_param_dict):
+ big_tf_dict = make_tf_dict(target_tf_fname)
+ f = open(corr_fname)
+ lines = f.readlines()
+ f.close()
+
+ result = ''
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0]
+ tf = lst[1]
+ score = '%4.2f' % (float(lst[2]))
+ if target in big_tf_dict and tf in big_tf_dict[target]:
+ target_str = target + ' ' + get_gene_name(target, agi2name_dict)
+ tf_str = tf + ' ' + get_gene_name(tf, agi2name_dict)
+ score_str = score
+ cond_str = ' '.join(big_tf_dict[target][tf])
+ curr_date = datetime.now().strftime('%Y%m%d')
+ method_or_tissue = 'all' if glb_param_dict['EXPRESSION_MATRIX_DESCRIPTION'].strip() == '' else glb_param_dict['EXPRESSION_MATRIX_DESCRIPTION']
+ s = '\t'.join([target_str, tf_str, score_str, 'all', str(num_rnaseq), cond_str, '.', curr_date, score_str, method_or_tissue])
+ result += s + '\n'
+
+ f = open(result_fname, 'w')
+ f.write(result)
+ f.close()
+
+
+def target_tf_file_compare_same(fname1, fname2):
+ if not os.path.exists(fname1):
+ return False
+ if not os.path.exists(fname2):
+ return False
+ f1 = open(fname1)
+ s1 = f1.read()
+ f1.close()
+ f2 = open(fname2)
+ s2 = f2.read()
+ f2.close()
+ return s1 == s2
+
+
+## main
+param_file = sys.argv[1] # a single parameter file
+glb_param_dict = make_global_param_dict(param_file)
+agi2name_dict = make_gene_name_AGI_map_dict(glb_param_dict['GENE_ID_AND_GENE_NAME'])
+
+target_tf_fname = '../Data/information/target_tf.txt'
+if not os.path.exists(target_tf_fname):
+ print('create_edges0: file %s does not exist. Produce it with make_target_tf.py.' % (target_tf_fname))
+ sys.exit()
+
+all_targets, all_tfs = get_targets_and_tfs(target_tf_fname)
+write_lst_to_file(all_targets, TARGET_FILE)
+write_lst_to_file(all_tfs, TF_FILE)
+
+data_file, num_rnaseq = edit_headline(glb_param_dict['EXPRESSION_MATRIX'])
+
+make_r_script(R_SCRIPT_FILE, RESULT_FILE, data_file, TARGET_FILE, TF_FILE, 0.60)
+
+cmd = 'Rscript %s' % (R_SCRIPT_FILE)
+os.system(cmd)
+
+if not os.path.isdir(HISTORY_DIR):
+ os.makedirs(HISTORY_DIR)
+
+curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+result_fname = os.path.join(HISTORY_DIR, 'edges.txt.simple.correlation.all.conditions.' + curr_time)
+establish_edges(RESULT_FILE, target_tf_fname, result_fname, agi2name_dict, num_rnaseq, glb_param_dict) # change
+
+cmd = 'rm -f %s %s %s %s %s' % (data_file, TARGET_FILE, TF_FILE, R_SCRIPT_FILE, RESULT_FILE)
+os.system(cmd)
+print('Done. Check %s.' % (result_fname))
diff --git a/Code/create_edges0B.py b/Code/create_edges0B.py
new file mode 100644
index 0000000..b09b312
--- /dev/null
+++ b/Code/create_edges0B.py
@@ -0,0 +1,217 @@
+# Usage: python create_edges0B.py parameter_for_net.txt
+#
+# Purpose: tissue-specific edges
+#
+# Quickly create edges using all samples in TPM.txt (with the same
+# tissue). TFs and targets are from target_tf.txt. Results will be
+# written to
+# ../Data/history/edges/many_targets/edges.txt.simple.correlation.tissue.date
+# target_tf.txt is produced by make_target_tf.py.
+#
+#
+# 26 JAN 2017, hui, slcu
+# Last modified 13 June 2017, hui, slcu
+# Last modified 8 Aug 2019, hui, zjnu
+
+import sys, os, operator, itertools, glob
+from datetime import datetime
+from configure import UPDATE_NETWORK_LOG_FILE
+from geneid2name import make_gene_name_AGI_map_dict, get_gene_name
+from param4net import make_global_param_dict
+
+TARGET_FILE = '../Data/temp/all_targets.txt'
+TF_FILE = '../Data/temp/all_tfs.txt'
+R_SCRIPT_FILE = 'correlation_per_tissue.R'
+TISSUE_INFO_FILE = '../Data/information/experiment.and.tissue.txt' # make sure this file is the same as TISSUE.FILE in R_SCRIPT_FILE
+HISTORY_DIR = '../Data/history/edges/many_targets' # edges.txt.* files are here
+
+
+def get_value(s, delimit):
+ lst = s.split(delimit)
+ return lst[1].strip()
+
+
+def get_gene_list(fname):
+ result = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ result.append(lst[0])
+ f.close()
+ return result
+
+
+def make_tf_dict(fname):
+ d = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0]
+ tf = lst[1]
+ cond = lst[2].split()
+ if not target in d:
+ d[target] = {tf:cond}
+ else:
+ d[target][tf] = cond
+ f.close()
+ return d
+
+
+def get_targets_and_tfs(fname):
+ f = open(fname)
+ target_lst = []
+ tf_lst = []
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0]
+ tf = lst[1]
+ target_lst.append(target)
+ tf_lst.append(tf)
+ f.close()
+ return sorted(list(set(target_lst))), sorted(list(set(tf_lst)))
+
+
+def write_lst_to_file(lst, fname):
+ f = open(fname, 'w')
+ for x in lst:
+ f.write(x + '\n')
+ f.close()
+
+
+def establish_edges(corr_fname, target_tf_fname, result_fname, agi2name_dict, tissue_dict, loglikhood_dict):
+ big_tf_dict = make_tf_dict(target_tf_fname)
+ f = open(corr_fname)
+ lines = f.readlines()
+ f.close()
+
+ result = ''
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0]
+ tf = lst[1]
+ score = '%4.2f' % (float(lst[2]))
+ tissue = lst[3]
+ num_rnaseq_id = lst[4]
+ loglik = '-9999.0' # default when the tissue has no log-likelihood entry
+ if tissue in loglikhood_dict:
+ loglik = loglikhood_dict[tissue]
+ if target in big_tf_dict and tf in big_tf_dict[target]:
+ target_str = target + ' ' + get_gene_name(target, agi2name_dict)
+ tf_str = tf + ' ' + get_gene_name(tf, agi2name_dict)
+ score_str = score
+ cond_str = ' '.join(big_tf_dict[target][tf])
+ curr_date = datetime.now().strftime('%Y%m%d')
+ rnaseq_subset = '.'
+ if tissue in tissue_dict:
+ rnaseq_subset = ' '.join(list(set(tissue_dict[tissue])))
+ s = '\t'.join([target_str, tf_str, score_str, 'all', num_rnaseq_id, cond_str, loglik, curr_date, score_str, tissue])
+ result += s + '\n'
+
+ f = open(result_fname, 'w')
+ f.write(result)
+ f.close()
+
+
+def get_tissue_from_filename(s, d):
+ for k in d:
+ if k in s:
+ return k, d[k]
+ return 'unknown', '-9999.0'
+
+
+def make_tissue_dict(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ for line in lines[1:]:
+ line = line.strip()
+ if line != '':
+ lst = line.split('\t')
+ k = lst[0] # run.id
+ v = lst[4]
+ d[k] = v
+ k2 = v.split('.')[0] # broad tissue category, ignore subcategories, for example, flower.anther, only keep flower.
+ if not k2 in d:
+ d[k2] = [k]
+ else:
+ d[k2].append(k)
+
+ return d
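+
+# Each data line of TISSUE_INFO_FILE is tab-separated; lst[0] is the run ID
+# and lst[4] the suggested tissue (e.g. 'flower.anther', kept here as 'flower').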
+
+
+def target_tf_file_compare_same(fname1, fname2):
+ if not os.path.exists(fname1):
+ return False
+ if not os.path.exists(fname2):
+ return False
+ f1 = open(fname1)
+ s1 = f1.read()
+ f1.close()
+ f2 = open(fname2)
+ s2 = f2.read()
+ f2.close()
+ return s1 == s2
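+
+
+def write_log_file(s, fname):
+ # Minimal stand-in logger: the helper called in main below is not imported
+ # in this file, so (as an assumption about its interface) append a
+ # timestamped line to the log file.
+ f = open(fname, 'a')
+ f.write('%s %s\n' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), s))
+ f.close()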
+
+
+########## main ##################################################
+param_file = sys.argv[1] # a single parameter file
+glb_param_dict = make_global_param_dict(param_file)
+agi2name_dict = make_gene_name_AGI_map_dict(glb_param_dict['GENE_ID_AND_GENE_NAME'])
+
+target_tf_fname = '../Data/information/target_tf.txt'
+if not os.path.exists(target_tf_fname):
+ write_log_file('[create_edges0B.py] Critical file %s does not exist.' % (target_tf_fname), UPDATE_NETWORK_LOG_FILE)
+ sys.exit()
+
+all_targets, all_tfs = get_targets_and_tfs(target_tf_fname)
+write_lst_to_file(all_targets, TARGET_FILE)
+write_lst_to_file(all_tfs, TF_FILE)
+
+if os.path.exists(R_SCRIPT_FILE):
+ cmd = 'Rscript %s' % (R_SCRIPT_FILE)
+ os.system(cmd)
+else:
+ print('create_edges0B: missing %s, exit.' % (R_SCRIPT_FILE))
+ sys.exit()
+
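+# Placeholder log-likelihood strings keyed by tissue; the keys are matched
+# against result file names and the values end up in the loglik column of the
+# output (see establish_edges above).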
+loglikhood_dict = {
+ 'seedling':'-999.0',
+ 'meristem':'-998.0',
+ 'root':'-997.0',
+ 'leaf':'-996.0',
+ 'flower':'-995.0',
+ 'shoot':'-994.0',
+ 'seed':'-993.0',
+ 'stem':'-992.0',
+ 'aerial':'-990.0'
+}
+
+if not os.path.isdir(HISTORY_DIR):
+ os.makedirs(HISTORY_DIR)
+
+
+file_lst = glob.glob('../Data/temp/edges.txt.simple.correlation.tissue.*.txt')
+curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+
+if os.path.exists(TISSUE_INFO_FILE):
+ tissue_dict = make_tissue_dict(TISSUE_INFO_FILE) # assign each rnaseq a tissue, and each tissue a list of rnaseq
+else:
+ print('create_edges0B: missing %s, exit.' % (TISSUE_INFO_FILE))
+ sys.exit()
+
+for fname in file_lst:
+ tissue, loglik_placeholder = get_tissue_from_filename(fname, loglikhood_dict)
+ if tissue != 'unknown':
+ print(fname)
+ result_fname = os.path.join(HISTORY_DIR, 'edges.txt.simple.correlation.%s.%s' % (tissue, curr_time))
+ RESULT_FILE = fname
+ establish_edges(RESULT_FILE, target_tf_fname, result_fname, agi2name_dict, tissue_dict, loglikhood_dict) # change
+
+cmd = 'rm -f %s %s' % (TARGET_FILE, TF_FILE)
+os.system(cmd)
+cmd = 'rm -f ../Data/temp/edges.txt.simple.correlation.tissue.*.txt'
+os.system(cmd)
+#print('Done.')
diff --git a/Code/create_edges3.py b/Code/create_edges3.py
new file mode 100644
index 0000000..3cd5ba4
--- /dev/null
+++ b/Code/create_edges3.py
@@ -0,0 +1,615 @@
+# Usage: python create_edges3.py parameter_for_net.txt
+#
+# Make it faster by spawning subprocesses.
+#
+# Make it smaller by splitting TPM.txt into small JSON files, and converting binding.txt to target_tf.txt first.
+# So we don't need to load the big matrices, TPM.txt and binding.txt.
+#
+# 19 JAN 2017, hui, slcu
+
+import sys, os, operator, itertools
+import numpy as np
+import scipy.stats as stat
+from datetime import datetime
+
+import rpy2.robjects as r
+from rpy2.robjects.packages import importr
+from rpy2.robjects import FloatVector
+
+import warnings
+
+import multiprocessing
+import time
+
+import json
+from geneid2name import make_gene_name_AGI_map_dict
+from param4net import make_global_param_dict, get_gene_name
+
+EDGE_DIR = '../Data/history/edges/one_target' # a directory storing all edge files, one for each target gene
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
+MAX_NUM_PROCESS = 20
+
+
+
+####################################
+DATA_SYMBOL = '@'
+TOP_N_TF = 50
+MIN_NUM_CONDITION = 20
+SIGNAL_INPUT_RATIO_TAU = 1.5
+REMOVE_HORIZONTAL_STRIP_TAU = 0.01
+REMOVE_VERTICAL_STRIP_TAU = 0.01
+MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN = 30
+####################################
+
+def get_three_components_mixtools(y, x, cond_lst):
+
+ # Remove NaNs or Infs
+ warn_msg = ''
+ sz = len(x)
+ if sz < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: # too few points, ignore.
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+
+ index = np.isfinite(x) & np.isfinite(y)
+ if sum(index) < sz:
+ warn_msg = 'HAS_NAN_OR_INFINITE'
+ if sum(index) < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN:
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+
+ xx = np.array(x[index])
+ yy = np.array(y[index])
+ cond_lst2 = np.array(cond_lst)
+ cond_lst2 = cond_lst2[index]
+
+ # Train the model
+ mixtools = importr('mixtools')
+ try:
+ result = mixtools.regmixEM(FloatVector(yy), FloatVector(xx), epsilon = 1e-04, k=3, maxit=100)
+ except:
+ return None, None, None, None, None, None, None, None, None, 'IGNORE'
+ posterior = result[result.names.index('posterior')]
+ posterior = np.array(posterior)
+ l = np.argmax(posterior, axis=1) # class information
+ idx1 = l == 0
+ idx2 = l == 1
+ idx3 = l == 2
+ warn_msg = 'loglik=%4.2f' % (np.array(result[result.names.index('loglik')])[0])
+
+ return xx[idx1], yy[idx1], xx[idx2], yy[idx2], xx[idx3], yy[idx3], list(cond_lst2[idx1]), list(cond_lst2[idx2]), list(cond_lst2[idx3]), warn_msg
+
+
+def read_experiment_id(fname):
+ ''' Read column names (RNA-seq ids) and row names (gene ids) from TPM.txt '''
+
+ colid = []
+ rowid = []
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+
+ head_line = lines[0].strip()
+ lst = head_line.split()
+ colid = lst[1:]
+ rowid = []
+
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split()
+ g = lst[0]
+ rowid.append(g)
+
+ return (colid, rowid)
+
+
+def read_matrix_data(fname):
+ '''
+ fname - a file, first line is head, first column is row name.
+ '''
+
+ lineno = 0
+ colid = []
+ rowid = []
+ d = {} # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
+ d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
+ d3 = {} # {gene1: [], gene2: [], ...}
+ d4 = {} # {cond1:[], cond2:[], ...}
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+
+ head_line = lines[0].strip()
+ lst = head_line.split()
+ colid = lst[1:]
+
+ for c in colid:
+ d2[c] = {}
+ d4[c] = []
+
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split()
+ g = lst[0]
+ rowid.append(g)
+ d[g] = {}
+ levels = lst[1:]
+ if len(levels) != len(colid):
+ print('Incomplete columns at row %s' % (g))
+ sys.exit()
+
+ d3[g] = []
+ for i in range(len(colid)):
+ c = colid[i]
+ d[g][c] = float(levels[i])
+ d2[c][g] = float(levels[i])
+ d3[g].append(float(levels[i]))
+ d4[c].append(float(levels[i]))
+ lineno += 1
+
+ d_return = {}
+ d_return['xy'] = d # first gene, then condition
+ d_return['yx'] = d2 # first condition, then gene
+ d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row
+ d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column
+ d_return['nrow'] = lineno - 1
+ d_return['ncol'] = len(colid)
+ d_return['rowid'] = rowid
+ d_return['colid'] = colid
+
+ d4_sorted = {}
+ for k in d4:
+ d4_sorted[k] = sorted(d4[k], reverse=True)
+ d_return['yy_sorted'] = d4_sorted
+
+ return d_return
+
+
+def get_value(s, delimit):
+ lst = s.split(delimit)
+ return lst[1].strip()
+
+
+def read_info_data(fname):
+ ''' Read chip-seq data information '''
+
+ if not os.path.exists(fname):
+ print('%s does not exist.' % (fname))
+ sys.exit()
+
+ d = {'ID_LIST':[]}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#') or line.startswith('%'):
+ continue
+ if line.startswith(DATA_SYMBOL):
+ s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ s = s.strip()
+ if s in d:
+ print('ID %s duplicate' % (s))
+ sys.exit()
+ d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ d['ID_LIST'].append(s)
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+ elif line.startswith('PROTEN_NAME:'):
+ d[s]['PROTEN_NAME'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_ID:'):
+ d[s]['PROTEIN_ID'] = get_value(line, ':')
+ elif line.startswith('DATA_NAME:'):
+ d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+
+ return d
+
+
+def get_gene_list(fname):
+ result = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ result.append(lst[0])
+ f.close()
+ return result
+
+
+def get_related_condition(s, info_dict):
+ lst = s.split(';')
+ result = [] # a list of sample IDs
+ result = info_dict['ID_LIST'] # TBD
+ return result
+
+def update_global_param_dict(glb_param_dict, info_dict):
+ if glb_param_dict['RESEARCH_KEYWORDS'] == '':
+ glb_param_dict['USER_CONDITION_LIST'] = info_dict['ID_LIST']
+ glb_param_dict['USER_CONDITION_LIST'] = get_related_condition(glb_param_dict['RESEARCH_KEYWORDS'], info_dict)
+
+
+def float_equal(x, y):
+ return np.abs(x-y) < 0.001
+
+
+def get_gene_expression2(gene_id1, gene_id2, cond_lst, expr_dict, takelog=False):
+ ''' Get gene expression for two genes. Conditions in which both genes have zero TPM values are ignored. '''
+ num_cond = len(cond_lst)
+ elst1 = [None]*num_cond
+ elst2 = [None]*num_cond
+ clst = [None]*num_cond
+ d = expr_dict['xy']
+ j = 0
+ for i in range(num_cond):
+ c = cond_lst[i]
+ x = expr_dict['xy'][gene_id1][c]
+ y = expr_dict['xy'][gene_id2][c]
+ #print('DEBUG %s %s %g %g c=%s' % (gene_id1, gene_id2, x, y, c))
+ #print('DEBUG at2g07745 at R0000SRR1802166XX %g' % (expr_dict['xy']['AT2G07754']['R0000SRR1802166XX']))
+ if not float_equal(x,0.0) or not float_equal(y,0.0): # at least one is not zero
+ if takelog == True: # increase gene expression uniformly by 1 for taking logarithm
+ elst1[j] = np.log(x+1)
+ elst2[j] = np.log(y+1)
+ else:
+ elst1[j] = x
+ elst2[j] = y
+ clst[j] = c
+ j += 1
+ return ( np.array(elst1[0:j]), np.array(elst2[0:j]), clst[0:j] )
+
+
+def get_gene_expression3(gene_id1, gene_id2, cond_lst, expr_dict, takelog=False):
+ '''
+ Get gene expression for two genes. Conditions in which both genes have zero TPM values are ignored.
+ In addition, vertical strip (as seen in their scatter plot) and horizontal strip are removed.
+ '''
+ num_cond = len(cond_lst)
+ elst1 = [None]*num_cond
+ elst2 = [None]*num_cond
+ mark_cond = [True]*num_cond # indicate if a condition should be included
+ clst = []
+ d = expr_dict['xy']
+
+ for i in range(num_cond):
+ c = cond_lst[i]
+ x = expr_dict['xy'][gene_id1][c]
+ y = expr_dict['xy'][gene_id2][c]
+ if not float_equal(x,0.0) or not float_equal(y,0.0): # at least one is not zero
+ if takelog == True: # increase gene expression uniformly by 1 for taking logarithm
+ elst1[i] = np.log(x+1)
+ elst2[i] = np.log(y+1)
+ else:
+ elst1[i] = x
+ elst2[i] = y
+ if elst1[i] < REMOVE_VERTICAL_STRIP_TAU or elst2[i] < REMOVE_HORIZONTAL_STRIP_TAU: # don't include this condition if its values are in the strip
+ mark_cond[i] = False
+ else:
+ clst.append(c)
+ else:
+ mark_cond[i] = False
+
+ a = np.array(elst1)
+ a = a[mark_cond]
+ b = np.array(elst2)
+ b = b[mark_cond]
+ return (a.astype(np.float64), b.astype(np.float64), clst)
+
+
+#################### select condition stuff ###############
+
+def common_elements(list1, list2):
+ return sorted(list(set(list1).intersection(list2)))
+
+def correlation_is_significant(r, p, glb_param_dict):
+ return (not np.isnan(r)) and np.abs(r) > float(glb_param_dict['TWO_WAY_CORRELATION_CUTOFF']) and p < float(glb_param_dict['TWO_WAY_CORRELATION_PVALUE_CUTOFF'])
+
+def subset_list(lst, bool_index):
+ if len(lst) != len(bool_index):
+ print('subset_list: list sizes not equal (%d %d)' % (len(lst), len(bool_index)))
+ sys.exit()
+
+ n = len(lst)
+ result = []
+ for i in range(n):
+ if bool_index[i] == True:
+ result.append(lst[i])
+ return result
+
+
+def write_dict(d_lst, glb_param_dict, fname):
+ ''' write a list of dictionary content to fname. Each dictionary contains a target-TF edge. '''
+ agi2name_dict = glb_param_dict['name_conversion_dict']
+ curr_date = datetime.now().strftime('%Y%m%d_%H%M%S') # add date to the end of each line, for future reference or filtering
+ fname = os.path.join(EDGE_DIR, fname)
+ f = open(fname, 'w')
+ for d in d_lst:
+ if d['type'] == 'two-way':
+ gene_id = d['target']
+ head = '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict))
+ gene_id = d['TF']
+ head += '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict))
+ d2 = d['significant']
+ if 'all' in d2:
+ s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['all']['score'], 'all', '.', ' '.join(d2['all']['chip_id']), '.')
+ s += '\t' + curr_date
+ f.write(head + s + '\n')
+ if 'mix' in d2:
+ n = len(d2['mix']['score'])
+ for i in range(n):
+ s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['mix']['score'][i], 'mix', ' '.join(d2['mix']['signal_set'][i]), ' '.join(d2['mix']['chip_id']), d2['mix']['message'][i])
+ s += '\t' + curr_date
+ f.write(head + s + '\n')
+ f.close()
+
+
+def two_way(target, tf_dict, expr_dict, expr_info_dict, glb_param_dict):
+ '''
+ Check whether the target has a relationship with each of its TFs.
+
+ tf_dict: a dictionary of TFs, {tf_name: chip_id_list}
+
+ Return a list of dictionaries. Each dictionary has the following format:
+
+ 'type' :'two-way'
+ 'target':''
+ 'TF' :''
+ 'significant': {
+ 'all': {'signal_set':[], 'score':.2, 'chip_id':''}
+ 'pos_direction':{'signal_set':[], 'score':.0, 'chip_id':''}
+ 'neg_direction':{'signal_set':[], 'score':.1, 'chip_id':''}
+ 'user_defined': {'signal_set':[], 'score':.3, 'chip_id':''}
+ 'mix': {'signal_set':[], 'score':[.7,-.5], 'chip_id':''}
+ }
+
+ '''
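+ # A hypothetical example of one returned element (IDs and values are
+ # illustrative only, not real results):
+ # {'type': 'two-way', 'target': 'AT2G40300', 'TF': 'AT3G12580',
+ #  'significant': {'all': {'signal_set': ['R0000SRR1802166XX'],
+ #                          'score': 0.81, 'chip_id': ['C00000000000']}}}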
+
+ result_dict_lst = [] # a list of dictionaries, one for each TF, each dict contains info for a Target-TF pair
+
+ target_gene_id = target
+ all_cond_lst = expr_dict['colid'] # Use all RNA-seq samples. TBD, can be glb_param_dict['USER_CONDITION_LIST']
+ logarithmize = glb_param_dict['LOGRITHMIZE'].upper() == 'YES' # whether to take the logarithm of TPM values
+
+ if not target_gene_id in expr_dict['rowid']: # target gene not in expression table, cannot do anything
+ return None
+
+ for tf_gene_id in sorted(tf_dict.keys()): # iterate over TF gene IDs
+
+ chip_id = tf_dict[tf_gene_id] # a list of chip experiment IDs, e.g., C00000000000
+
+ if not tf_gene_id in expr_dict['rowid']: # tf gene not in expression table, cannot do anything
+ continue
+
+ # get gene expression profiles for target and TF. If both target and TF are 0 in an RNA-seq sample, that sample is ignored.
+ target_elst, tf_elst, clist = get_gene_expression3(target_gene_id, tf_gene_id, all_cond_lst, expr_dict, takelog=logarithmize)
+
+ r, p = stat.pearsonr(target_elst, tf_elst)
+
+ d = {}
+ d['target'] = target_gene_id
+ d['TF'] = tf_gene_id
+ d['type'] = 'two-way'
+ d['significant'] = {}
+
+ all_good = False
+ if correlation_is_significant(r, p, glb_param_dict):
+ d['significant']['all'] = {}
+ d['significant']['all']['signal_set'] = clist # a list of sample IDs, returned by get_gene_expression3
+ d['significant']['all']['score'] = r
+ d['significant']['all']['chip_id'] = chip_id
+ all_good = True
+
+ user_cond_lst = glb_param_dict['USER_CONDITION_LIST']
+ if glb_param_dict['RESEARCH_KEYWORDS'] != '' and user_cond_lst != []:
+ target_elst_user, tf_elst_user, clist_user = get_gene_expression3(target_gene_id, tf_gene_id, user_cond_lst, expr_dict, takelog=logarithmize)
+
+ r, p = stat.pearsonr(target_elst_user, tf_elst_user)
+ if correlation_is_significant(r, p, glb_param_dict):
+ d['significant']['user'] = {}
+ d['significant']['user']['signal_set'] = user_cond_lst
+ d['significant']['user']['score'] = r
+ d['significant']['user']['chip_id'] = chip_id
+
+ # obsolete
+ max_diff = glb_param_dict['SELECT_POINTS_DIAGONAL_MAX_DIFF']
+ if glb_param_dict['LOOK_FOR_POS_CORRELATION'] == 'YES':
+ aa, bb, index_pos = select_points_diagonal(target_elst, tf_elst, max_diff, 'pos')
+ r_pos, p_pos = stat.pearsonr(aa, bb)
+ if correlation_is_significant(r_pos, p_pos, glb_param_dict) and sum(index_pos) >= MIN_NUM_CONDITION:
+ d['significant']['pos'] = {}
+ d['significant']['pos']['signal_set'] = subset_list(all_cond_lst, index_pos)
+ d['significant']['pos']['score'] = r_pos
+ d['significant']['pos']['chip_id'] = chip_id
+
+ # obsolete
+ if glb_param_dict['LOOK_FOR_NEG_CORRELATION'] == 'YES':
+ aa, bb, index_neg = select_points_diagonal(target_elst, tf_elst, max_diff, 'neg')
+ r_neg, p_neg = stat.pearsonr(aa, bb)
+ if correlation_is_significant(r_neg, p_neg, glb_param_dict) and sum(index_neg) >= MIN_NUM_CONDITION:
+ d['significant']['neg'] = {}
+ d['significant']['neg']['signal_set'] = subset_list(all_cond_lst, index_neg)
+ d['significant']['neg']['score'] = r_neg
+ d['significant']['neg']['chip_id'] = chip_id
+
+ K = int(glb_param_dict['NUMBER_OF_COMPONENTS'])
+ if glb_param_dict['MIXTURE_OF_REGRESSION'] == 'YES' and not all_good: # search harder only when using all RNA-seq data did not produce a good correlation
+ if K == 2: # for now consider two components
+ #print('DEBUG len1=%d, len=%d' % (len(target_elst), len(tf_elst)))
+ #print('DEBUG %s, %s, %s' % (target_gene_id, tf_gene_id, ' '.join(clist)))
+ index1, index2, msg = get_two_components(target_elst, tf_elst) # get two Gaussian Mixture Model components
+ if msg != 'IGNORE':
+ aa = target_elst[index1]
+ bb = tf_elst[index1]
+ r_mix1, p_mix1 = stat.pearsonr(aa, bb)
+ aa = target_elst[index2]
+ bb = tf_elst[index2]
+ r_mix2, p_mix2 = stat.pearsonr(aa, bb)
+ #print('DEBUG %s %s r_mix1:%g r_mix2:%g' % (target_gene_id, tf_gene_id, r_mix1, r_mix2))
+ flag1 = correlation_is_significant(r_mix1, p_mix1, glb_param_dict)
+ flag2 = correlation_is_significant(r_mix2, p_mix2, glb_param_dict)
+ if flag1 or flag2:
+ d['significant']['mix'] = {}
+ d['significant']['mix']['signal_set'] = []
+ d['significant']['mix']['score'] = []
+ d['significant']['mix']['chip_id'] = chip_id
+ if flag1:
+ d['significant']['mix']['signal_set'].append(subset_list(clist, index1))
+ d['significant']['mix']['score'].append(r_mix1)
+ if flag2:
+ d['significant']['mix']['signal_set'].append(subset_list(clist, index2))
+ d['significant']['mix']['score'].append(r_mix2)
+
+ if K == 3: # three components
+ aa1, bb1, aa2, bb2, aa3, bb3, cond1, cond2, cond3, msg = get_three_components_mixtools(target_elst, tf_elst, clist) # get three Gaussian Mixture Model components
+ if msg != 'IGNORE':
+ r_mix1, p_mix1 = stat.pearsonr(aa1, bb1)
+ r_mix2, p_mix2 = stat.pearsonr(aa2, bb2)
+ r_mix3, p_mix3 = stat.pearsonr(aa3, bb3)
+ #print('DEBUG %s, %s' % (target_gene_id, tf_gene_id))
+ #print('DEBUG rmix1=%g, pmix1=%g' % (r_mix1, p_mix1))
+ #print('DEBUG rmix2=%g, pmix2=%g' % (r_mix2, p_mix2))
+ #print('DEBUG rmix3=%g, pmix3=%g' % (r_mix3, p_mix3))
+ #print('DEBUG %d %d %d' %(len(aa1), len(aa2), len(aa3)))
+ min_num_points = int(glb_param_dict['CORRELATION_BASED_ON_AT_LEAST_N_POINTS'])
+ flag1 = correlation_is_significant(r_mix1, p_mix1, glb_param_dict) and len(aa1) > min_num_points
+ flag2 = correlation_is_significant(r_mix2, p_mix2, glb_param_dict) and len(aa2) > min_num_points
+ flag3 = correlation_is_significant(r_mix3, p_mix3, glb_param_dict) and len(aa3) > min_num_points
+ if flag1 or flag2 or flag3:
+ d['significant']['mix'] = {}
+ d['significant']['mix']['signal_set'] = []
+ d['significant']['mix']['score'] = []
+ d['significant']['mix']['chip_id'] = chip_id
+ d['significant']['mix']['message'] = []
+ if flag1:
+ d['significant']['mix']['signal_set'].append(cond1)
+ d['significant']['mix']['score'].append(r_mix1)
+ d['significant']['mix']['message'].append(msg)
+ if flag2:
+ d['significant']['mix']['signal_set'].append(cond2)
+ d['significant']['mix']['score'].append(r_mix2)
+ d['significant']['mix']['message'].append(msg)
+ if flag3:
+ d['significant']['mix']['signal_set'].append(cond3)
+ d['significant']['mix']['score'].append(r_mix3)
+ d['significant']['mix']['message'].append(msg)
+
+ if len(d['significant']) > 0: # significant edges exist
+ result_dict_lst.append(d)
+
+
+ curr_time = datetime.now().strftime('%Y%m%d')
+ fname = 'edges.txt' + '.' + curr_time + '.' + target_gene_id
+ if result_dict_lst != []:
+ write_dict(result_dict_lst, glb_param_dict, fname)
+
+
+def three_way(target, tf_lst, expr_dict, expr_info_dict, glb_param_dict):
+ ''' TBD '''
+ return []
+
+
+def make_small_expr_dict(lst, colid, rowid, dirname):
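+ # Load per-gene expression JSON files (one <AGI>.json per gene in dirname)
+ # into a small dictionary. Assumption, based on how expr_dict['xy'] is
+ # indexed above: each JSON file maps RNA-seq sample IDs to TPM values,
+ # e.g. {"R0000SRR1802166XX": 3.14, ...} (hypothetical values).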
+ d_return = {}
+ d = {}
+ for g in lst:
+ fname = os.path.join(dirname, g + '.json')
+ if os.path.exists(fname):
+ with open(fname) as f:
+ d[g] = json.load(f)
+
+ d_return['colid'] = colid
+ d_return['rowid'] = rowid
+ d_return['xy'] = d
+
+ return d_return
+
+def make_tf_dict(fname):
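+ # Parse target_tf.txt. A hypothetical tab-separated input line, whose third
+ # field is a space-separated list of ChIP experiment IDs:
+ # AT2G40300 <TAB> AT3G12580 <TAB> C00000000000 C00000000001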
+ d = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0]
+ tf = lst[1]
+ cond = lst[2].split()
+ if not target in d:
+ d[target] = {tf:cond}
+ else:
+ d[target][tf] = cond
+ f.close()
+ return d
+
+
+def establish_edges(jsonTPM_dir, expr_info_dict, rna_experiment_ids, rna_gene_ids, d, bind_info_dict, glb_param_dict):
+ ''' d - binding dictionary {target: {tf1: [c1,c2], tf2: [c2,c3]}, ...} '''
+
+ gene_lst = get_gene_list(glb_param_dict['GENE_LIST'])
+ high_gene_lst = glb_param_dict['HIGH_PRIORITY_GENE'].split() # high priority genes
+ update_global_param_dict(glb_param_dict, expr_info_dict)
+
+ if not os.path.isdir(EDGE_DIR):
+ os.makedirs(EDGE_DIR)
+
+ process_lst = [] # a list of processes
+ final_gene_lst = high_gene_lst
+ for x in gene_lst:
+ if not x in high_gene_lst:
+ final_gene_lst.append(x)
+
+ for g in final_gene_lst: # high priority genes are in the front
+ if g in d: # target g is in binding dictionary d
+ tf_dict = d[g]
+ if len(tf_dict) > 0: # it has TFs, which is usually the case
+ if glb_param_dict['TWO_WAY'] == 'YES':
+ small_gene_lst = list(tf_dict.keys()) # wrap in list() so that append also works under Python 3
+ small_gene_lst.append(g)
+ expr_dict = make_small_expr_dict(small_gene_lst, rna_experiment_ids, rna_gene_ids, jsonTPM_dir)
+ p = multiprocessing.Process(target=two_way, args=(g, tf_dict, expr_dict, expr_info_dict, glb_param_dict))
+ process_lst.append(p)
+ p.start()
+ if len(process_lst) > MAX_NUM_PROCESS:
+ temp_lst = []
+ for p in process_lst:
+ p.join(timeout=60) # wait up to 60 secs for the subprocess to finish; if it has not finished yet, check the next subprocess
+ if p.is_alive():
+ temp_lst.append(p)
+ process_lst = temp_lst
+
+
+########## main ##################################################
+r.r['options'](warn=-1) # suppress warning messages from rpy2
+warnings.filterwarnings("ignore")
+
+param_file = sys.argv[1] # a single parameter file
+glb_param_dict = make_global_param_dict(param_file)
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)
+glb_param_dict['name_conversion_dict'] = agi2name_dict
+
+#print('Read expression data')
+cmd = 'python TPM2JSON.py %s' % (param_file) # make jsonTPM directory
+os.system(cmd)
+curr_time = datetime.now().strftime('%Y%m%d_%H%M')
+JSON_DIR = '../Data/history/expr/jsonTPM_%s' % (curr_time)
+cmd = 'mv ../Data/history/expr/jsonTPM %s' % (JSON_DIR)
+os.system(cmd)
+
+expr_info_dict = read_info_data(glb_param_dict['EXPRESSION_INFO'])
+colid, rowid = read_experiment_id(glb_param_dict['EXPRESSION_MATRIX'])
+
+#print('Read binding data')
+cmd = 'python make_target_tf.py %s' % (param_file) # make target_tf.txt
+os.system(cmd) # change
+curr_time = datetime.now().strftime('%Y%m%d_%H%M')
+target_tf_fname = 'target_tf.txt.' + curr_time
+cmd = 'cp target_tf.txt %s' % (target_tf_fname)
+os.system(cmd)
+big_tf_dict = make_tf_dict(target_tf_fname)
+bind_info_dict = read_info_data(glb_param_dict['BINDING_INFO'])
+
+#print('Establish edges')
+establish_edges(JSON_DIR, expr_info_dict, colid, rowid, big_tf_dict, bind_info_dict, glb_param_dict)
diff --git a/Code/create_edges4.py b/Code/create_edges4.py
new file mode 100644
index 0000000..29c87ff
--- /dev/null
+++ b/Code/create_edges4.py
@@ -0,0 +1,450 @@
+# Usage: python create_edges4.py parameter_for_net.txt
+# Purpose:
+# This script will generate a few WORK20170328_1026_AGI_one_K2/3.R scripts, one for each target gene. Each target is treated separately and concurrently, to speed things up.
+# The results will be saved as edges.txt.AT2G40300.Apr.04.2017.11:45:30.k3, where AT2G40300 is the target gene id, and k3 means K=3 in Mixture of Regressions.
+# The edges.txt files will be merged together later by update_network.py.
+# Make it faster by handling each target separately.
+# Make the memory footprint smaller by splitting TPM.txt into small json files (in JSON_DIR), and converting binding.txt to target_tf.txt (in target_tf_fname) first.
+# This way we only load the necessary gene expression vectors each time, rather than the big matrices TPM.txt and binding.txt.
+#
+# 7 Mar 2017, slcu, hui
+# Last modified 23 Mar 2017, slcu, hui
+# Last modified 23 Mar 2017, slcu, hui. Check edges.txt to determine update.
+
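+# For reference, each line that the generated R scripts write to an edges.txt
+# file is tab-separated (see the sprintf calls in make_r_script below); a
+# hypothetical example:
+# AT2G40300 NAME2<TAB>AT3G12580 NAME1<TAB>0.75<TAB>mix<TAB><RNA sample IDs><TAB><ChIP conditions><TAB><loglik><TAB>20170404
+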
+import sys, os, operator, itertools
+from datetime import datetime
+import time
+import json
+import subprocess
+from geneid2name import make_gene_name_AGI_map_dict
+from param4net import make_global_param_dict
+
+EDGE_FILE = '../Data/history/edges/edges.txt' # recent, merged edges from various sources
+EDGE_DIR = '../Data/history/edges/one_target' # a directory for storing all edge files generated by this script, one for each target gene
+TIME_INTERVAL = 2 # wait this many seconds before launching another Rscript
+MAX_PROCESS = 10 # CHANGE this to a larger number if you have many CPUs
+AVERAGE_LIKELIHOOD_TAU = -0.5 # a value between -0.1 and -998; must be negative. Lowering this value means less effort is spent creating brand-new edges.
+EDGE_AGE = 30 # if an edge's age is less than 30 days, don't need to update it.
+
+####################################
+DATA_SYMBOL = '@'
+####################################
+
+def get_gene_list(fname):
+ result = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ result.append(lst[0])
+ f.close()
+ return result
+
+def get_ordered_gene_list(fname):
+ gene_list = get_gene_list(fname)
+ d = {}
+ if not os.path.exists(EDGE_FILE):
+ return gene_list
+ f = open(EDGE_FILE)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0].split()[0]
+ tf = lst[1].split()[0]
+ if not target in d:
+ d[target] = 1
+ else:
+ d[target] += 1
+
+ result_gene_lst = []
+ for t in sorted(d.items(), key=operator.itemgetter(1)): # targets with fewer edges will be on the top
+ g = t[0]
+ if g in gene_list:
+ result_gene_lst.append(g)
+ return result_gene_lst
+
+
+def make_tf_dict(fname):
+ d = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0]
+ tf = lst[1]
+ cond = lst[2]
+ if not target in d:
+ d[target] = {tf:cond}
+ else:
+ d[target][tf] = cond
+ f.close()
+ return d
+
+def make_target_dict(fname):
+ d = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0]
+ tf = lst[1]
+ cond = lst[2]
+ if not tf in d:
+ d[tf] = {target:cond}
+ else:
+ d[tf][target] = cond
+ f.close()
+ return d
+
+def not_bad_line(s):
+ if s.strip() == '':
+ return False
+ if 'WARNING' in s:
+ return False
+ if 'number' in s:
+ return False
+ if 'Need' in s:
+ return False
+ if 'Error' in s:
+ return False
+ if 'Too' in s:
+ return False
+ if not s.startswith('AT'):
+ return False
+ return True
+
+def make_edge_dict(fname):
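+ # Build {tf: {target: {'date':..., 'loglikelihood':...}}} from an existing
+ # edges.txt. The average log-likelihood is column 7 divided by the number
+ # of sub-conditions in column 5; e.g. (hypothetical) a loglik field of
+ # 'loglik=-450.0' over 30 sub-conditions gives -450.0/30 = -15.0.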
+ d = {}
+ if not os.path.exists(fname):
+ return d
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ if not_bad_line(line):
+ lst = line.split('\t')
+ target_id = lst[0].split()[0]
+ tf_id = lst[1].split()[0]
+ date = '20161201'
+ num_rcond = len(lst[4].split())
+ avg_loglik = -999 # very low likelihood
+ loglik = lst[6]
+ if loglik != '.':
+ if '=' in loglik:
+ avg_loglik = float(loglik.split('=')[1])/num_rcond
+ else:
+ avg_loglik = float(loglik)/num_rcond
+ if len(lst) == 8:
+ date = lst[7]
+ if not tf_id in d:
+ d[tf_id] = {target_id:{'date':date, 'loglikelihood':avg_loglik}}
+ else:
+ d[tf_id][target_id] = {'date':date, 'loglikelihood':avg_loglik}
+ f.close()
+ return d
+
+def make_r_script(fname, target, tf_dict, abs_jsonTPM_dir, num_component, edge_dict):
+ head = 'k.lst <- c(%d)\n' % (num_component)
+ head += 'target <- \'%s\'\n' % (target)
+ head += 'id2 <- target\n'
+ tfs = ''
+ conds = ''
+ recent_edge = ''
+ curr_time = datetime.now().strftime('%Y%m%d')
+ for k in tf_dict.keys(): # k is tf
+ tfs += '\'%s\',' % (k)
+ conds += '\'%s\',' % (tf_dict[k])
+ if k in edge_dict and target in edge_dict[k] and (int(curr_time) - int(edge_dict[k][target]['date']) <= EDGE_AGE or edge_dict[k][target]['loglikelihood'] >= AVERAGE_LIKELIHOOD_TAU): # recent and good
+ recent_edge += '%d,' % (1)
+ else:
+ recent_edge += '%d,' % (0)
+
+ head += 'tfs <- c(' + tfs.rstrip(',') + ')\n'
+ head += 'conditions <- c(' + conds.rstrip(',') + ')\n'
+ head += 'recent.edge <- c(' + recent_edge.rstrip(',') + ')\n'
+ head += 'jsonTPM.dir <- \'%s\'\n' % (abs_jsonTPM_dir)
+ head += 'AGINAME_FILE <- \'%s\'\n' % (os.path.abspath(GENE_ID_TO_GENE_NAME))
+ body = '''
+ post.translation <- function(x, y) {
+ mean.x <- mean(x)
+ sd.x <- sd(x)
+ index <- x > mean.x - sd.x & x < mean.x + sd.x
+ sd.y <- sd(y[index])
+ result <- list(value=ifelse(mean.x < 2.0, 0.0, (mean.x/max(x)) * sd.y * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+ }
+
+ post.translation.2 <- function(x, y) {
+ # x is consititutively high while y varies a lot
+ mean.x <- mean(x)
+ sd.x <- max(sd(x), 1) # a number above 1
+ index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
+ sd.y <- quantile(y[index],0.85)-quantile(y[index],0.15) # dispersion of y within the window
+ sd.y.2 <- quantile(y,0.85)-quantile(y,0.15) # dispersion of all y
+ v.disp <- sd.y/max(1, sd.y.2) # how dispersed y is within the window, a number between 0 and 1
+ # value measure dispersion of y and percent of points within a window
+ result <- list(value=ifelse(mean.x < 2.0, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+ }
+
+ post.translation.3 <- function(x, y) {
+ # x is constitutively high while y varies a lot
+ mean.x <- mean(x)
+ upper.percentile <- 0.85 # used for computing vertical dispersion
+ lowest.n <- 3 # number of points with lowest x values
+ min.mean.x <- max(2.0, quantile(x, 0.25)) # mean of x must be greater than this value
+ sd.x <- min(sd(x), 1) # a number between 0 and 1
+ index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
+ sd.y <- quantile(y[index],upper.percentile)-quantile(y[index],1-upper.percentile) # dispersion of y within the window
+ sd.y.2 <- quantile(y,upper.percentile)-quantile(y,1-upper.percentile) # dispersion of all y
+ v.disp <- sd.y/max(1, sd.y.2) # how dispersed y is within the window, a number between 0 and 1
+
+ rst <- sort(x, index.return=T)
+ top.n <- sum(rst$x < 1)
+ top.n <- max(1, min(top.n, lowest.n))
+ small.y <- min(mean(y[rst$ix[1:top.n]]), mean(y[x<1])) # use the smaller value
+ small.y <- ifelse(is.nan(small.y)==T, 999, small.y)
+ # value measure dispersion of y and percent of points within a window
+ result <- list(valid=small.y, value=ifelse(mean.x < min.mean.x, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+ }
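+
+ # The post.translation.* helpers are heuristics for pairs where the TF (x)
+ # is constitutively highly expressed while the target (y) varies widely, a
+ # pattern that plain correlation over all samples would miss.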
+
+ in.component <- function(posterior, k) {
+ # posterior is an Nxk matrix, each row is a data points, and each col is prob belonging to a component
+ p = posterior[,k]
+ n = length(p)
+ index <- rep(F,n)
+ for (i in 1:n) {
+ if (p[i] > runif(1)) {
+ index[i] = T
+ }
+ }
+ result <- index
+ }
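+
+ # Note: in.component samples membership stochastically (p[i] > runif(1)),
+ # so repeated runs over the same posterior can select different points;
+ # this differs from the hard max.col() assignment used in create_edges4_k2/k3.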
+
+ ####### Read data #########################################
+ CORR_THRESHOLD <- 0.7
+ agi <- read.table(AGINAME_FILE, sep='\\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+ #######################################################
+ library(mixtools)
+ library(rjson)
+ name2 <- agi[id2,1]
+ result <- ''
+ for (i in 1:length(tfs)) {
+ if (recent.edge[i] == 1) {
+ next
+ }
+ curr.date <- gsub('-','',Sys.Date())
+ id1 <- tfs[i]
+ name1 <- agi[id1,1]
+ cond <- conditions[i]
+
+ file.x <- paste(jsonTPM.dir, paste(id1, '.json', sep=''), sep='/')
+ if (!file.exists(file.x)) { next }
+ x <- as.data.frame(fromJSON(file = file.x))
+ x <- log(x+1)
+ rcond.x <- names(x)
+ x <- as.vector(t(x)) # convert it to a vector
+
+ file.y <- paste(jsonTPM.dir, paste(id2, '.json', sep=''), sep='/')
+ if (!file.exists(file.y)) { break }
+ y <- as.data.frame(fromJSON(file = file.y))
+ y <- log(y+1)
+ rcond.y <- names(y)
+ y <- as.vector(t(y)) # convert it to a vector
+
+ rna.sample.id <- rcond.x
+ if (all(rcond.x == rcond.y) == FALSE | id1 == id2) { # if the IDs in two json files do not match, or target is the same as tf, then ignore
+ next
+ }
+
+ MIN_SIZE <- min(100, max(10, ceiling(0.5 * length(x))))
+
+ index <- x < 0.01 | y < 0.01 # don't include data that is too small
+ x <- x[!index]
+ y <- y[!index]
+
+ if (length(x) < MIN_SIZE) {
+ next
+ }
+ r <- cor(x, y)
+ if (abs(r) >= CORR_THRESHOLD) {
+ #s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%s\\t%s\\n', id2, name2, id1, name1, r, 'all', '.', cond, '.', curr.date)
+ #result <- paste(result, s, sep='')
+ next # a good correlation is found using all experiments, so not necessary to look further
+ }
+
+ rna.sample.id <- rna.sample.id[!index] # this step is important to make the following index work
+
+ pos_r_max <- -2
+ pos_r_N <- 0
+ pos_r_index <- c()
+ pos_r_loglik <- -100000000
+
+ neg_r_max <- 2
+ neg_r_N <- 0
+ neg_r_index <- c()
+ neg_r_loglik <- -100000000
+
+ for (k in k.lst) {
+ em.out <- regmixEM(y, x, maxit=150, epsilon=1e-04, k=k)
+ for (j in seq(1,k,1)) {
+ index <- in.component(em.out$posterior, j)
+ size <- sum(index)
+ r <- cor(em.out$x[index,2], em.out$y[index])
+ if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
+ pos_r_max <- r
+ pos_r_N <- size
+ pos_r_index <- index
+ pos_r_loglik <- em.out$loglik
+ }
+ if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
+ neg_r_max <- r
+ neg_r_N <- size
+ neg_r_index <- index
+ neg_r_loglik <- em.out$loglik
+ }
+ }
+ }
+ hit <- 0
+ if (pos_r_max > 0) { # has a good positive correlation
+ sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
+ s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%4.2f\\t%s\\n', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
+ result <- paste(result, s, sep='')
+ hit <- hit + 1
+ }
+ if (neg_r_max < 0) { # has a good negative correlation
+ sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
+ s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%4.2f\\t%s\\n', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
+ result <- paste(result, s, sep='')
+ hit <- hit + 1
+ }
+ if (hit == 0) {
+ t <- post.translation.3(x, y)
+ post.r <- t$percent
+ if (t$valid < quantile(y,0.25) & t$value > 0.69 & post.r >= 0.70 & length(t$index) > MIN_SIZE) {
+ sub.cond <- paste(rna.sample.id[t$index], collapse=' ')
+ s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%s\\t%s\\n', id2, name2, id1, name1, post.r, 'mix', sub.cond, cond, '.', curr.date)
+ result <- paste(result, s, sep='')
+ }
+ }
+ }
+ '''
+ tail = '\n'
+ tail += 'output.file <- paste(\'%s/edges.txt\', id2, format(Sys.time(), \'%%b.%%d.%%Y.%%X\'), \'k%d\', sep=\'.\')\n' % (EDGE_DIR, num_component)
+ tail += 'if (result != \'\') cat(result, file=output.file, sep=\'\')\n'
+ f = open(fname, 'w')
+ content = head + body + tail
+ f.write('\n'.join([line.lstrip('\t').rstrip() for line in content.split('\n')]))
+ f.close()
+ return fname
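+
+# For reference, the head of a generated R script might look like this
+# (hypothetical target, TF and ChIP ID; see make_r_script above):
+# k.lst <- c(2)
+# target <- 'AT2G40300'
+# id2 <- target
+# tfs <- c('AT3G12580')
+# conditions <- c('C00000000000')
+# recent.edge <- c(0)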
+
+def number_of_running_process(lst):
+ ''' get number of running processes (with CPU usage greater than 0) '''
+ count = 0
+ for x in lst:
+ x = x.strip()
+ if x != '':
+ count += 1 if float(x.split()[2]) > 0.0 else 0 # field 2 of 'ps aux' output is %CPU; compare numerically, not as strings
+ return count
+
+def wait_a_moment(n, prefix):
+ ''' if there are more than n work_on...R scripts running, wait... '''
+ time.sleep(TIME_INTERVAL)
+ ps = subprocess.Popen('ps aux | grep %s' % (prefix), shell=True, stdout=subprocess.PIPE) # CHANGE
+ num_proc = number_of_running_process(ps.communicate()[0].split('\n'))
+ while (num_proc > n):
+ #print('number of running processes %d' % (len(process_lst)))
+ time.sleep(TIME_INTERVAL)
+ ps = subprocess.Popen('ps aux | grep %s' % (prefix), shell=True, stdout=subprocess.PIPE)
+ process_lst = ps.communicate()[0].split('\n')
+ num_proc = number_of_running_process(process_lst)
+
+def establish_edges(jsonTPM_dir, d, d2, glb_param_dict, rprefix, edge_dict):
+ '''
+ jsonTPM_dir -- contain gene expression json files, one for each gene
+ d - binding dictionary {target:{tf1:c1, tf2:c2}, ... }, c1 c2 are strings of conditions
+ d2 - binding dictionary {tf:{target1:c1, target2:c2}, ...}, c1 c2 are strings of conditions
+ '''
+
+ gene_lst = get_ordered_gene_list(glb_param_dict['GENE_LIST']) # targets with fewer edges will get higher priority. For example, those targets never having an edge will be treated first
+ high_gene_lst = glb_param_dict['HIGH_PRIORITY_GENE'].split() # high priority genes CHANGE
+
+ if not os.path.isdir(EDGE_DIR):
+ os.makedirs(EDGE_DIR)
+
+ # make a list of targets, putting high-priority targets at the beginning
+ final_gene_lst = high_gene_lst
+ for x in gene_lst:
+ if not x in high_gene_lst:
+ final_gene_lst.append(x)
+
+ # process each target. First consider all TFs of the target (if any), then consider the target's targets (if any).
+ for target in final_gene_lst: # high priority genes are processed first
+ if target in d: # target g is in binding dictionary d
+ tf_dict = d[target] # a dictionary of upstream genes, in the form of {tf1:c1, tf2:c2}
+ if target in d2:
+ target_dict = d2[target] # a dictionary of downstream genes, in the form of {target1:c1, target2:c2}
+ else:
+ target_dict = {}
+
+ if len(tf_dict) > 0: # the target has TFs (upstream genes)
+ r_file = '../Data/temp/%s_%s_K%d.R' % (rprefix, target, 2) # k is 2
+ fname = make_r_script(r_file, target, tf_dict, jsonTPM_dir, 2, edge_dict)
+ cmd = 'Rscript %s &' % (r_file) # run the Rscript in background
+ os.system(cmd) # UNCOMMENT ME
+ r_file = '../Data/temp/%s_%s_K%d.R' % (rprefix, target, 3) # k is 3
+ fname = make_r_script(r_file, target, tf_dict, jsonTPM_dir, 3, edge_dict)
+ cmd = 'Rscript %s &' % (r_file) # run the Rscript in background
+ os.system(cmd) # UNCOMMENT ME
+ wait_a_moment(MAX_PROCESS, rprefix) # make sure there are not too many R processes running at the same time. If too many, wait. MAX_PROCESS sets the limit.
+
+ if len(target_dict) > 0: # the target has targets
+ count = 0
+ for k in target_dict:
+ successor = k # successor is the target's target
+ tf_dict2 = {target:target_dict[k]} # now target becomes TF, and its successor becomes targets
+ r_file = '../Data/temp/%s_%s_one_K%d.R' % (rprefix, successor, 2) # k is 2, one means consider one edge each time.
+ fname = make_r_script(r_file, successor, tf_dict2, jsonTPM_dir, 2, edge_dict)
+ cmd = 'Rscript %s &' % (r_file) # run the Rscript in background
+ os.system(cmd)
+ r_file = '../Data/temp/%s_%s_one_K%d.R' % (rprefix, successor, 3) # k is 3
+ fname = make_r_script(r_file, successor, tf_dict2, jsonTPM_dir, 3, edge_dict)
+ cmd = 'Rscript %s &' % (r_file) # run the Rscript in background
+ os.system(cmd)
+ count = count + 1
+ if count % MAX_PROCESS == 0:
+ wait_a_moment(MAX_PROCESS, rprefix) # make sure there are not too many R processes running at the same time. If too many, wait. MAX_PROCESS sets the limit.
+
+
+## main
+param_file = sys.argv[1] # a single parameter file for building network, parameter_for_net.txt, in Data/parameter/
+glb_param_dict = make_global_param_dict(param_file)
+GENE_ID_TO_GENE_NAME = glb_param_dict['GENE_ID_AND_GENE_NAME']
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)# for gene names
+curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+
+#print('Make target tf using binding.txt')
+#if os.path.exists('../Data/information/target_tf.txt'):
+# cmd = 'cp ../Data/information/target_tf.txt ../Data/information/target_tf.txt.%s' % (curr_time)
+# os.system(cmd)
+
+target_tf_fname = '../Data/information/target_tf.txt.%s' % (curr_time)
+cmd = 'python make_target_tf.py %s > %s' % (param_file, target_tf_fname) # make target_tf.txt. CHANGE: better to make a temporary copy for this program
+os.system(cmd)
+
+
+#print('Make jsonTPM ...') # CHANGE
+cmd = 'python TPM2JSON.py %s' % (param_file) # make jsonTPM directory. The TPM values are not log-transformed.
+os.system(cmd)
+JSON_DIR = '../Data/history/expr/jsonTPM_%s' % (curr_time) # for each TPM.txt, there should be a unique jsonTPM directory.
+cmd = 'mv ../Data/history/expr/jsonTPM %s' % (JSON_DIR)
+os.system(cmd)
+
+
+#JSON_DIR = '../Data/history/expr/jsonTPM_20170424_154323'
+#target_tf_fname = '../Data/information/target_tf.txt.20170424_154323'
+#print('Establish edges')
+big_tf_dict = make_tf_dict(target_tf_fname) # key is target
+big_target_dict = make_target_dict(target_tf_fname) # key is tf
+rscript_prefix = 'Work' + datetime.now().strftime('%Y%m%d%H%M') # each R script's name starts with WORK followed by time
+edge_dict = make_edge_dict(EDGE_FILE)
+establish_edges(os.path.abspath(JSON_DIR), big_tf_dict, big_target_dict, glb_param_dict, rscript_prefix, edge_dict)
diff --git a/Code/create_edges4_k2.py b/Code/create_edges4_k2.py
new file mode 100644
index 0000000..cffd160
--- /dev/null
+++ b/Code/create_edges4_k2.py
@@ -0,0 +1,253 @@
+# Usage: python create_edges4_k2.py parameter_for_net.txt
+# Purpose:
+# This script will generate MAX_PROCESS work_on_AGI#.R scripts, one for each target gene, so each target is treated separately.
+# The results will be saved as edges.txt.AT3G12580.20170308, where AT3G12580 is the target gene id, and 20170308 is the date.
+# The edges.txt files will be merged together later.
+# Hopefully it will be faster.
+# Make it faster by handling each target separately.
+# Make the memory footprint smaller by splitting TPM.txt into small json files, and converting binding.txt to target_tf.txt first.
+# This way we don't need to load the big matrices, TPM.txt and binding.txt.
+#
+# 7 Mar 2017, slcu, hui
+
+import sys, os, operator, itertools
+from datetime import datetime
+import time
+import json
+import subprocess
+from geneid2name import make_gene_name_AGI_map_dict
+from param4net import make_global_param_dict
+
+EDGE_FILE = '../Data/history/edges/edges.txt'
+EDGE_DIR = '../Data/history/edges/one_target' # a directory storing all edge files, one for each target gene
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt' # for gene names
+TIME_INTERVAL = 10 # wait this long in seconds before launching another Rscript
+MAX_PROCESS = 5 # CHANGE
+K = 2 # CHANGE number of components to use
+
+####################################
+DATA_SYMBOL = '@'
+####################################
+
+def get_gene_list(fname):
+ result = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ result.append(lst[0])
+ f.close()
+ return result
+
+def get_ordered_gene_list(fname):
+ gene_list = get_gene_list(fname)
+ d = {}
+ if not os.path.exists(EDGE_FILE): # guard against a missing edge file, as in create_edges4.py
+ return gene_list
+ f = open(EDGE_FILE)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0].split()[0]
+ tf = lst[1].split()[0]
+ if not target in d:
+ d[target] = 1
+ else:
+ d[target] += 1
+
+ result_gene_lst = []
+ for t in sorted(d.items(), key=operator.itemgetter(1)): # targets with fewer edges will be on the top
+ g = t[0]
+ if g in gene_list:
+ result_gene_lst.append(g)
+ return result_gene_lst
+
+
+def make_tf_dict(fname):
+ d = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0]
+ tf = lst[1]
+ cond = lst[2]
+ if not target in d:
+ d[target] = {tf:cond}
+ else:
+ d[target][tf] = cond
+ f.close()
+ return d
+
+def make_r_script(fname, target, tf_dict, abs_jsonTPM_dir, num_component):
+ head = 'k.lst <- c(%d)\n' % (num_component)
+ head += 'target <- \'%s\'\n' % (target)
+ head += 'id2 <- target\n'
+ tfs = ''
+ conds = ''
+ for k in tf_dict.keys():
+ tfs += '\'%s\',' % (k)
+ conds += '\'%s\',' % (tf_dict[k])
+ head += 'tfs <- c(' + tfs.rstrip(',') + ')\n'
+ head += 'conditions <- c(' + conds.rstrip(',') + ')\n'
+ head += 'jsonTPM.dir <- \'%s\'\n' % (abs_jsonTPM_dir)
+ head += 'AGINAME_FILE <- \'%s\'\n' % (os.path.abspath(GENE_ID_TO_GENE_NAME))
+ head += 'output.file <- paste(\'%s/edges.txt\', id2, gsub(\'-\',\'\',Sys.Date()), \'k%d\', sep=\'.\')\n' % (EDGE_DIR, num_component)
+ body = '''
+ ####### Read data #########################################
+ CORR_THRESHOLD <- 0.7
+ MIN_SIZE <- 100
+ agi <- read.table(AGINAME_FILE, sep='\\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+ #######################################################
+ library(mixtools)
+ library(rjson)
+ name2 <- agi[id2,1]
+ result <- ''
+ for (i in 1:length(tfs)) {
+ curr.date <- gsub('-','',Sys.Date())
+ id1 <- tfs[i]
+ name1 <- agi[id1,1]
+ cond <- conditions[i]
+
+ file.x <- paste(jsonTPM.dir, paste(id1, '.json', sep=''), sep='/')
+ if (!file.exists(file.x)) { next }
+ x <- as.data.frame(fromJSON(file = file.x))
+ x <- log(x+1)
+ rcond.x <- names(x)
+ x <- as.vector(t(x)) # convert it to a vector
+
+ file.y <- paste(jsonTPM.dir, paste(id2, '.json', sep=''), sep='/')
+ if (!file.exists(file.y)) { break }
+ y <- as.data.frame(fromJSON(file = file.y))
+ y <- log(y+1)
+ rcond.y <- names(y)
+ y <- as.vector(t(y)) # convert it to a vector
+
+ rna.sample.id <- rcond.x
+ if (all(rcond.x == rcond.y) == FALSE | id1 == id2) { # if the IDs in two json files do not match, or target is the same as tf, then ignore
+ next
+ }
+
+ index <- x < 0.01 | y < 0.01 # don't include data that is too small
+ x <- x[!index]
+ y <- y[!index]
+ r <- cor(x, y)
+ if (abs(r) >= CORR_THRESHOLD) {
+ s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%s\\t%s\\n', id2, name2, id1, name1, r, 'all', '.', cond, '.', curr.date)
+ result <- paste(result, s, sep='')
+ next # a good correlation is found using all experiments, so not necessary to look further
+ }
+
+ rna.sample.id <- rna.sample.id[!index] # important to make the index work
+
+ pos_r_max <- -2
+ pos_r_N <- 0
+ pos_r_index <- c()
+ pos_r_loglik <- -100000000
+
+ neg_r_max <- 2
+ neg_r_N <- 0
+ neg_r_index <- c()
+ neg_r_loglik <- -100000000
+
+ for (k in k.lst) {
+ em.out <- regmixEM(y, x, maxit=150, epsilon=1e-04, k=k)
+ for (j in seq(1,k,1)) {
+ index <- which(max.col(em.out$posterior) == j)
+ size <- length(index)
+ r <- cor(em.out$x[index,2], em.out$y[index])
+ if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
+ pos_r_max <- r
+ pos_r_N <- size
+ pos_r_index <- index
+ pos_r_loglik <- em.out$loglik
+ }
+ if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
+ neg_r_max <- r
+ neg_r_N <- size
+ neg_r_index <- index
+ neg_r_loglik <- em.out$loglik
+ }
+ }
+ }
+
+ if (pos_r_max > 0) { # has a good positive correlation
+ sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
+ s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%4.2f\\t%s\\n', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
+ result <- paste(result, s, sep='')
+ }
+ if (neg_r_max < 0) { # has a good negative correlation
+ sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
+ s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%4.2f\\t%s\\n', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
+ result <- paste(result, s, sep='')
+ }
+ }
+ cat(result, file=output.file, sep='')
+ '''
+ f = open(fname, 'w')
+ content = head + body
+ f.write('\n'.join([line.lstrip('\t').rstrip() for line in content.split('\n')]))
+ f.close()
+ return fname
+
+def wait_a_moment(n, prefix):
+ ''' if there are more than n work_on...R scripts running, wait... '''
+ time.sleep(TIME_INTERVAL)
+ ps = subprocess.Popen('ps aux | grep %s' % (prefix), shell=True, stdout=subprocess.PIPE) # CHANGE
+ process_lst = ps.communicate()[0].split('\n')
+ while (len(process_lst) > n):
+ #print('number of running processes %d' % (len(process_lst)))
+ time.sleep(TIME_INTERVAL)
+ ps = subprocess.Popen('ps aux | grep %s' % (prefix), shell=True, stdout=subprocess.PIPE)
+ process_lst = ps.communicate()[0].split('\n')
+
+def establish_edges(jsonTPM_dir, d, glb_param_dict, rprefix):
+ ''' d - binding dictionary {target:{tf1:c1, tf2:c2}, ... }, c1 c2 are strings of conditions '''
+
+ gene_lst = get_ordered_gene_list(glb_param_dict['GENE_LIST']) # targets with fewer edges will get higher priority. For example, those targets never having an edge will be treated first
+ high_gene_lst = glb_param_dict['HIGH_PRIORITY_GENE'].split() # high priority genes CHANGE
+
+ if not os.path.isdir(EDGE_DIR):
+ os.makedirs(EDGE_DIR)
+
+ # make a list of targets, putting high-priority targets at the beginning
+ final_gene_lst = high_gene_lst
+ for x in gene_lst:
+ if not x in high_gene_lst:
+ final_gene_lst.append(x)
+
+ # process each target
+ for target in final_gene_lst: # high priority genes are processed first
+ if target in d: # target g is in binding dictionary d
+ tf_dict = d[target] # in the form of {tf1:c1, tf2:c2}
+ if len(tf_dict) > 0: # it has TFs, which is usually the case
+ r_file = '../Data/temp/%s_%s_K%d.R' % (rprefix, target, K)
+ fname = make_r_script(r_file, target, tf_dict, jsonTPM_dir, K)
+ cmd = 'Rscript %s &' % (r_file) # run the Rscript in background
+ os.system(cmd) # UNCOMMENT ME
+ wait_a_moment(MAX_PROCESS, rprefix) # make sure there are not too many R processes running at the same time. If too many, wait. MAX_PROCESS sets the limit.
+
+## main
+param_file = sys.argv[1] # a single parameter file for building network, parameter_for_net.txt
+glb_param_dict = make_global_param_dict(param_file)
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME) # for gene names
+
+#print('Make jsonTPM ...') # CHANGE
+cmd = 'python TPM2JSON.py %s' % (param_file) # make jsonTPM directory. The TPM values are not log-transformed.
+os.system(cmd)
+curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+JSON_DIR = '../Data/history/expr/jsonTPM_%s' % (curr_time) # for each TPM.txt, there should be a unique jsonTPM directory.
+cmd = 'mv ../Data/history/expr/jsonTPM %s' % (JSON_DIR)
+os.system(cmd)
+
+#print('Make target tf using binding.txt')
+target_tf_fname = '../Data/information/target_tf.txt.' + curr_time
+cmd = 'python make_target_tf.py %s > %s' % (param_file, target_tf_fname) # make target_tf.txt. CHANGE: better to make a temporary copy for this program
+os.system(cmd)
+
+#JSON_DIR = '../Data/history/expr/jsonTPM_20170310_1153'
+#target_tf_fname = '../Data/information/target_tf.txt.20170310_1153'
+#print('Establish edges')
+big_tf_dict = make_tf_dict(target_tf_fname)
+rscript_prefix = 'WORK%s' % (datetime.now().strftime('%Y%m%d_%H%M'))
+establish_edges(os.path.abspath(JSON_DIR), big_tf_dict, glb_param_dict, rscript_prefix)
diff --git a/Code/create_edges4_k3.py b/Code/create_edges4_k3.py
new file mode 100644
index 0000000..67f720b
--- /dev/null
+++ b/Code/create_edges4_k3.py
@@ -0,0 +1,255 @@
+# Usage: python create_edges4_k3.py parameter_for_net.txt
+# Purpose:
+# This script will generate MAX_PROCESS work_on_AGI#.R scripts, one for each target gene, so each target is treated separately.
+# The results will be saved as edges.txt.AT3G12580.20170308, where AT3G12580 is the target gene id, and 20170308 is the date.
+# The edges.txt files will be merged together later.
+# Hopefully it will be faster.
+# Make it faster by handling each target separately.
+# Make the memory footprint smaller by splitting TPM.txt into small json files, and converting binding.txt to target_tf.txt first.
+# This way we don't need to load the big matrices, TPM.txt and binding.txt.
+#
+# 7 Mar 2017, slcu, hui
+
+import sys, os, operator, itertools
+from datetime import datetime
+import time
+import json
+import subprocess
+from geneid2name import make_gene_name_AGI_map_dict
+from param4net import make_global_param_dict
+
+EDGE_FILE = '../Data/history/edges/edges.txt'
+EDGE_DIR = '../Data/history/edges/one_target' # a directory storing all edge files, one for each target gene
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt' # for gene names
+TIME_INTERVAL = 10 # wait this long in seconds before launching another Rscript
+MAX_PROCESS = 5 # CHANGE
+K = 3 # CHANGE number of components to use
+
+####################################
+GLB_PARAM_SYMBOL = '%%'
+DATA_SYMBOL = '@'
+####################################
+
+def get_gene_list(fname):
+ result = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ result.append(lst[0])
+ f.close()
+ return result
+
+
+def get_ordered_gene_list(fname):
+ gene_list = get_gene_list(fname)
+ d = {}
+ if not os.path.exists(EDGE_FILE): # guard against a missing edge file, as in create_edges4.py
+ return gene_list
+ f = open(EDGE_FILE)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0].split()[0]
+ tf = lst[1].split()[0]
+ if not target in d:
+ d[target] = 1
+ else:
+ d[target] += 1
+
+ result_gene_lst = []
+ for t in sorted(d.items(), key=operator.itemgetter(1)): # targets with fewer edges will be on the top
+ g = t[0]
+ if g in gene_list:
+ result_gene_lst.append(g)
+ return result_gene_lst
+
+
+def make_tf_dict(fname):
+ d = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0]
+ tf = lst[1]
+ cond = lst[2]
+ if not target in d:
+ d[target] = {tf:cond}
+ else:
+ d[target][tf] = cond
+ f.close()
+ return d
+
+def make_r_script(fname, target, tf_dict, abs_jsonTPM_dir, num_component):
+ head = 'k.lst <- c(%d)\n' % (num_component)
+ head += 'target <- \'%s\'\n' % (target)
+ head += 'id2 <- target\n'
+ tfs = ''
+ conds = ''
+ for k in tf_dict.keys():
+ tfs += '\'%s\',' % (k)
+ conds += '\'%s\',' % (tf_dict[k])
+ head += 'tfs <- c(' + tfs.rstrip(',') + ')\n'
+ head += 'conditions <- c(' + conds.rstrip(',') + ')\n'
+ head += 'jsonTPM.dir <- \'%s\'\n' % (abs_jsonTPM_dir)
+ head += 'AGINAME_FILE <- \'%s\'\n' % (os.path.abspath(GENE_ID_TO_GENE_NAME))
+ head += 'output.file <- paste(\'%s/edges.txt\', id2, gsub(\'-\',\'\',Sys.Date()), \'k%d\', sep=\'.\')\n' % (EDGE_DIR, num_component)
+ body = '''
+ ####### Read data #########################################
+ CORR_THRESHOLD <- 0.7
+ MIN_SIZE <- 100
+ agi <- read.table(AGINAME_FILE, sep='\\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+ #######################################################
+ library(mixtools)
+ library(rjson)
+ name2 <- agi[id2,1]
+ result <- ''
+ for (i in 1:length(tfs)) {
+ curr.date <- gsub('-','',Sys.Date())
+ id1 <- tfs[i]
+ name1 <- agi[id1,1]
+ cond <- conditions[i]
+
+ file.x <- paste(jsonTPM.dir, paste(id1, '.json', sep=''), sep='/')
+ if (!file.exists(file.x)) { next }
+ x <- as.data.frame(fromJSON(file = file.x))
+ x <- log(x+1)
+ rcond.x <- names(x)
+ x <- as.vector(t(x)) # convert it to a vector
+
+ file.y <- paste(jsonTPM.dir, paste(id2, '.json', sep=''), sep='/')
+ if (!file.exists(file.y)) { break }
+ y <- as.data.frame(fromJSON(file = file.y))
+ y <- log(y+1)
+ rcond.y <- names(y)
+ y <- as.vector(t(y)) # convert it to a vector
+
+ rna.sample.id <- rcond.x
+ if (all(rcond.x == rcond.y) == FALSE | id1 == id2) { # if the IDs in two json files do not match, or target is the same as tf, then ignore
+ next
+ }
+
+ index <- x < 0.01 | y < 0.01 # don't include data that is too small
+ x <- x[!index]
+ y <- y[!index]
+ r <- cor(x, y)
+ if (abs(r) >= CORR_THRESHOLD) {
+ s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%s\\t%s\\n', id2, name2, id1, name1, r, 'all', '.', cond, '.', curr.date)
+ result <- paste(result, s, sep='')
+ next # a good correlation is found using all experiments, so not necessary to look further
+ }
+
+ rna.sample.id <- rna.sample.id[!index] # important to make the index work
+
+ pos_r_max <- -2
+ pos_r_N <- 0
+ pos_r_index <- c()
+ pos_r_loglik <- -100000000
+
+ neg_r_max <- 2
+ neg_r_N <- 0
+ neg_r_index <- c()
+ neg_r_loglik <- -100000000
+
+ for (k in k.lst) {
+ em.out <- regmixEM(y, x, maxit=150, epsilon=1e-04, k=k)
+ for (j in seq(1,k,1)) {
+ index <- which(max.col(em.out$posterior) == j)
+ size <- length(index)
+ r <- cor(em.out$x[index,2], em.out$y[index])
+ if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
+ pos_r_max <- r
+ pos_r_N <- size
+ pos_r_index <- index
+ pos_r_loglik <- em.out$loglik
+ }
+ if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
+ neg_r_max <- r
+ neg_r_N <- size
+ neg_r_index <- index
+ neg_r_loglik <- em.out$loglik
+ }
+ }
+ }
+
+ if (pos_r_max > 0) { # has a good positive correlation
+ sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
+ s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%4.2f\\t%s\\n', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
+ result <- paste(result, s, sep='')
+ }
+ if (neg_r_max < 0) { # has a good negative correlation
+ sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
+ s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%4.2f\\t%s\\n', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
+ result <- paste(result, s, sep='')
+ }
+ }
+ cat(result, file=output.file, sep='')
+ '''
+ f = open(fname, 'w')
+ content = head + body
+ f.write('\n'.join([line.lstrip('\t').rstrip() for line in content.split('\n')]))
+ f.close()
+ return fname
+
+def wait_a_moment(n, prefix):
+ ''' if there are more than n work_on...R scripts running, wait... '''
+ time.sleep(TIME_INTERVAL)
+ ps = subprocess.Popen('ps aux | grep %s' % (prefix), shell=True, stdout=subprocess.PIPE) # CHANGE
+ process_lst = ps.communicate()[0].split('\n')
+ while (len(process_lst) > n):
+ #print('number of running processes %d' % (len(process_lst)))
+ time.sleep(TIME_INTERVAL)
+ ps = subprocess.Popen('ps aux | grep %s' % (prefix), shell=True, stdout=subprocess.PIPE)
+ process_lst = ps.communicate()[0].split('\n')
+
+def establish_edges(jsonTPM_dir, d, glb_param_dict, rprefix):
+ ''' d - binding dictionary {target:{tf1:c1, tf2:c2}, ... }, c1 c2 are strings of conditions '''
+
+ gene_lst = get_ordered_gene_list(glb_param_dict['GENE_LIST']) # targets with fewer edges will get higher priority. For example, those targets never having an edge will be treated first
+ high_gene_lst = glb_param_dict['HIGH_PRIORITY_GENE'].split() # high priority genes CHANGE
+
+ if not os.path.isdir(EDGE_DIR):
+ os.makedirs(EDGE_DIR)
+
+ # make a list of targets, putting high-priority targets at the beginning
+ final_gene_lst = high_gene_lst
+ for x in gene_lst:
+ if not x in high_gene_lst:
+ final_gene_lst.append(x)
+
+ # process each target
+ for target in final_gene_lst: # high priority genes are processed first
+ if target in d: # target g is in binding dictionary d
+ tf_dict = d[target] # in the form of {tf1:c1, tf2:c2}
+ if len(tf_dict) > 0: # it has TFs, which is usually the case
+ r_file = '../Data/temp/%s_%s_K%d.R' % (rprefix, target, K)
+ fname = make_r_script(r_file, target, tf_dict, jsonTPM_dir, K)
+ cmd = 'Rscript %s &' % (r_file) # run the Rscript in background
+ os.system(cmd) # UNCOMMENT ME
+ wait_a_moment(MAX_PROCESS, rprefix) # make sure there are not too many R processes running at the same time. If too many, wait. MAX_PROCESS sets the limit.
+
+## main
+param_file = sys.argv[1] # a single parameter file for building network, parameter_for_net.txt
+glb_param_dict = make_global_param_dict(param_file)
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME) # for gene names
+
+#print('Make jsonTPM ...') # CHANGE
+cmd = 'python TPM2JSON.py %s' % (param_file) # make jsonTPM directory. The TPM values are not log-transformed.
+os.system(cmd)
+curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+JSON_DIR = '../Data/history/expr/jsonTPM_%s' % (curr_time) # for each TPM.txt, there should be a unique jsonTPM directory.
+cmd = 'mv ../Data/history/expr/jsonTPM %s' % (JSON_DIR)
+os.system(cmd)
+
+#print('Make target tf using binding.txt')
+target_tf_fname = '../Data/information/target_tf.txt.' + curr_time
+cmd = 'python make_target_tf.py %s > %s' % (param_file, target_tf_fname) # make target_tf.txt. CHANGE: better to make a temporary copy for this program
+os.system(cmd)
+
+#JSON_DIR = '../Data/history/expr/jsonTPM_20170310_1153'
+#target_tf_fname = '../Data/information/target_tf.txt.20170310_1153'
+#print('Establish edges')
+big_tf_dict = make_tf_dict(target_tf_fname)
+rscript_prefix = 'WORK%s' % (datetime.now().strftime('%Y%m%d_%H%M'))
+establish_edges(os.path.abspath(JSON_DIR), big_tf_dict, glb_param_dict, rscript_prefix)
diff --git a/Code/create_edges_k2.R b/Code/create_edges_k2.R
new file mode 100644
index 0000000..a044ed0
--- /dev/null
+++ b/Code/create_edges_k2.R
@@ -0,0 +1,136 @@
+# Last modified 13 August 2019
+
+TARGET_TF_FILE <- "../Data/information/target_tf.txt"
+DATA_FILE <- "../Data/history/expr/TPM.txt" # A TPM table
+AGINAME_FILE <- "../Data/information/AGI-to-gene-names_v2.txt"
+CORR_THRESHOLD <- 0.5
+MIN_SIZE <- 100
+
+
+# Make sure we have required files
+if (! file.exists(TARGET_TF_FILE)) {
+ stop(sprintf('[create_edges_k2.R] Unable to find %s', TARGET_TF_FILE))
+}
+
+if (! file.exists(DATA_FILE)) {
+ stop(sprintf('[create_edges_k2.R] Unable to find %s', DATA_FILE))
+}
+
+if (! file.exists(AGINAME_FILE)) {
+ stop(sprintf('[create_edges_k2.R] Unable to find %s', AGINAME_FILE))
+}
+
+
+####### Read data #########################################
+X <- read.table(DATA_FILE, header=TRUE, check.names=FALSE)
+gene_id <- X$gene_id
+X$gene_id <- NULL
+row.names(X) <- gene_id
+X <- as.matrix(X)
+rna.sample.id <- colnames(X)
+
+target_tf <- read.table(TARGET_TF_FILE, sep='\t', header=FALSE)
+target_tf <- as.matrix(target_tf)
+targets <- target_tf[,1]
+tfs <- target_tf[,2]
+conditions <- target_tf[,3]
+
+agi <- read.table(AGINAME_FILE, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+#######################################################
+
+library(mixtools)
+options(max.print=999999999)
+output.file <- paste('../Data/history/edges/one_target/edges.txt', 'mixtools', format(Sys.time(), '%b.%d.%Y.%H%M%S'), sep='.')
+f <- file(output.file, 'w')
+
+for (i in 1:length(targets)) {
+ curr.date <- gsub('-','',Sys.Date())
+ id1 <- tfs[i]
+ id2 <- targets[i]
+ if (id1 %in% gene_id == F || id2 %in% gene_id == F) {
+ next
+ }
+
+ name1 <- agi$V2[which(agi$V1 == id1)]
+ name2 <- agi$V2[which(agi$V1 == id2)]
+
+ cond <- conditions[i]
+ x <- X[id1,]
+ y <- X[id2,]
+ x <- log(x+1)
+ y <- log(y+1)
+ index <- x < 0.01 | y < 0.01
+ x <- x[!index]
+ y <- y[!index]
+ if (length(x) < 3 | sd(x) < 0.1 | sd(y) < 0.1 ) {
+ next
+ }
+ r <- cor(x, y)
+ if (abs(r) >= CORR_THRESHOLD) {
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2,id1,name1, r, 'all', '.', cond, '.', curr.date)
+ #cat(s, file=result.file, sep='\n', append=T)
+ #cat(s, sep='\n')
+ #flush.console()
+ #write.table(s, file.name, quote=F, sep='', row.names=F, append=T, col.names=F)
+ next
+ }
+
+ k <- 2
+ N <- length(x)
+ em.out <- tryCatch( regmixEM(y, x, maxit=100, epsilon=1e-03, k=k), error=function(e) NULL )
+ if (is.null(em.out)) { # if regmixEM errors out, skip; assigning the tryCatch result also avoids reusing a stale em.out from a previous iteration
+ next
+ }
+
+ pos_r_max <- -2
+ pos_r_N <- 0
+ pos_r_index <- c()
+ pos_r_loglik <- -100000000
+
+ neg_r_max <- 2
+ neg_r_N <- 0
+ neg_r_index <- c()
+ neg_r_loglik <- -100000000
+
+ for (j in seq(1,k,1)) {
+
+ index <- which(max.col(em.out$posterior) == j)
+ size <- length(index)
+ r <- cor(em.out$x[index,2], em.out$y[index])
+
+ if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
+ pos_r_max <- r
+ pos_r_N <- size
+ pos_r_index <- index
+ pos_r_loglik <- em.out$loglik
+ }
+ if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
+ neg_r_max <- r
+ neg_r_N <- size
+ neg_r_index <- index
+ neg_r_loglik <- em.out$loglik
+ }
+ }
+
+ if (pos_r_max > 0) {
+ sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
+ num.sub.cond <- length(rna.sample.id[pos_r_index])
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%d\t%s\t%4.2f\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, pos_r_max, 'mix', num.sub.cond, cond, pos_r_loglik, curr.date, pos_r_max, 'mixtool')
+ #cat(s, file=result.file, sep='\n', append=T)
+ #cat(s, sep='\n')
+ #write.table(s, file.name, quote=F, sep='', row.names=F, append=T, col.names=F)
+ cat(s, file=f, sep='')
+ }
+
+ if (neg_r_max < 0) {
+ sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
+ num.sub.cond <- length(rna.sample.id[neg_r_index])
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%d\t%s\t%4.2f\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, neg_r_max, 'mix', num.sub.cond, cond, neg_r_loglik, curr.date, neg_r_max, 'mixtool')
+ #cat(s, file=result.file, sep='\n', append=T)
+ #cat(s, sep='\n')
+ #write.table(s, file.name, quote=F, sep='', row.names=F, append=T, col.names=F)
+ cat(s, file=f, sep='')
+ }
+}
+
+close(f)
\ No newline at end of file
diff --git a/Code/create_edges_mixtool.R b/Code/create_edges_mixtool.R
new file mode 100644
index 0000000..6ed852b
--- /dev/null
+++ b/Code/create_edges_mixtool.R
@@ -0,0 +1,154 @@
+# Last modified 13 August 2019
+
+TARGET_TF_FILE <- "../Data/information/target_tf.txt"
+DATA_FILE <- "../Data/history/expr/TPM.txt" # A TPM table
+AGINAME_FILE <- "../Data/information/AGI-to-gene-names_v2.txt"
+CORR_THRESHOLD <- 0.5
+MIN_SIZE <- 100
+
+
+# Make sure we have required files
+if (! file.exists(TARGET_TF_FILE)) {
+ stop(sprintf('[create_edges_mixtool.R] Unable to find %s', TARGET_TF_FILE))
+}
+
+if (! file.exists(DATA_FILE)) {
+ stop(sprintf('[create_edges_mixtool.R] Unable to find %s', DATA_FILE))
+}
+
+if (! file.exists(AGINAME_FILE)) {
+ stop(sprintf('[create_edges_mixtool.R] Unable to find %s', AGINAME_FILE))
+}
+
+
+###### get commandline arguments #########################
+args <- commandArgs(trailingOnly=TRUE)
+#k <- strtoi(args[1])
+k <- 2
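+# Note: k (the number of mixture components) is fixed at 2 here; the commented
+# strtoi line above shows how it could be taken from the command line instead.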
+
+####### Read data #########################################
+X <- read.table(DATA_FILE, header=TRUE, check.names=FALSE)
+gene_id <- X$gene_id
+X$gene_id <- NULL
+row.names(X) <- gene_id
+X <- as.matrix(X)
+rna.sample.id <- colnames(X)
+
+target_tf <- read.table(TARGET_TF_FILE, sep='\t', header=FALSE)
+target_tf <- as.matrix(target_tf)
+#target_tf <- target_tf[nrow(target_tf):1,] # reverse rows
+targets <- target_tf[,1]
+tfs <- target_tf[,2]
+conditions <- target_tf[,3]
+
+agi <- read.table(AGINAME_FILE, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+#######################################################
+
+library(mixtools)
+options(max.print=999999999)
+output.file <- paste('../Data/history/edges/one_target/edges.txt', 'mixtools', format(Sys.time(), '%b.%d.%Y.%H%M%S'), sep='.')
+f <- file(output.file, 'w')
+
+
+for (i in 1:length(targets)) {
+ curr.date <- gsub('-','',Sys.Date())
+ id1 <- tfs[i]
+ id2 <- targets[i]
+ if (id1 %in% gene_id == F || id2 %in% gene_id == F) {
+ next
+ }
+
+ name1 <- agi$V2[which(agi$V1 == id1)]
+ name2 <- agi$V2[which(agi$V1 == id2)]
+
+ cond <- conditions[i]
+ x <- X[id1,]
+ y <- X[id2,]
+
+ na.ratio <- max(sum(is.na(x))/length(x), sum(is.na(y))/length(y))
+ if (na.ratio > 0.5) {
+ next
+ }
+
+ x <- log(x+1)
+ y <- log(y+1)
+ sdx <- sd(x)
+ sdy <- sd(y)
+ index <- x < 0.01 | y < 0.01
+ x <- x[!index]
+ y <- y[!index]
+ if (length(x) < 20 || sdx < 0.25 || sdy < 0.5 ) {
+ print(i)
+ next
+ }
+
+ r <- cor(x, y)
+ if (abs(r) >= CORR_THRESHOLD) {
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2,id1,name1, r, 'all', '.', cond, '.', curr.date)
+ #cat(s, file=result.file, sep='\n', append=T)
+ #cat(s, sep='\n')
+ #flush.console()
+ #write.table(s, file.name, quote=F, sep='', row.names=F, append=T, col.names=F)
+ next
+ }
+
+
+ N <- length(x)
+ em.out <- tryCatch( regmixEM(y, x, maxit=100, epsilon=1e-03, k=k), error=function(e) NULL )
+ if (is.null(em.out)) { # if regmixEM errors out, skip; assigning the tryCatch result also avoids reusing a stale em.out from a previous iteration
+ next
+ }
+
+ pos_r_max <- -2
+ pos_r_N <- 0
+ pos_r_index <- c()
+ pos_r_loglik <- -100000000
+
+ neg_r_max <- 2
+ neg_r_N <- 0
+ neg_r_index <- c()
+ neg_r_loglik <- -100000000
+
+ for (j in seq(1,k,1)) {
+
+ index <- which(max.col(em.out$posterior) == j)
+ size <- length(index)
+ r <- cor(em.out$x[index,2], em.out$y[index])
+
+ if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
+ pos_r_max <- r
+ pos_r_N <- size
+ pos_r_index <- index
+ pos_r_loglik <- em.out$loglik
+ }
+ if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
+ neg_r_max <- r
+ neg_r_N <- size
+ neg_r_index <- index
+ neg_r_loglik <- em.out$loglik
+ }
+ }
+
+ if (pos_r_max > 0 && pos_r_loglik > -3000) {
+ sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
+ num.sub.cond <- length(rna.sample.id[pos_r_index])
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%d\t%s\t%4.2f\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, pos_r_max, 'mix', num.sub.cond, cond, pos_r_loglik, curr.date, pos_r_max, 'mixtool')
+ #cat(s, file=result.file, sep='\n', append=T)
+ #cat(s, sep='\n')
+ #write.table(s, file.name, quote=F, sep='', row.names=F, append=T, col.names=F)
+ cat(s, file=f, sep='')
+ }
+
+ if (neg_r_max < 0 && neg_r_loglik > -3000) {
+ sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
+ num.sub.cond <- length(rna.sample.id[neg_r_index])
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%d\t%s\t%4.2f\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, neg_r_max, 'mix', num.sub.cond, cond, neg_r_loglik, curr.date, neg_r_max, 'mixtool')
+ #cat(s, file=result.file, sep='\n', append=T)
+ #cat(s, sep='\n')
+ #write.table(s, file.name, quote=F, sep='', row.names=F, append=T, col.names=F)
+ cat(s, file=f, sep='')
+ }
+ cat('', file=f, sep='')
+}
+
+close(f)
diff --git a/Code/degree_of_separation.py b/Code/degree_of_separation.py
new file mode 100644
index 0000000..b7eba92
--- /dev/null
+++ b/Code/degree_of_separation.py
@@ -0,0 +1,43 @@
+# Usage: python degree_of_separation.py edges.txt
+# Purpose: get the maximum degree of separation (the diameter of the network)
+import os, sys
+import networkx as nx
+#import util_networkx
+from networkx.algorithms.distance_measures import diameter
+
+def build_network_from_file(edge_fname):
+ G = nx.DiGraph()
+ source_nodes = []
+ f = open(edge_fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ if line != '':
+ strength = float(lst[8])
+ method_or_tissue = lst[9]
+ g1 = lst[0].split()[0] # target gene ID
+ g1_label = lst[0].split()[1].split(';')[0] if lst[0].split()[1] != '.' else g1
+ g1_name = lst[0].split()[1] if lst[0].split()[1] != '.' else ''
+ g2 = lst[1].split()[0] # source gene ID
+ g2_label = lst[1].split()[1].split(';')[0] if lst[1].split()[1] != '.' else g2
+ g2_name = lst[1].split()[1] if lst[1].split()[1] != '.' else ''
+ G.add_node(g1, full_name=g1_name, label=g1_label) # target gene
+ G.add_node(g2, full_name=g2_name, label=g2_label) # source (TF) gene
+
+ G.add_edge(g2, g1, weight=float(lst[2]), strength=strength, method=method_or_tissue) # g2 is source, and g1 is target
+ source_nodes.append(g2)
+
+ f.close()
+
+ source_nodes = list(set(source_nodes))
+ return G, source_nodes
+
+
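+# Note (sketch, not part of the original pipeline): networkx's diameter raises
+# an exception when the undirected graph is not connected. If edges.txt happens
+# to describe several components, one workaround is to measure only the largest
+# connected component:
+def diameter_of_largest_component(G_undirected):
+    # pick the largest connected component and measure its diameter
+    largest = max(nx.connected_components(G_undirected), key=len)
+    return diameter(G_undirected.subgraph(largest))
+
+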
+## main
+print('Load graph...')
+G, source_nodes = build_network_from_file(sys.argv[1])
+print('Convert to undirected...')
+G2 = G.to_undirected()
+print('Compute diameter...')
+d1 = diameter(G2)
+print('Graph diameter: %d' % (d1))
diff --git a/Code/degree_of_separation2.py b/Code/degree_of_separation2.py
new file mode 100644
index 0000000..fd8d6e7
--- /dev/null
+++ b/Code/degree_of_separation2.py
@@ -0,0 +1,45 @@
+# Usage: python degree_of_separation2.py edges.txt
+# Purpose: get the degree of separation (eccentricity) for each node
+import os, sys
+import networkx as nx
+#import util_networkx
+from networkx.algorithms.distance_measures import diameter, eccentricity
+
+def build_network_from_file(edge_fname):
+ G = nx.DiGraph()
+ source_nodes = []
+ f = open(edge_fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ if line != '':
+ strength = float(lst[8])
+ method_or_tissue = lst[9]
+ g1 = lst[0].split()[0] # target gene ID
+ g1_label = lst[0].split()[1].split(';')[0] if lst[0].split()[1] != '.' else g1
+ g1_name = lst[0].split()[1] if lst[0].split()[1] != '.' else ''
+ g2 = lst[1].split()[0] # source gene ID
+ g2_label = lst[1].split()[1].split(';')[0] if lst[1].split()[1] != '.' else g2
+ g2_name = lst[1].split()[1] if lst[1].split()[1] != '.' else ''
+ G.add_node(g1, full_name=g1_name, label=g1_label) # target gene
+ G.add_node(g2, full_name=g2_name, label=g2_label) # source (TF) gene
+
+ G.add_edge(g2, g1, weight=float(lst[2]), strength=strength, method=method_or_tissue) # g2 is source, and g1 is target
+ source_nodes.append(g2)
+
+ f.close()
+
+ source_nodes = list(set(source_nodes))
+ return G, source_nodes
+
+
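+# Note (illustration): eccentricity called without a node argument returns a
+# dict for all nodes at once, so the loop below could equivalently be written
+# as (same networkx API):
+#
+#   ecc = eccentricity(G2)          # {node: eccentricity}
+#   for node, d in sorted(ecc.items()):
+#       print('%s\t%d' % (node, d))
+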
+## main
+print('Load graph...')
+G, source_nodes = build_network_from_file(sys.argv[1])
+print('Convert to undirected...')
+G2 = G.to_undirected()
+print('Compute eccentricity...')
+for node in G2.nodes():
+ d = eccentricity(G2,v=node)
+ print('%s\t%d' % (node, d))
+
diff --git a/Code/delete_not_used_fastq.py b/Code/delete_not_used_fastq.py
new file mode 100644
index 0000000..67d368b
--- /dev/null
+++ b/Code/delete_not_used_fastq.py
@@ -0,0 +1,43 @@
+# Usage: python delete_not_used_fastq.py
+# Edit DIR, the directory containing all fastq.gz files.
+# Also generate USED_IDS.
+# Purpose: list fastq.gz files that are not used, and move them to the to.be.deleted folder.
+#
+# 20 Apr 2017, slcu, hui
+
+import glob, os
+
+def read_ids(fname):
+ f = open(fname)
+ d = {}
+ for line in f:
+ line = line.strip()
+ d[line] = 1
+ f.close()
+ return d
+
+DIR = '/home/hui/network/R/Raw'
+destDIR = os.path.join(DIR, 'to.be.deleted')
+USED_IDS = '/home/hui/network/v03/Data/temp/used.sra.ids.txt' # generated by grep @ /home/hui/network/v03/Data/parameter/parameter_for_buildRmatrix.txt | grep 'SRR\|ERR\|DRR' | perl -pe 'substr($_, 0, 3) = ""; s/X+$//'
+
+
+if not os.path.isdir(destDIR):
+ os.makedirs(destDIR)
+
+ids = read_ids(USED_IDS)
+flst = glob.glob(os.path.join(DIR, '*.gz'))
+
+print('file\tsize.in.G')
+total = 0 # accumulated size of moved files, in GB (named 'total' so as not to shadow the Python builtin sum)
+for path in flst:
+ fname = os.path.basename(path)
+ if '_' in fname:
+ i = fname[0:fname.find('_')]
+ else:
+ i = fname[0:fname.find('.')]
+ if not i in ids:
+ size_gb = 1.0*os.path.getsize(path)/(1024*1024*1024)
+ print('%s\t%4.2f' % (path, size_gb))
+ total += size_gb
+ cmd = 'mv %s %s' % (path, destDIR)
+ os.system(cmd)
+print('Total %4.2f G moved to %s' % (total, destDIR))
diff --git a/Code/download_and_map.py b/Code/download_and_map.py
new file mode 100644
index 0000000..95a4753
--- /dev/null
+++ b/Code/download_and_map.py
@@ -0,0 +1,390 @@
+# Usage: python download_and_map.py
+# python download_and_map.py run_ids.txt
+#
+# Edit DAILY_DOWNLOAD_NUMBER and MIN_FILE_SIZE
+#
+# This program checks RNA_SEQ_INFO_FILE for not-yet-downloaded, *public* RNA-seq data, makes a list of them, and downloads and maps them using Salmon. It is very important to prepare
+# RNA_SEQ_INFO_FILE (see parse_ena_xml.py). In fact, only the first column of RNA_SEQ_INFO_FILE is required: a list of RNA-seq IDs.
+#
+# Purpose: automate downloading RNA-seq files from ENA or SRA. This program checks MAPPED_RDATA_DIR and RAW_RDATA_DIR to ensure that we are not re-mapping or re-downloading already mapped or downloaded data.
+#
+# Note: use download_data2() to download from SRA, and download_and_map_data() to download from ENA (closer to Cambridge so faster). This script depends on get_TPM_by_salmon.py
+#
+# 23 DEC 2016, hui, slcu. Updated: 9 Feb 2017
+# Last modified 10 APR 2017, hui, slcu
+# Last reviewed 31 July 2018
+
+import os, sys, glob, json
+import fnmatch
+import time
+import re
+from datetime import datetime
+
+##########################################################################################
+from configure import DAILY_MAP_NUMBER, MIN_FASTQ_FILE_SIZE, RNA_SEQ_INFO_FILE, DOWNLOADED_SRA_ID_LOG_FILE, IGNORED_SRA_ID_LOG_FILE, MAPPED_RDATA_DIR, RAW_RDATA_DIR, SALMON_MAP_RESULT_DIR
+
+FASTQ_DUMP_PATH = '/home/hui/software/sratoolkit/sratoolkit.2.8.0-ubuntu64/bin/fastq-dump'
+
+##########################################################################################
+
+def glob_files(directory, pattern):
+ ''' return all file names (without paths) given directory and pattern '''
+ result = []
+ for root, dirnames, filenames in os.walk(directory):
+ for filename in fnmatch.filter(filenames, pattern):
+ result.append(filename)
+ return result
+
+
+def glob_files_include_path(directory, pattern):
+ ''' return all file names (with paths) given directory and pattern '''
+ result = []
+
+ for root, dirnames, filenames in os.walk(directory):
+ for filename in fnmatch.filter(filenames, pattern):
+ result.append(os.path.join(root, filename))
+ # also check download*.txt files, where downloaded files are recorded.
+ for fname in glob.glob(os.path.join(directory, 'download*.txt')):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if fnmatch.fnmatch(os.path.basename(line), pattern):
+ result.append(os.path.join(directory, line))
+ return result
+
+
+def get_list(fname):
+ ''' Convert a file to a list, each line is an element in the list '''
+ if not os.path.exists(fname):
+ return []
+
+ result = []
+ f = open(fname)
+ d = {}
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ s = lst[0].strip() # SRR, ERR, or DRR id
+ if (not s in d) and ('SRR' in s or 'ERR' in s or 'DRR' in s):
+ d[s] = 1
+ result.append(s)
+ f.close()
+ return result # only return unique elements
+
+
+def make_download_list(all_run_ids, mapped_dir, rna_data_info_dict):
+ '''
+ Make a list of the next sample IDs to download. These samples must not have been downloaded yet.
+
+ all_run_ids - a list of NextGen-Seq IDs to select from
+ mapped_dir - contain all mapped samples
+ rna_data_info_dict - a dictionary containing all RNA-seq samples from ENA.
+ '''
+
+ result = []
+ mapped_files = glob_files(mapped_dir, '*_quant.txt')
+ mapped_run_ids = get_list(DOWNLOADED_SRA_ID_LOG_FILE)
+ small_ids = get_list(IGNORED_SRA_ID_LOG_FILE) # these files are too small
+ for x in sorted(all_run_ids, reverse=True): # SRR first, then ERR, then DRR
+ include_me = True if x in rna_data_info_dict and rna_data_info_dict[x] > 0 else False
+ if not (x + '_quant.txt') in mapped_files and not x in result and (not x in small_ids) and (not x in mapped_run_ids) and include_me: # not mapped yet and is RNA-seq
+ result.append(x)
+ return result
+
+
+def num_of_digits(s):
+ count = 0
+ for c in s:
+ if c.isdigit():
+ count += 1
+ return count
+
+
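+# Minimal sketch (illustration only; the same convention is applied inline in
+# download_and_map_data below): ENA shards fastq files by the first six
+# characters of the run accession, plus a zero-padded subdirectory once the
+# numeric part exceeds six digits. For a hypothetical accession 'SRR1548701'
+# (7 digits) this yields 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR154/001/SRR1548701/'.
+def ena_fastq_dir(run_id):
+    prefix = run_id[0:6]
+    n = num_of_digits(run_id)
+    if n == 6:
+        return 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/%s/%s/' % (prefix, run_id)
+    if n == 7:
+        return 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/%s/00%s/%s/' % (prefix, run_id[-1], run_id)
+    if n == 8:
+        return 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/%s/0%s/%s/' % (prefix, run_id[-2:], run_id)
+    if n == 9:
+        return 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/%s/%s/%s/' % (prefix, run_id[-3:], run_id)
+    return ''
+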
+def get_file_url(fname):
+ ''' for wget '''
+ f = open(fname)
+ url_list = []
+ for line in f:
+ line = line.strip()
+ if 'ftp://' in line and '.fastq.gz' in line:
+ lst = line.split()
+ address = lst[-1].strip()
+ if '.fastq.gz' in address and address.startswith('ftp://') and not address in url_list:
+ url_list.append(address)
+ f.close()
+ return url_list
+
+
+def get_file_size(fname):
+ sz = 0
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ if line.startswith('==> SIZE'):
+ lst = line.split()
+ sz = int(lst[-1])
+ f.close()
+ return sz
+
+
+def get_remote_file_size(link):
+ cmd = 'rm -f ../Data/temp/wget_temp_file1.txt'
+ os.system(cmd)
+
+ cmd = 'wget --spider %s 2> ../Data/temp/wget_temp_file1.txt' % (link)
+ os.system(cmd)
+ return get_file_size('../Data/temp/wget_temp_file1.txt')
+
+
+def get_sample_id(fname):
+ ''' extract the sample id from a file name '''
+ index = fname.find('.fastq.gz')
+ if index < 0:
+ return ''
+
+ s = fname[0:index]
+ lst = s.split('_')
+ return lst[0]
+
+
+def download_and_map_data(lst, daily_map_num, dest):
+ ''' Download data from ENA; fast, but transfers can be interrupted '''
+ downloaded_files = [] # a list of paths to downloaded files, small files (size less than MIN_FASTQ_FILE_SIZE) won't be included in the list
+ map_list = []
+
+ if len(lst) < daily_map_num or daily_map_num < 1:
+ return downloaded_files, map_list
+
+ count = 0
+ for line in lst: # lst - a list of run IDs
+ run_id = line
+ dir1 = line[0:6]
+ dir2 = ''
+ n = num_of_digits(line)
+ address = ''
+ if n == 6: # follow ENA's data path convention
+ address = 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/%s/%s/' % (dir1, run_id)
+ elif n == 7:
+ dir2 = '00' + run_id[-1]
+ address = 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/%s/%s/%s/' % (dir1, dir2, run_id)
+ elif n == 8:
+ dir2 = '0' + run_id[-2:]
+ address = 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/%s/%s/%s/' % (dir1, dir2, run_id)
+ elif n == 9:
+ dir2 = run_id[-3:]
+ address = 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/%s/%s/%s/' % (dir1, dir2, run_id)
+
+ if os.path.exists('../Data/temp/wget_temp_file0.txt'):
+ cmd = 'rm -f ../Data/temp/wget_temp_file0.txt'
+ os.system(cmd)
+
+ cmd = 'wget --spider -T 20 %s 2> ../Data/temp/wget_temp_file0.txt' % (os.path.join(address, '*.gz'))
+ os.system(cmd)
+
+ url_lst = get_file_url('../Data/temp/wget_temp_file0.txt')
+ if url_lst == []:
+ write_log_file(IGNORED_SRA_ID_LOG_FILE, run_id+'\n')
+
+ time.sleep(1)
+
+ curr_lst = []
+ for link in url_lst:
+ sz = get_remote_file_size(link)
+ if sz >= MIN_FASTQ_FILE_SIZE: # remote file must be big enough
+ cmd = 'wget %s -P %s' % (link, dest)
+ os.system(cmd)
+ file_path = os.path.join(dest, os.path.basename(link))
+ curr_lst.append(file_path)
+ downloaded_files.append(file_path)
+ else:
+ print('[download_and_map.py] IGNORE [%d MB] %s' % (int(sz/1000000.0), link))
+ file_name = os.path.basename(link)
+ sample_id = get_sample_id(file_name)
+ write_log_file(IGNORED_SRA_ID_LOG_FILE, sample_id+'\n')
+
+
+ print(curr_lst)
+ if curr_lst != []:
+ salmon_map(curr_lst)
+ map_list.append(run_id)
+ count += 1
+
+ # Remove raw files (as they occupy lots of space)
+ for f in downloaded_files:
+ if os.path.exists(f):
+ print('[download_and_map.py] To save space, I am removing %s.' % (f))
+ os.remove(f)
+ time.sleep(1)
+
+ if count >= daily_map_num:
+ return downloaded_files, map_list
+
+ time.sleep(3)
+
+ return downloaded_files, map_list
+
+
+def download_data2(lst, dest):
+ ''' Download data from SRA, slow '''
+ if not os.path.exists(FASTQ_DUMP_PATH):
+ print('%s does not exist.' % (FASTQ_DUMP_PATH))
+ sys.exit()
+
+ downloaded_files = [] # a list of paths to downloaded files, small files (size less than MIN_FASTQ_FILE_SIZE) won't be downloaded
+ for line in lst:
+ run_id = line.strip()
+ cmd = '%s -N 1000000 --split-files --skip-technical %s' % (FASTQ_DUMP_PATH, run_id)
+ print('\n' + cmd)
+ os.system(cmd)
+ if glob.glob('%s*fastq*' % (run_id)) != []: # files are successfully downloaded
+ cmd = 'mv %s*fastq* %s' % (run_id, dest)
+ print(cmd)
+ os.system(cmd)
+ fastq_file_lst = glob.glob( os.path.join(dest, '%s*fastq*' % (run_id)) )
+ if len(fastq_file_lst) == 1: # rename file
+ cmd = 'mv %s %s' % (fastq_file_lst[0], os.path.join(dest, run_id+'.fastq'))
+ os.system(cmd)
+
+ cmd = 'gzip %s' % ( os.path.join(dest, run_id + '*.fastq') )
+ print(cmd)
+ os.system(cmd)
+ for fname in glob.glob( os.path.join(dest, '%s*gz' % (run_id)) ) :
+ downloaded_files.append(fname)
+ else:
+ write_log_file(IGNORED_SRA_ID_LOG_FILE, run_id+'\n')
+
+ return downloaded_files
+
+
+def salmon_map(lst):
+ gz_file = '../Data/temp/gz_files.txt'
+ if os.path.exists(gz_file):
+ cmd = 'rm -f %s' % (gz_file) # remove old parameter file (if any). gz means gzip. fastq files are usually zipped.
+ os.system(cmd)
+
+ f = open('../Data/temp/gz_files.txt', 'w')
+ f.write('\n'.join(lst)) # lst contains paths to fastq files
+ f.close()
+
+ print('Start mapping %s' % (' '.join(lst)))
+ cmd = 'python get_TPM_by_salmon.py ../Data/temp/gz_files.txt > /dev/null 2>&1' # mapped files will be saved in result
+ os.system(cmd)
+
+
+def write_log_file(fname, s):
+ if not os.path.exists(fname):
+ f = open(fname, 'w')
+ else:
+ f = open(fname, 'a')
+ f.write(s)
+ f.close()
+
+
+def last_session_finished(fname):
+ ''' return true if log file ends with DONE. '''
+ if not os.path.exists(fname):
+ return True
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ last_line = lines[-1]
+ if last_line.strip() == '': # a newline
+ print('[download_and_map.py] Last line in file %s is empty. The last line must start with DONE.' % (fname))
+ sys.exit()
+ lst = last_line.split()
+ if lst[0] == 'DONE':
+ return True
+ else:
+ return False
+
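+# For reference, the log file checked above is expected to look like the
+# following (hypothetical run IDs; the START/DONE lines are written by
+# write_log_file in the main section of this script):
+#
+#   START at 2017-04-10_0930
+#   SRR1548701
+#   ERR1104840
+#   DONE at 2017-04-10_1755
+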
+def read_ena_data_info(fname):
+ d = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ run_id = lst[0]
+ d[run_id] = 1
+ f.close()
+ return d
+
+
+def read_ena_data_info_json(fname):
+ d = {}
+ with open(fname) as json_data:
+ json_dict = json.load(json_data)
+ for run_id in json_dict:
+ d[run_id] = 1
+ return d
+
+
+def read_run_ids_from_file(fname):
+ f = open(fname)
+ result = []
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ if not line.startswith('#') and 'RR' in line:
+ result.append(lst[0]) # append to the accumulator; reusing the name lst here was a bug that discarded previously collected IDs
+ f.close()
+ return list(set(result))
+
+
+
+
+## main
+
+# For filtering RNA-seq data
+if not os.path.exists(RNA_SEQ_INFO_FILE):
+ print('[download_and_map.py] Must provide %s. See parse_ena_xml.py about how to make it.' % (RNA_SEQ_INFO_FILE))
+ sys.exit()
+
+# If there is not enough disk space for storing the downloaded sequencing data, stop
+available_G = 4 * os.statvfs('/home').f_bavail / (1024*1024) # available space in GB, assuming 4 KB blocks; works on Linux/UNIX only
+if available_G < 3 * DAILY_MAP_NUMBER:
+ print('[download_and_map.py] home directory does not have enough space (only %d G available) ' % (available_G))
+ sys.exit()
+
+if not last_session_finished(DOWNLOADED_SRA_ID_LOG_FILE): # last session not finished
+ print('[download_and_map.py] The last downloading and mapping session has not finished yet. You can edit file %s to remove the last "START at" line.' % (DOWNLOADED_SRA_ID_LOG_FILE))
+ sys.exit()
+
+rna_data_info_dict = read_ena_data_info_json(RNA_SEQ_INFO_FILE) # rna_data_info_dict contains only RNA-seq IDs.
+
+# Generate DRR/ERR/SRR ids to download
+if len(sys.argv) > 1: # user has provided a list of IDs in a file
+ download_list = read_run_ids_from_file(sys.argv[1])
+ DAILY_MAP_NUMBER = len(download_list)
+else:
+ print('[download_and_map.py] Prepare download list ...')
+ download_list = make_download_list(rna_data_info_dict.keys(), MAPPED_RDATA_DIR, rna_data_info_dict)
+ print('[download_and_map.py] There are %d candidate run IDs, from which %d will be selected.' % (len(download_list), DAILY_MAP_NUMBER))
+
+
+# Make a record in log.txt
+curr_time = datetime.now().strftime('%Y-%m-%d_%H%M') # append date info to newly created directories
+write_log_file(DOWNLOADED_SRA_ID_LOG_FILE, 'START at %s\n' % (curr_time))
+
+# Download these RNA-seq IDs and map them using salmon
+print('[download_and_map.py] Start downloading and mapping ...')
+downloaded_file_paths, map_list = download_and_map_data(download_list, DAILY_MAP_NUMBER, RAW_RDATA_DIR) # or we can use the function download_data2 to download from SRA (in US).
+
+# Move all files to MAPPED_RDATA_DIR
+curr_time = datetime.now().strftime('%Y-%m-%d_%H%M') # append date info to newly created directories
+new_dir_name = MAPPED_RDATA_DIR
+if not os.path.isdir(new_dir_name):
+ os.makedirs(new_dir_name)
+
+# after mapping is finished, move all resulting files to new_dir_name (MAPPED_RDATA_DIR)
+if glob.glob('%s/*_quant.txt' % (SALMON_MAP_RESULT_DIR.rstrip('/'))) != []:
+ cmd = 'mv %s/*_quant.txt %s' % (SALMON_MAP_RESULT_DIR.rstrip('/'), new_dir_name)
+ os.system(cmd)
+ print('[download_and_map.py] Done. Check directory %s.' % (os.path.abspath(new_dir_name)))
+else:
+ print('[download_and_map.py] No quant files to move.')
+
+
+write_log_file(DOWNLOADED_SRA_ID_LOG_FILE, '%s\n' % ('\n'.join(map_list)))
+write_log_file(DOWNLOADED_SRA_ID_LOG_FILE, 'DONE at %s\n' % (curr_time))
diff --git a/Code/download_ena_metadata.py b/Code/download_ena_metadata.py
new file mode 100644
index 0000000..c88bbfc
--- /dev/null
+++ b/Code/download_ena_metadata.py
@@ -0,0 +1,43 @@
+# Usage: python download_ena_metadata.py
+# Modify LIBRARY_STRATEGY, MIN_READ_COUNT in this file, as filters.
+#
+# Purpose: download read description from ENA website, in the form of xml, to be parsed by parse_ena_xml.py.
+#
+# 22 Feb 2017, slcu, hui
+# 12 Apr 2017, slcu, hui
+
+import os, sys
+
+TAXID = '3702' # organism Tax ID for Arabidopsis. Change it for other organisms.
+LIBRARY_STRATEGY = 'RNA-Seq' # can be ChIP-Seq, or others, see http://www.ebi.ac.uk/ena/submit/reads-library-strategy
+MIN_READ_COUNT = 1000000 # only download for samples having at least this many reads
+RESULT_LIST = ['read_run', 'read_study', 'read_experiment'] # don't modify. See http://www.ebi.ac.uk/ena/data/warehouse/usage
+
+
+def convert_name(s):
+ if s.lower() == 'rna-seq':
+ return 'rnaseq'
+ if s.lower() == 'chip-seq':
+ return 'chipseq'
+ return 'unknownseq'
+
+
+fname_lst = []
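+# For the default settings above, the loop below issues one query per entry in
+# RESULT_LIST; after URL decoding, the query looks roughly like this
+# (illustration only):
+#   search?query="tax_eq(3702) AND library_strategy="RNA-Seq" AND read_count>=1000000"&result=read_run&display=xml
+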
+for result in RESULT_LIST:
+ cmd = 'wget \"http://www.ebi.ac.uk/ena/data/warehouse/search?query=%%22tax_eq(%s)%%20AND%%20library_strategy=%%22%s%%22%%20AND%%20read_count%%3E=%s%%22&result=%s&display=xml\" -O result.xml' % (TAXID, LIBRARY_STRATEGY, MIN_READ_COUNT, result) # make a RESTful download link, see http://www.ebi.ac.uk/ena/browse/search-rest
+ print(cmd)
+ os.system(cmd)
+ fname_components = ['ena', convert_name(LIBRARY_STRATEGY), result]
+ fname = '_'.join(fname_components)
+ fname = fname.replace(' ', '_') + '.xml'
+ # remove lines with 'Entry:'
+ cmd = 'sed -i.bak \'/^Entry:/d\' result.xml'
+ os.system(cmd)
+ # rename file
+ cmd = 'mv result.xml %s' % (fname)
+ os.system(cmd)
+ fname_lst.append(fname)
+
+cmd = 'rm -f *.xml.bak'
+os.system(cmd)
+print('Done. Check %s. Move them to Data/information.' % (' '.join(fname_lst)))
diff --git a/Code/exclude_edges.py b/Code/exclude_edges.py
new file mode 100644
index 0000000..1fb5f77
--- /dev/null
+++ b/Code/exclude_edges.py
@@ -0,0 +1,55 @@
+# Usage: python exclude_edges.py edges.txt
+#
+# Purpose: Exclude edges whose TF is in exclude_tf_list, since we want to hide unpublished data. Don't distinguish +/- edges.
+#          For each TF-target pair, keep the edge with the largest value of the metric. If the correlation is negative, use its absolute value.
+#
+# Created by Hui on 5 Jan 2018
+
+import os, sys
+
+def remove_minus(s):
+ ''' Remove the minus sign in s (if there is none, find returns -1 and s is returned unchanged) '''
+ index = s.find('-')
+ return s[index+1:]
+
+
+def neg2pos(s):
+ s = s.strip()
+ lst = s.split('\t')
+ x = float(lst[2])
+ if x < 0:
+ lst[2] = remove_minus(lst[2])
+ return '\t'.join(lst)
+
+
+def make_edge_dict(fname, exclude_lst):
+ d = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ tf_id = lst[1].split()[0]
+ if not tf_id in exclude_lst:
+ target_id = lst[0].split()[0]
+ metric = float(lst[8])
+ k = tf_id + '.' + target_id
+ if not k in d:
+ d[k] = {}
+ d[k]['metric'] = metric
+ d[k]['line'] = neg2pos(line) # make the third field (correlation) positive if it is negative. Indicate influence, not activation/repression.
+ else:
+ if d[k]['metric'] < metric:
+ d[k]['metric'] = metric
+ d[k]['line'] = neg2pos(line)
+ f.close()
+ return d
+
+
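+# Illustration (hypothetical, abbreviated row; real edges.txt rows are
+# TAB-separated with the metric in column 9):
+#   neg2pos('AT1G00001 GENE1\tAT2G00002 TF1\t-0.80\t...') returns the same row
+#   with -0.80 flipped to 0.80, so the score reflects influence only.
+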
+# main
+exclude_tf_list = ['AT4G26840', 'AT3G18550']
+edge_file_name = sys.argv[1]
+d = make_edge_dict(edge_file_name, exclude_tf_list)
+f = open(edge_file_name, 'w') # this will make edge.txt empty
+for k in sorted(d.keys()):
+ f.write('%s\n' % d[k]['line'])
+f.close()
diff --git a/Code/geneid2name.py b/Code/geneid2name.py
new file mode 100644
index 0000000..6d412f8
--- /dev/null
+++ b/Code/geneid2name.py
@@ -0,0 +1,29 @@
+# Purpose: refactoring. Shared helpers for mapping AGI gene IDs to gene names and back.
+# Created on 10 Aug 2019 by Hui Lan <lanhui@zjnu.edu.cn>
+
+def make_gene_name_AGI_map_dict(fname):
+ '''
+ A dictionary that maps gene id to gene name, and gene name to gene id.
+ fname is ../Data/information/AGI-to-gene-names_v2.txt
+ '''
+ d = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ agi = lst[0].upper()
+ name_string = lst[1].upper()
+ d[agi] = name_string
+ for name in name_string.split(';'):
+ if not name in d:
+ d[name] = agi
+ f.close()
+ return d
+
+
+def get_gene_name(gene_id, agi2name_dict):
+ if gene_id in agi2name_dict and agi2name_dict[gene_id] != gene_id:
+ gene_name = agi2name_dict[gene_id]
+ else:
+ gene_name = '.'
+ return gene_name
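+
+
+if __name__ == '__main__': # minimal smoke test (sketch); the gene ID is only an example
+    d = make_gene_name_AGI_map_dict('../Data/information/AGI-to-gene-names_v2.txt')
+    print(get_gene_name('AT1G01060', d)) # prints the gene name if mapped, '.' otherwise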
diff --git a/Code/get_TPM_by_salmon.py b/Code/get_TPM_by_salmon.py
new file mode 100644
index 0000000..a4ce3ff
--- /dev/null
+++ b/Code/get_TPM_by_salmon.py
@@ -0,0 +1,142 @@
+# Usage
+# -----
+# python get_TPM_by_salmon.py file-containing-paths-to-fastq-gz-files
+# An example of such a file is gz_files.txt, with one path to a fastq.gz file per line.
+# Edit the first few capitalised variables in this file.
+#
+# Purpose
+# ------
+#
+# Build the salmon index and get TPM values for all fastq files.
+#
+# 30 NOV 2016, SLCU, hui
+# Last reviewed 31 July 2018
+# Last modified by Hui 10 Sep 2019
+
+import sys, os, glob, shutil
+from configure import SALMON, SALMON_INDEX, TRANSCRIPTOME, SALMON_MAP_RESULT_DIR, KMER
+
+#TRANSCRIPTOME = '/home/hui/tair10/AtRTD2_19April2016.fa'
+#TRANSCRIPTOME = '../Data/information/ath_genes_index_v2.fa'
+
+def build_salmon_index(transcriptome_file, salmon_index_dir, k):
+ if not os.path.exists(SALMON_INDEX):
+ os.makedirs(SALMON_INDEX)
+ cmd = '%s index -t %s -i %s --type quasi -k %d' % (SALMON, transcriptome_file, salmon_index_dir, k)
+ os.system(cmd)
+
+
+def assert_file_exist(s):
+ if not os.path.exists(s):
+ print('File %s does not exist.' % (s))
+ sys.exit()
+
+
+def salmon_fatal_error(fname):
+ ''' Return True iff the file fname contains "I won't proceed". '''
+ if not os.path.exists(fname):
+ return False
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if 'I won\'t proceed' in line:
+ return True
+ return False
+
+
+def get_TPM(src_dir, file_id, salmon_index, result_dir):
+ lst = sorted( glob.glob(os.path.join(src_dir, file_id + '*.fastq*')) )
+ lst2 = sorted( glob.glob(os.path.join(src_dir, file_id + '*_*.fastq.gz')) ) # _1.fastq and _2.fastq
+ num_file = len(lst)
+ num_file2 = len(lst2)
+
+ if not os.path.isdir(result_dir):
+ os.makedirs(result_dir)
+
+ dest_dir = os.path.join(result_dir, file_id + '_transcript_quant')
+ if not os.path.isdir(dest_dir):
+ os.makedirs(dest_dir)
+
+ if num_file == 1 and num_file2 < 2: # a single fastq.gz file
+ file_path = lst[0]
+ print(file_path)
+ assert_file_exist(file_path)
+ cmd = '%s quant -i %s -l A -r %s -o %s' % (SALMON, salmon_index, file_path, dest_dir)
+ os.system(cmd)
+ elif num_file2 >= 2:
+ file_path1 = lst2[0]
+ file_path2 = lst2[1]
+ print(file_path1)
+ print(file_path2)
+ assert_file_exist(file_path1)
+ assert_file_exist(file_path2)
+ cmd = '%s quant -i %s -l A -1 %s -2 %s -o %s' % (SALMON, salmon_index, file_path1, file_path2, dest_dir)
+ print(cmd)
+ os.system(cmd)
+ elif num_file2 < 2:
+ print('Warning: skip %s as it has fewer than two _*.fastq.gz files' % (file_id))
+ return
+ else:
+ print('Warning: skip %s as it has more than two fastq.gz files' % (file_id))
+ return
+
+ output_file_name = os.path.join(result_dir, file_id + '_quant.txt')
+ if os.path.exists( os.path.join(dest_dir, 'quant.sf') ):
+ if not salmon_fatal_error('%s/%s_transcript_quant/logs/salmon_quant.log' % (SALMON_MAP_RESULT_DIR.rstrip('/'), file_id)):
+ cmd = 'cp %s %s' % (os.path.join(dest_dir, 'quant.sf'), output_file_name)
+ os.system(cmd)
+ shutil.rmtree(dest_dir)
+
+
+def get_id(s):
+
+ index = s.find('_1.fastq')
+ if index > 0:
+ return s[:index]
+
+ index = s.find('_2.fastq')
+ if index > 0:
+ return s[:index]
+
+ index = s.find('_3.fastq')
+ if index > 0:
+ return s[:index]
+
+ index = s.find('.fastq')
+ if index > 0:
+ return s[:index]
+
+ return 'NA'
+
+
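+# Examples (illustrative file names):
+#   get_id('SRR123456_1.fastq.gz') -> 'SRR123456'
+#   get_id('SRR123456.fastq.gz')   -> 'SRR123456'
+#   get_id('readme.txt')           -> 'NA'
+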
+def get_src_dir_and_file_id(fname):
+ ''' Return a dictionary where key is SRR/ERR/DRR id, and value is tuple (path, a number) '''
+ result = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ index = line.rfind('/')
+ if index == -1:
+ path = './'
+ else:
+ path = line[:index]
+ id = get_id(line[index+1:])
+ if not id in result:
+ result[id] = (path, 1)
+ else:
+ t = result[id]
+ result[id] = (path, t[1] + 1)
+ f.close()
+ return result
+
+
+### build salmon index
+build_salmon_index(TRANSCRIPTOME, SALMON_INDEX, KMER)
+fname = sys.argv[1] # a file produced by, e.g., find ../Data/R/Raw -name "*.gz"
+src_id = get_src_dir_and_file_id(fname)
+for k in src_id:
+ src_dir = src_id[k][0]
+ file_id = k
+ get_TPM(src_dir, file_id, SALMON_INDEX, SALMON_MAP_RESULT_DIR)
diff --git a/Code/get_binding.py b/Code/get_binding.py
new file mode 100644
index 0000000..e4087ca
--- /dev/null
+++ b/Code/get_binding.py
@@ -0,0 +1,384 @@
+# Usage: python get_binding.py parameter_for_buildCmatrix.txt
+#
+# Manually change CHR_INFO in parameter_for_buildCmatrix.txt if
+# your organism is not Arabidopsis. Set REBUILD_LIST in
+# parameter_for_buildCmatrix.txt if you only want to make
+# binding column for a few ChIP-seq IDs (useful when adding new
+# ChIP-seq data but not wanting to re-generate existing binding
+# files).
+#
+#
+# Purpose: make individual binding column files, one for each ChIP-seq
+# ID. These files will be combined by buildCmatrix.py into a
+# big binding matrix, binding.txt.
+#
+# A typical column file looks like:
+#
+# gene_id C0003000001450
+# AT1G01010 0
+# AT1G01020 0
+# AT1G01030 0
+# AT1G01040 0
+# AT1G01046 0
+# AT1G01050 0
+# AT1G01060 0
+# ...
+#
+# Last modified 5 APR 2017 slcu hui
+# Last modified 4 AUG 2019 zjnu hui
+
+
+import sys, os, operator, bisect
+import numpy as np
+import pyBigWig
+from datetime import datetime
+
+
+####################################
+GLB_PARAM_SYMBOL = '%%'
+LCL_PARAM_SYMBOL = '%'
+DATA_SYMBOL = '@'
+####################################
+
+def get_key_value(s):
+ lst = s.split('=')
+ k, v = lst[0], lst[1]
+ return (k.strip(), v.strip())
+
+
+def get_value(s, delimit):
+ lst = s.split(delimit, 1)
+ return lst[1].strip()
+
+
+def make_chromosome_dict(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ for line in lines:
+ line = line.strip()
+ if line != '' and not line.startswith('#'):
+ lst = line.split('\t')
+ k = lst[0]
+ v = lst[1]
+ d[k] = int(v)
+ return d
+
+
+def make_global_param_dict(fname):
+ f = open(fname)
+ d = {'GENE_FILE':'', 'TARGET_RANGE':'3000', 'FC':'2.0', 'PVALUE':'0.0001', 'QVALUE':'0.01', 'CHR_INFO':{}, 'DESTINATION':'', 'REBUILD_LIST':[] } # change
+ for line in f:
+ line = line.strip()
+ if line.startswith(GLB_PARAM_SYMBOL):
+ s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
+ lst = s.split('\t') # separate items by TAB
+ for x in lst:
+ if x != '':
+ k, v = get_key_value(x)
+ if k == 'REBUILD_LIST' and v.lower() != 'all':
+ d[k] = v.split() # make a list and rewrite d[k]
+ elif k == 'CHR_INFO' and os.path.exists(v):
+ d[k] = make_chromosome_dict(v)
+ else:
+ d[k] = v
+
+ if len(d['CHR_INFO']) == 0:
+ print('get_binding.py ERROR: must specify chromosome information CHR_INFO in parameter_for_buildCmatrix.txt.')
+ sys.exit()
+
+ f.close()
+ return d
+
+
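+# An illustrative global parameter line consumed by make_global_param_dict
+# (hypothetical values; items after the leading %% are TAB-separated
+# key=value pairs):
+#   %%GENE_FILE=../Data/information/genes.txt<TAB>TARGET_RANGE=3000<TAB>DESTINATION=../Data/history/bind
+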
+# for each dataset, make a column gene_id, sample_id
+def make_chip_data(fname):
+ ''' fname - a narrowPeak file.
+ Return a dictionary: key is chromosome number; value is a list of (start_pos, end_pos, strength) tuples.
+ '''
+
+ d = {}
+ if not os.path.exists(fname):
+ print('get_binding: cannot find file %s' % (fname))
+ sys.exit()
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ if not check_valid_peak_line(lines[0]): # very basic check
+ print('get_binding: %s is not a valid BED file: in its first line the chromosome field is neither a number nor a name starting with "chr". Ignored.' % (fname))
+ return d
+ for line in lines:
+ line = line.strip()
+ lst = line.split()
+ c = lst[0]
+
+ rindex = c.lower().rfind('r') # handle chr3 case, get 3 only. If it is Pt or Mt, use it.
+ if rindex != -1:
+ c = c[rindex+1:].strip()
+
+ ss = int(lst[1])
+ ee = int(lst[2])
+ strength = lst[4] # the 5th column is used as strength
+ if not c in d:
+ d[c] = [(ss, ee, strength)]
+ else:
+ d[c].append((ss, ee, strength))
+
+ for k in d:
+ d[k] = sorted(d[k], key=operator.itemgetter(0, 1)) # sort by start position, then by end position
+ return d
+
+
+def make_data_dict(fname):
+ ''' fname is parameter_for_buildCmatrix.txt '''
+
+ d = {'ID_LIST':[]} # keep a list of chip id's, such as C0001100007100
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#'):
+ continue
+ if line.startswith(DATA_SYMBOL):
+ s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ s = s.strip()
+ if s in d:
+ print('get_binding: ID %s is a duplicate. Check parameter_for_buildCmatrix.txt.' % (s))
+ sys.exit()
+ d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+ elif line.startswith('PROTEN_NAME:'):
+ d[s]['PROTEN_NAME'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_ID:'):
+ d[s]['PROTEIN_ID'] = get_value(line, ':')
+ elif line.startswith('DATA_NAME:'):
+ d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+ if os.path.exists(d[s]['LOCATION']):
+ d['ID_LIST'].append(s)
+ else:
+ print('get_binding: I could not find file\n%s\nfor ChIP-seq ID %s. Ignore this ID.' % (d[s]['LOCATION'], s))
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+ elif line.startswith(LCL_PARAM_SYMBOL) and not line.startswith(GLB_PARAM_SYMBOL):
+ pass # local parameter lines are ignored here; make_local_parameter is not defined in this script
+
+ return d
+
+
+def get_interval(c, s, e, o, flank, chr_info_dict):
+ '''
+ o --- + positive strand, don't include gene body
+ ++ positive strand, include gene body
+ - negative strand, don't include gene body
+ -- negative strand, include gene body
+ * include both sides and gene body
+ '''
+
+ if o == '+':
+ return (np.max([0, s-flank]), s)
+ elif o == '++': # also include gene body
+ return (np.max([0, s-flank]), e)
+ elif o == '-':
+ return (e, np.min([e+flank, chr_info_dict[c]]))
+ elif o == '--':
+ return (s, np.min([e+flank, chr_info_dict[c]]))
+ elif o == '*': # both sides and gene body
+ return ( np.max([0, s-flank]), np.min([e+flank, chr_info_dict[c]]) )
+ return (-1, -1) # should not be here
+
+
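+# Worked example (hypothetical gene): chromosome '1', gene body 5000-7000,
+# flank=3000, chromosome length 30000000:
+#   get_interval('1', 5000, 7000, '+',  3000, chr_info) -> (2000, 5000)   upstream only
+#   get_interval('1', 5000, 7000, '++', 3000, chr_info) -> (2000, 7000)   upstream plus gene body
+#   get_interval('1', 5000, 7000, '*',  3000, chr_info) -> (2000, 10000)  both sides plus gene body
+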
+def check_valid_peak_line(s):
+ s = s.strip()
+ lst = s.split()
+ if lst[0].isdigit() or lst[0].lower().startswith('chr'):
+ return True
+ return False
+
+
+def get_interval_intersection(chromosome, start_pos, end_pos, chip_dict):
+ ''' check whether a given interval intersects any peak from a ChIP-seq dataset '''
+ if len(chip_dict) == 0 or not chromosome in chip_dict:
+ return []
+
+ result = []
+ lst = chip_dict[chromosome] # get a list of intervals in that chromosome
+ n = len(lst)
+ slst, elst, strength_lst = zip(*lst) # make three sub-lists
+ index1 = max(0, bisect.bisect(elst, start_pos)-2) # get start position
+ index2 = min(bisect.bisect(slst, end_pos)+2, n-1) # get end position
+ sublst = lst[index1:index2] # these intervals potentially have intersections with given interval
+ #print('\t\t\tDEBUG sublst length: %d (index1 %d, index2 %d)' % (len(sublst), index1, index2))
+ for t in sublst:
+ ss = t[0]
+ ee = t[1]
+ strength = t[2]
+ if start_pos <= ee and end_pos >= ss:
+ result.append(int(strength))
+ #print('chromosome=%s start_pos=%d end_pos=%d c=%s ss=%d ee=%d' % (chromosome, start_pos, end_pos, c, ss, ee))
+ return result
+
+
+def make_o(c, glb_param_dict):
+ ''' make orientation '''
+ ig = glb_param_dict['INCLUDE_GENE_BODY'].upper() == 'YES' # include gene body
+ both_side = glb_param_dict['BOTH_SIDE'].upper() == 'YES'
+ if both_side:
+ return '*'
+ if ig:
+ return 2*c # also include gene body
+ return c
+
+
+def calc_intensity(t, bw, norm_by_gene_length=0):
+ '''
+ t - a tuple, chromosome, start and end position
+ '''
+ chromosome = t[0]
+ start = t[1]
+ end = t[2]
+ if end < start:
+ print('get_binding.py calc_intensity: end position must not be less than start position')
+ sys.exit()
+
+ I = bw.intervals(chromosome, start, end) # a list of intervals
+
+ sum_area = 0.0
+ if I: # if I is not empty
+ for t in I:
+ s0 = max(start, t[0])
+ e0 = min(end, t[1])
+ h = t[2]
+ #print('=== %d %d %g' % (s0, e0, h))
+ sum_area += h * (e0 - s0)
+
+ if norm_by_gene_length != 0:
+ return 1000.0 * sum_area / (end - start)
+ else:
+ return 1.0 * sum_area
+
+
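+# calc_intensity worked example (hypothetical numbers): querying [100, 200)
+# against a single bigWig interval (50, 150, height 2.0) clips the overlap to
+# [100, 150), so sum_area = 2.0 * 50 = 100.0; with norm_by_gene_length != 0
+# the returned value is 1000.0 * 100.0 / (200 - 100) = 1000.0.
+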
+def get_update_date(s):
+ index = s.find('update:')
+ if index < 0:
+ return None
+ result = s[index+7:].strip() # 7 = len('update:')
+ if result.isdigit() and len(result) == 8: # expect YYYYMMDD, the same format as the file modification date it is compared with
+ return result
+ else:
+ return '00000000'
+
+
+def make_table(gene_file, data_dict, glb_param_dict):
+ '''
+ Each line in gene_file contains TAB-separated fields: gene_id, gene_name, chr, start, end, strand, description (optional).
+ '''
+
+ if glb_param_dict['REBUILD_LIST'] == []: # when not specified, use all
+ id_lst = data_dict['ID_LIST']
+ else:
+ id_lst = glb_param_dict['REBUILD_LIST']
+ for c in id_lst:
+ if not c in data_dict['ID_LIST']:
+ print('get_binding: %s has no corresponding ChIP-seq data (narrowPeak or bw)' % (c))
+ sys.exit()
+
+ chip_data_dict = {}
+ for myid in id_lst:
+ chip_file = data_dict[myid]['LOCATION']
+
+ if not os.path.exists(chip_file):
+ print('get_binding: file %s does not exist.' % (chip_file))
+ sys.exit()
+
+ if data_dict[myid]['DATA_FORMAT'].upper() == 'NARROWPEAK':
+ chip_data = make_chip_data(chip_file)
+
+ elif data_dict[myid]['DATA_FORMAT'].upper() == 'BW':
+ chip_data = pyBigWig.open(chip_file)
+
+ else:
+ print('get_binding: data format %s not supported!' % (data_dict[myid]['DATA_FORMAT']))
+ sys.exit()
+
+ chip_data_dict[myid] = chip_data
+
+
+ f = open(gene_file)
+ lines = f.readlines() # lines contain all lines in gene_file.txt
+ f.close()
+
+ dest = glb_param_dict['DESTINATION'] # individual binding column files will be put here
+ if not os.path.isdir(dest):
+ os.makedirs(dest)
+
+ for myid in id_lst: # for each ChIP-seq ID, make a file in DESTINATION with file name such as C0001100007100.txt
+ #print('Processing %s' % (myid))
+ fname = os.path.join(dest, myid + '.txt')
+ if os.path.exists(fname): # this file has been created before. Remake the binding file if and only if the update date in the NOTE: field is greater than the file's modification date.
+ file_modification_time = datetime.fromtimestamp(os.path.getmtime(fname)).strftime('%Y%m%d')
+ file_update_time = get_update_date(data_dict[myid]['NOTE'])
+ if file_update_time == None:
+ continue
+ if file_update_time <= file_modification_time:
+ continue
+
+ f = open(fname, 'w')
+ content = 'gene_id\t%s\n' % (myid)
+
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ gene_id = lst[0]
+ gene_name = lst[1]
+ c = lst[2] # chromosome, a letter, e.g., '1', '2', etc.
+ s = int(lst[3])
+ e = int(lst[4])
+ o = make_o(lst[5], glb_param_dict) # gene orientation, + or -
+
+ flank = int(glb_param_dict['TARGET_RANGE'])
+ interval = get_interval(c, s, e, o, flank, glb_param_dict['CHR_INFO']) # only consider upstream
+
+ s0 = gene_id
+
+ if data_dict[myid]['DATA_FORMAT'].upper() == 'NARROWPEAK':
+ intersection_list = get_interval_intersection(c, interval[0], interval[1], chip_data_dict[myid])
+ if intersection_list != []:
+ s0 += '\t' + str(np.max(intersection_list)) # get the maximum value in the interval [change]
+ else:
+ s0 += '\t' + '0'
+
+ if data_dict[myid]['DATA_FORMAT'].upper() == 'BW':
+ t = (c, interval[0], interval[1])
+ try:
+ z = calc_intensity(t, chip_data_dict[myid], 1)
+ except RuntimeError:
+ z = -1.0
+ s0 += '\t' + '%4.2f' % (z)
+
+ content += s0 + '\n'
+
+ f.write(content)
+ f.close()
+
+ # close bw files
+ for myid in id_lst:
+ if data_dict[myid]['DATA_FORMAT'].upper() == 'BW':
+ chip_data_dict[myid].close()
+
+
+
+## main
+param_file = sys.argv[1] # should be parameter_for_buildCmatrix.txt
+global_param_dict = make_global_param_dict(param_file)
+data_dict = make_data_dict(param_file)
+print('get_binding: I will produce binding column files for the following %d ChIP-seq IDs:\n%s.'
+ % (len(data_dict['ID_LIST']), ','.join(data_dict['ID_LIST'])))
+make_table(global_param_dict['GENE_FILE'], data_dict, global_param_dict)
diff --git a/Code/html_network.py b/Code/html_network.py
new file mode 100644
index 0000000..3237c55
--- /dev/null
+++ b/Code/html_network.py
@@ -0,0 +1,942 @@
+# Usage: python html_network.py -f edges.txt -r parameter_for_buildRmatrix.txt -c parameter_for_buildCmatrix.txt -n parameter_for_net.txt
+# Purpose: make a summary.html plus its associated files (stored in folder edges) given an edge file (edges.txt). These files will be served as static files online. The total volume of these static files can be quite large, as we get one file for each edge.
+#
+# This program is used in update_network.py.
+#
+# Created on 26 Feb 2017, SLCU, Hui
+# Last modified 24 Mar 2017, SLCU, Hui
+# Last modified 21 Apr 2017, SLCU, Hui [w2ui for regulatee and regulator tables]
+# Last modified 19 Jun 2017, SLCU, Hui [changed text_to_dict to fit the updated RNA_SEQ_INFO_DATABASE]
+# Last modified 29 Jun 2017, SLCU, Hui [added key 'sample_id' in text_to_dict]
+# Last reviewed 01 Feb 2019, Hui [code review]
+
+import sys, os
+import networkx as nx # Run this command on MacOS: export PYTHONPATH="/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages
+import numpy as np
+from optparse import OptionParser
+from itertools import islice
+import operator
+from datetime import datetime
+import collections, re, glob
+from geneid2name import make_gene_name_AGI_map_dict
+from param4net import make_global_param_dict
+
+## Global variables
+REGENERATE_ALL_EDGE_FILES = 'YES'
+INDEX_PAGE = '../Webapp/static/summary.html' # change
+DIR_NAME = '../Webapp/static/edges' # change
+
+RNA_SEQ_INFO_DATABASE = '../Data/information/rnaseq_info_database.txt'
+RNA_SEQ_INFO_DATABASE_JSON = '../Data/information/rnaseq_info_database.json'
+RNA_SEQ_INFO_HTML_PAGE = 'rnaseqinfo.html'
+
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
+CHIP_SEQ_INFO_HTML_PAGE = 'chipseqinfo.html'
+
+RAKE_STOPLIST_FILE = '../Data/information/SmartStoplist.txt'
+
+JSON_DIR = '../Data/history/expr/json' # move this directory to the same place as this file html_network.py, for gene expression scatterplot
+JSON_DIR2 = '../Data/history/bind/json2' # for displaying binding plots
+C3_DIR = './depend/c3'
+W2UI_DIR = './depend/w2ui'
+C3_FILES = ['c3.min.css', 'c3.min.js', 'd3.min.js', 'scatterplot.js', 'barchart.js'] # for displaying scatterplots and binding strength
+W2UI_FILES = ['jquery.min.for.w2ui.js', 'w2ui.min.js', 'w2ui.min.css']
+ALPHA = 0.6 # weight indicating the importance of number of RNA-seq experiments
+## function definitions
+
+### RAKE rapid automatic keyphrase extraction (NOT USED). Skip it and jump to my function.
+
+def is_number(s):
+ try:
+ float(s) if '.' in s else int(s)
+ return True
+ except ValueError:
+ return False
+
+
+def load_stop_words(stop_word_file):
+ """
+ Utility function to load stop words from a file and return as a list of words
+ @param stop_word_file Path and file name of a file containing stop words.
+ @return list A list of stop words.
+ """
+ stop_words = []
+ for line in open(stop_word_file):
+ if line.strip()[0:1] != "#":
+ for word in line.split(): # in case more than one per line
+ stop_words.append(word)
+ return stop_words
+
+
+def separate_words(text, min_word_return_size):
+ """
+ Utility function to return a list of all words that have a length greater than a specified number of characters.
+ @param text The text that must be split in to words.
+ @param min_word_return_size The minimum no of characters a word must have to be included.
+ """
+ splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
+ words = []
+ for single_word in splitter.split(text):
+ current_word = single_word.strip().lower()
+ #leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
+ if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
+ words.append(current_word)
+ return words
+
+
+def split_sentences(text):
+ """
+ Utility function to return a list of sentences.
+ @param text The text that must be split in to sentences.
+ """
+ sentence_delimiters = re.compile(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s')
+ sentences = sentence_delimiters.split(text)
+ return sentences
+
+
+def build_stop_word_regex(stop_word_file_path):
+ stop_word_list = load_stop_words(stop_word_file_path)
+ stop_word_regex_list = []
+ for word in stop_word_list:
+ word_regex = r'\b' + word + r'(?![\w-])' # added look ahead for hyphen
+ stop_word_regex_list.append(word_regex)
+ stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
+ return stop_word_pattern
+
+
+def generate_candidate_keywords(sentence_list, stopword_pattern):
+ phrase_list = []
+ for s in sentence_list:
+ tmp = re.sub(stopword_pattern, '|', s.strip())
+ phrases = tmp.split("|")
+ for phrase in phrases:
+ phrase = phrase.strip().lower()
+ if phrase != "":
+ phrase_list.append(phrase)
+ return phrase_list
+
+
+def calculate_word_scores(phraseList):
+ word_frequency = {}
+ word_degree = {}
+ for phrase in phraseList:
+ word_list = separate_words(phrase, 0)
+ word_list_length = len(word_list)
+ word_list_degree = word_list_length - 1
+ #if word_list_degree > 3: word_list_degree = 3 #exp.
+ for word in word_list:
+ word_frequency.setdefault(word, 0)
+ word_frequency[word] += 1
+ word_degree.setdefault(word, 0)
+ word_degree[word] += word_list_degree #orig.
+ #word_degree[word] += 1/(word_list_length*1.0) #exp.
+ for item in word_frequency:
+ word_degree[item] = word_degree[item] + word_frequency[item]
+
+ # Calculate word scores = deg(w)/freq(w)
+ word_score = {}
+ for item in word_frequency:
+ word_score.setdefault(item, 0)
+ word_score[item] = word_degree[item] / (word_frequency[item] * 1.0) #orig.
+ #word_score[item] = word_frequency[item]/(word_degree[item] * 1.0) #exp.
+ return word_score
+
+
+def generate_candidate_keyword_scores(phrase_list, word_score):
+ keyword_candidates = {}
+ for phrase in phrase_list:
+ keyword_candidates.setdefault(phrase, 0)
+ word_list = separate_words(phrase, 0)
+ candidate_score = 0
+ for word in word_list:
+ candidate_score += word_score[word]
+ keyword_candidates[phrase] = candidate_score
+ return keyword_candidates
+
+
+class Rake(object):
+ def __init__(self, stop_words_path):
+ self.stop_words_path = stop_words_path
+ self.__stop_words_pattern = build_stop_word_regex(stop_words_path)
+
+ def run(self, text):
+ sentence_list = split_sentences(text)
+
+ phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern)
+
+ word_scores = calculate_word_scores(phrase_list)
+
+ keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores)
+
+ sorted_keywords = sorted(keyword_candidates.items(), key=operator.itemgetter(1), reverse=True)
+ return sorted_keywords
+
+
+### my functions
+
+def get_id(s):
+ lst = s.split(' ')
+ return lst[0]
+
+def get_name(s, agi2name_dict):
+ s = s.strip()
+ if s == '':
+ return '???'
+ if s in agi2name_dict:
+ name = agi2name_dict[s]
+ lst = name.split(';')
+ return lst[0]
+ else:
+ return s
+
+def show_path(G, lst, options):
+ s = ''
+ n = len(lst)
+ count = 0
+ for i in range(n-1):
+ u = lst[i]
+ v = lst[i+1]
+ e = G.get_edge_data(u, v)
+ padding = ''
+ if e['weight'] > 0:
+ s += padding + '%s\t(%s,%2.2f)\t-> ' % (u, e['color'], e['weight']) + ('[%s]\n' % (e['condition']) if options.cond==True else '\n')
+ else:
+ s += padding + '%s\t(%s,%2.2f)\t|| ' % (u, e['color'], e['weight']) + ('[%s]\n' % (e['condition']) if options.cond==True else '\n')
+ count += 4
+ print(s + v)
+ print('')
+
+
+def k_shortest_paths(G, source, target, k, weight=None):
+ return list(islice(nx.shortest_simple_paths(G, source, target, weight=weight), k))
+
+def not_bad_line(s):
+ if s.strip() == '':
+ return False
+ if 'WARNING' in s:
+ return False
+ if 'number' in s:
+ return False
+ if 'Need' in s:
+ return False
+ if 'Error' in s:
+ return False
+ if 'Too' in s:
+ return False
+ if not s.startswith('AT'): # need modification for other organisms
+ return False
+ return True
+
+def build_network_from_file(fname):
+ ''' build the network from the big edge file, edges.txt. '''
+ MG = nx.MultiDiGraph(max_rsubset_size=1400) # maximum size of conditionR list
+
+ max_rsize = 0
+
+ f = open(fname)
+ cond_list = []
+ for line in f:
+ line = line.strip()
+ if not_bad_line(line):
+ lst = line.split('\t')
+ g1 = lst[0].split()[0] # target gene id
+ g2 = lst[1].split()[0] # source gene id
+ MG.add_node(g1)
+ MG.add_node(g2)
+ edge_type = lst[3] # all or mix
+
+ condR_lst = []
+ condC_lst = []
+ model_fit_measure = '?'
+ if len(lst) > 6:
+ condR = lst[4]
+ condR_lst = lst[4].split()
+ condC = lst[5]
+ condC_lst = lst[5].split()
+ model_fit_measure = lst[6]
+ if model_fit_measure == '.' and edge_type == 'mix':
+ model_fit_measure = '-1000.0' # RNA-seq samples were selected using post.translation.3. Search '-1000.0' in QUICKSTART.html for more detail.
+ if '=' in model_fit_measure: # in early days, the log likelihood field looks like loglik=-1234.2
+ model_fit_measure = model_fit_measure.split('=')[1] # remove 'loglik='
+
+ size_condR = len(condR_lst)
+ if size_condR > max_rsize:
+ max_rsize = size_condR
+
+ create_date = '20161201' # default 2016-12-01
+ if len(lst) > 7: # has date information, date information is the 8th column
+ create_date = lst[7]
+
+ metric = float(lst[8]) # appended by update_network.py
+ tissue_or_method = lst[9] # appended by update_network.py
+
+ score = float(lst[2]) # strength of various kinds of relationship.
+
+ # Not sure why I distinguished 'all' and 'mix', as the add_edge statements are the same.
+ if edge_type == 'all':
+ if score > 0:
+ MG.add_edge(g2, g1, action='>', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
+ elif score < 0:
+ MG.add_edge(g2, g1, action='X', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
+ if edge_type == 'mix':
+ if score > 0:
+ MG.add_edge(g2, g1, action='>', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
+ elif score < 0:
+ MG.add_edge(g2, g1, action='X', weight=score, metric=metric, conditionR=condR_lst, conditionC=condC_lst, rmse=model_fit_measure, edge_date=create_date, subset=tissue_or_method)
+
+ f.close()
+
+ MG.graph['max_rsubset_size'] = max_rsize
+
+ return MG
+
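+# Column layout of edges.txt, as inferred from the parsing above (TAB-separated):
+#   1: target "ID name"   2: TF "ID name"      3: score       4: type (all|mix)
+#   5: conditionR list    6: conditionC list   7: model fit   8: create date
+#   9: metric            10: tissue/method     (columns 9 and 10 are appended by update_network.py)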
+
+def get_value(s, delimit):
+ ''' Get the value after the first delimit. '''
+ lst = s.split(delimit, 1) # split by the first delimit
+ return lst[1].strip()
+
+def text_to_dict(fname, ignore_first_line=True):
+ ''' fname is RNA_SEQ_INFO_DATABASE (see above). '''
+ if not os.path.exists(fname):
+ print('html_network.py: you must provide %s. See parse_ena_xml.py on how to make it.' % (fname))
+ sys.exit()
+
+ d = {}
+ f = open(fname)
+ lines = f.readlines()
+ if ignore_first_line == True:
+ lines = lines[1:]
+ f.close()
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ run_id = lst[0]
+ d[run_id] = {} # run_id is ENA/SRA run id
+ d[run_id]['experiment_id'] = lst[2]
+ if len(lst) < 5:
+ continue
+ d[run_id]['project_id'] = lst[4]
+ d[run_id]['sample_id'] = lst[1].split('...')[0]
+ d[run_id]['description'] = '\t'.join(lst[5:])
+ return d
+
+def get_true_run_id(s):
+ s = s[2:] # s looks like R0SRR1548701XX, so 2 is the position of 'S'.
+ index = s.find('X')
+ if index >= 0: # we don't need X
+ return s[0:index]
+ return s
+
+def make_rna_seq_info_dict(fname):
+ db_dict = text_to_dict(RNA_SEQ_INFO_DATABASE)
+ f = open(fname)
+ d = {}
+ for line in f:
+ line = line.strip()
+ if line.startswith('@'):
+ run_id = line[1:] # run_id is sth like R0SRR1548701XX
+ run_id2 = get_true_run_id(run_id)
+ if run_id2 in db_dict:
+ d[run_id] = db_dict[run_id2]
+ else:
+ d[run_id] = {'project_id':'#', 'experiment_id':'#', 'sample_id':'#', 'description':'NA'}
+
+ f.close()
+ return d
+
+
+def make_rna_seq_info_html_page(fname, d):
+ f = open(fname, 'w')
+ f.write('<html><head><style> body {font-family:\"HelveticaNeue-Light\", \"Helvetica Neue Light\", \"Helvetica neue\"} table {table-layout: fixed; width: 800px;}</style></head><body>')
+ for k in sorted(d.keys()):
+ run_link = 'http://www.ebi.ac.uk/ena/data/view/%s' % (get_true_run_id(k))
+ s = '<p><a href=\"%s\" name=\'%s\'>%s</a></p>' % (run_link, k, k)
+ d2 = d[k]
+ s += '<table>'
+ project_link = 'http://www.ebi.ac.uk/ena/data/view/%s' % (d2['project_id'])
+ experiment_link = 'http://www.ebi.ac.uk/ena/data/view/%s' % (d2['experiment_id'])
+ biosample_link = 'http://www.ebi.ac.uk/biosamples/samples/%s' % (d2['sample_id'])
+ description = d2['description']
+ s += '<tr> <td><b>%s</b></td> <td><a href=\"%s\">%s</a> / <a href=\"%s\">%s</a> / <a href=\"%s\">%s</a></td> </tr>' % ('External links', project_link, d2['project_id'], experiment_link, d2['experiment_id'], biosample_link, d2['sample_id'])
+ s += '<tr> <td><b>%s</b></td> <td>%s</td> </tr>' % ('Description', description)
+ s += '</table><br>\n'
+ f.write(s)
+ f.write('</body></html>')
+ f.close()
+
+def make_chip_seq_info_dict(fname):
+ ''' See QUICKSTART.html#parameter-for-buildcmatrix '''
+ f = open(fname)
+ d = {}
+ for line in f:
+ line = line.strip()
+ if line.startswith('@'):
+ experiment_id = line[1:]
+ d[experiment_id] = {}
+ if line.startswith('PROTEIN_ID'):
+ d[experiment_id]['PROTEIN_ID'] = get_value(line, ':')
+ if line.startswith('PROTEIN_NAME'):
+ d[experiment_id]['PROTEIN_NAME'] = get_value(line, ':')
+ if line.startswith('DATA_NAME'):
+ d[experiment_id]['DATA_NAME'] = get_value(line, ':')
+ if line.startswith('DESCRIPTION'):
+ d[experiment_id]['DESCRIPTION'] = get_value(line, ':')
+ if line.startswith('LOCATION'):
+ d[experiment_id]['LOCATION'] = get_value(line, ':')
+ if line.startswith('NOTE'):
+ d[experiment_id]['NOTE'] = get_value(line, ':')
+
+ f.close()
+ return d
+
+
+def make_chip_seq_info_html_page(fname, d):
+ f = open(fname, 'w')
+ f.write('<html><head><style> body {font-family:\"HelveticaNeue-Light\", \"Helvetica Neue Light\", \"Helvetica neue\"} table {table-layout: fixed; width: 800px;}</style></head><body>')
+ for k in sorted(d.keys()):
+ s = '<p><a name=\'%s\'>%s</a></p>' % (k, k)
+ d2 = d[k]
+ s += '<table>'
+ for k2 in sorted(d2.keys()):
+ s += '<tr> <td>%s</td> <td>%s</td> </tr>' % (k2, d2[k2])
+ s += '</table><br>\n'
+ f.write(s)
+ f.write('</body></html>')
+ f.close()
+
+
+def make_link_string_for_cond(s, type):
+ ''' s is a string of RNA-seq IDs or ChIP IDs. '''
+ lst = s.split()
+ result = ''
+ for x in lst:
+ if type == 'rnaseq':
+ path = '%s#%s' % (RNA_SEQ_INFO_HTML_PAGE, x)
+ else:
+ path = '%s#%s' % (CHIP_SEQ_INFO_HTML_PAGE, x)
+ result += '<a href=\'%s\'>%s</a> ' % (path, x)
+ return result
+
+
+def get_chip_signal(s, d):
+ ''' extract signal information, and return the words ordered by frequency '''
+ lst = s.split()
+ result = ''
+ for x in lst:
+ desc = d[x]['DESCRIPTION']
+ lst2 = desc.split('\t')
+ for y in lst2:
+ if y.startswith('SIGNAL='):
+ result += ';' + y[7:] # 7 means after the '=' in 'SIGNAL='
+ break
+ return word_freq(result)
+
+
+def get_chip_phenotype(s, d):
+ ''' extract phenotype information, and return the words ordered by frequency '''
+ lst = s.split()
+ result = ''
+ for x in lst:
+ desc = d[x]['DESCRIPTION']
+ lst2 = desc.split('\t')
+ for y in lst2:
+ if y.startswith('PHENOTYPE='):
+ result += ';' + y[10:] # 10 means after the '=' in 'PHENOTYPE='
+ break
+ return word_freq(result)
+
+
+def word_freq(s): # for ChIP-seq data
+ ''' s is a string. return a string of words ordered by frequency '''
+ if s == '':
+ return ''
+
+ lst = s.split(';')
+ d = {}
+ for x in lst:
+ lst2 = x.split()
+ for y in lst2:
+ #k = y.lower()
+ k = y
+ k = k.strip(',')
+ k = k.strip('.')
+ k = k.strip(')')
+ k = k.strip('(')
+ if not k.lower() in ['at', 'in', 'to', 'with', ',', '.', ':', '-']: # exclude these words
+ if not k in d:
+ d[k] = 1
+ else:
+ d[k] += 1
+
+ sorted_tuples = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
+ first_items = [x[0] for x in sorted_tuples]
+ return ' '.join(first_items)
+
+
+def word_freq2(lst): # for RNA-seq data
+ ''' lst is a list of strings. return a string of words ordered by frequency '''
+
+ if lst == []:
+ return ''
+
+ d = {}
+ for x in lst: # each description
+ lst2 = x.split()
+ for y in lst2: # each word
+ k = y
+ k = k.strip(',') # remove superfluous characters, if any
+ k = k.strip('.')
+ k = k.strip(')')
+ k = k.strip('(')
+ k = k.strip(';')
+ if not k.startswith('SRR') and not k.startswith('ERR') and not k.startswith('DRR') and not k.isdigit() and not ':' in k and len(k) > 1 and not k.lower() in ['just', 'library', 'libraries', 'dna', 'nextseq', 'nextseq500', 'sequencing', 'end', 'al;', 'which', 'analyse', 'analyze', 'analyzer', 'whole-genome', 'thus', 'plant', 'plants', 'future', 'such', 'not', 'alone', 'most', 'within', 'into', 'but', 'between', 'we', 'is', 'or', 'also', 'was', 'can', 'be', 'use', 'kit', 'used', 'et', 'al', 'by', 'this', 'the', 'their', 'at', 'in', 'to', 'on', 'with', ',', '.', ':', '-', 'rna-seq', 'rnaseq', 'of', 'hiseq', 'hiseq2000', 'illumina', 'arabidopsis', 'thaliana', 'from', '<br><br>[title]', '<br><br>[description]', 'using', 'were', 'are', 'and', 'under', 'a', 'an', 'one', 'two', 'three', 'as', 'for', 'after', 'none', 'mapping', 'na', 'whole', 'chip-seq', 'paired']: # exclude these strings
+ if not k in d:
+ d[k] = 1
+ else:
+ d[k] += 1
+
+ sorted_tuples = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
+ first_items = [x[0] + ' (' + str(x[1]) + ')' for x in sorted_tuples]
+ return '<br>'.join(first_items)
+
+
+def word_freq3(lst): # for RNA-seq data, bag-of-words model
+ ''' similar to word_freq2, but may be faster '''
+ if lst == []:
+ return ''
+
+ bow = [collections.Counter(re.findall(r'\w+', s)) for s in lst] # bag of words
+ d = sum(bow, collections.Counter()) # frequency of each word
+ sorted_tuples = d.most_common(len(d))
+ exclude_lst = ['basis', 'requires', 'population', 'resolution', 'via', 'overall', 'elements', 'grown', 'expression', 'appears', 'total', 'have', 'here', 'of', 'just', 'type', 'transcriptomes', 'transcriptome', 'transcriptomic', 'transcription', 'transcriptional', 'report', 'during', 'diversity', 'investigated', 'library', 'per', 'libraries', '2500', '2000', '1210', '1001', '1107', 'dna', 'nextseq', 'nextseq500', 'seq', 'sequencing', 'sequencing;', 'end', 'al;', 'whereas', 'which', 'analyse', 'analyze', 'analyzer', 'quality', 'analysis', 'analyses', 'whole-genome', 'thus', 'plant', 'plants', 'future', 'such', 'not', 'alone', 'most', 'molecular', 'within', 'into', 'but', 'however', 'between', 'we', 'is', 'origin', 'or', 'also', 'was', 'can', 'be', 'been', 'use', 'kit', 'used', 'et', 'al', 'by', 'this', 'that', 'these', 'the', 'their', 'at', 'in', 'to', 'on', 'with', 'mrna', 'rna', 'rnas', 'rna-seq', 'rnaseq', 'of', 'hiseq', 'hiseq2000', 'illumina', 'arabidopsis', 'thaliana', 'from', 'roles', 'title', 'description', 'using', 'were', 'are', 'and', 'unknown', 'under', 'a', 'an', 'one', 'two', 'three', 'as', 'for', 'found', 'after', 'none', 'mapping', 'na', 'whole', 'chip-seq', 'play', 'paired', 'br', 'future', 'rowan', 'study', 'studies', 'may', 'sample', 'truseq', 'until', 'gene', 'genes', 'genetic', 'genome', 'genomes', 'units', 'its', 'yelina', 'data', 'set', 'tube', 'single-base', 'size', 'room', 'along', 'before', 'several', 'less', 'protocol', 'profiling', 'profiles', 'conditions', 'collection', 'complete', 'reveal', 'given', 'ii', 'isolated', 'described', 'describe', 'na', 'worldwide', 'accessions', 'identify', 'identification'] # exclude these words
+ first_items = [x[0] + ' (' + str(x[1]) + ')' for x in sorted_tuples if x[1] > 2 and len(x[0]) > 1 and not x[0].startswith('SRR') and not x[0].startswith('ERR') and not x[0].startswith('DRR') and not x[0].isdigit() and not ':' in x[0] and not x[0].lower() in exclude_lst]
+ return ' '.join(first_items)
+
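+# A minimal illustration of word_freq3 (input strings hypothetical):
+#   word_freq3(['leaf tissue cold stress', 'leaf tissue heat stress', 'leaf samples cold treatment'])
+# sums a bag-of-words Counter over the three strings; only words with frequency > 2
+# that pass the filters survive, so this would return 'leaf (3)'.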
+
+def get_rna_signal(s, d):
+ ''' extract RNA-seq signal information, and return the words ordered by frequency '''
+ lst = s.split()
+ result = []
+ MAX_WORDS = 60
+ if lst[0] == '.': # all RNA samples
+ return 'all available signals'
+ for x in lst: # x is an RNA sample ID
+ if x in d:
+ desc = d[x]['description']
+ desc_lst = re.split('<br>', desc)
+ short_lst = []
+ for part in desc_lst: # a new name, to avoid shadowing the outer loop variable x
+ short_lst.extend(part.split())
+ if len(short_lst) > MAX_WORDS: # keep only the first MAX_WORDS words; they are usually informative enough, and longer descriptions require more computation time
+ short_lst = short_lst[:MAX_WORDS]
+ break
+ # index = desc.find('<br>')
+ # if index > 0:
+ # desc = desc[:index]
+ result.append((' '.join(short_lst)).strip())
+ return word_freq3(result)
+
+
+def get_rna_signal2(s, d): # not very successful, and slow, so NOT used
+ ''' extract RNA-seq signal information, and return the words ordered by frequency '''
+
+ lst = s.split()
+
+ if lst[0] == '.': # all RNA samples
+ return 'all available signals'
+
+ text = ''
+ for x in lst: # x is an RNA sample ID
+ if x in d:
+ desc = d[x]['description']
+ text += desc.strip().rstrip('.') + '. '
+
+ rake = Rake(RAKE_STOPLIST_FILE)
+ keywords = rake.run(text)
+ return '<br>'.join( [ t[0] + ' (' + str(int(t[1])) + ')' for t in keywords ] )
+
+
+def replace_old_html_page(fname, edge_date):
+ ''' If the file fname needs updating, return True. '''
+ if not os.path.exists(fname): # if the file does not exist, it needs updating
+ return True
+
+ # Check all files AT2G43790_AT1G03080_0.html, AT2G43790_AT1G03080_1.html, AT2G43790_AT1G03080_2.html, etc. If any of them is too old, create a new one.
+ index = fname.rfind('_')
+ if index < 0:
+ print('html_network.py: %s has no underscore.' % (fname))
+ sys.exit()
+ fname_part = fname[:index]
+ for fn in glob.glob(fname_part + '_*.html'): # match sibling pages such as AT2G43790_AT1G03080_0.html, _1.html, etc.
+ file_date = datetime.fromtimestamp(os.path.getmtime(fn)).strftime('%Y%m%d')
+ if int(edge_date) - int(file_date) > 1: # the edge is more than one day newer than the existing html page
+ return True
+
+ return False
+
+
+def format_date(s):
+ ''' s in the form of 20170419. Return 2017-04-19 '''
+ s = s.strip()
+ if len(s) != 8:
+ return s
+ return s[0:4] + '-' + s[4:6] + '-' + s[6:]
+
+
+def make_html_page_for_condition(fname, tf_name, target_name, condRstr, condCstr, edge_date, subset): # important page ***
+
+ ### if the page already exists, and its information is up-to-date, then don't create it again (to save time)
+ if REGENERATE_ALL_EDGE_FILES == 'NO' and not replace_old_html_page(fname, edge_date):
+ return
+
+ d3_library = '<link href=\"./c3.min.css\" rel=\"stylesheet\" /><script src=\"./d3.min.js\"></script><script src=\"./c3.min.js\"></script><script src=\"./scatterplot.js\"></script><script src=\"./barchart.js\"></script>'
+ f = open(fname, 'w')
+ f.write('<html><head> %s <style> body {font-family:\"HelveticaNeue-Light\", \"Helvetica Neue Light\", \"Helvetica neue\"} </style></head><body>' % (d3_library))
+
+ ### RNA-seq
+ f.write('<h2>RNA-seq experiments</h2>')
+ part = os.path.splitext( os.path.basename(fname) )[0] # get file name without extension
+ id_lst = part.split('_')
+ gene1_file = os.path.join('json', id_lst[0] + '.json') # TF
+ gene2_file = os.path.join('json', id_lst[1] + '.json') # target
+
+ f.write('<p>TF is %s %s. Target is %s %s. Edge made on %s. Method: %s.</p>'% (id_lst[0], '' if tf_name == id_lst[0] else tf_name, id_lst[1], '' if target_name == id_lst[1] else target_name, format_date(edge_date), subset))
+ cond_lst_str = str(condRstr.split()) # insert to javascript function call code
+ rnaseq_info_file = os.path.basename(RNA_SEQ_INFO_DATABASE_JSON)
+ s = '<p><a id=\"myLink\" href=\"javascript:void(0);\" onclick=\"drawScatterPlot(\'%s\',\'%s\', \'%s\', %s);\">Click for gene expression scatter-plot</a></p> <p id=\"chart\"></p>' % (gene1_file, gene2_file, rnaseq_info_file, cond_lst_str)
+ f.write(s)
+
+ global glb_rna_seq_info_dict
+ #s = get_rna_signal(condRstr, glb_rna_seq_info_dict) # DISABLED since this is SLOWEST part
+ # if s.startswith('all available'):
+ # f.write('<h3>Signal</h3>' + '<p>' + s + '</p>')
+ # else:
+ # f.write('<h3>Signal</h3> <p>Note: words are ordered by frequency.</p>' + '<p>' + s + '</p>')
+
+ # f.write('<p>%s<p>' % (make_link_string_for_cond(condRstr, 'rnaseq')))
+
+ ### ChIP-seq
+ f.write('<h2>ChIP-seq experiments</h2>')
+ gene1_file = os.path.join('json2', id_lst[0] + '.json') # TF
+ gene2_file = os.path.join('json2', id_lst[1] + '.json' ) # target
+ cond_lst_str = str(condCstr.split())
+ s = '<a id=\"myLink2\" href=\"javascript:void(0);\" onclick=\"drawBarChart(\'%s\',%s);\">Click for plot</a> <p id=\"chart_bind\"></p>' % (gene2_file, cond_lst_str) # display binding strength
+ f.write(s)
+
+ global glb_chip_seq_info_dict
+ s = get_chip_signal(condCstr, glb_chip_seq_info_dict)
+ if s != '':
+ f.write('<h3>Signal</h3> <p>Note: words are ordered by frequency.</p>' + '<p>' + s + '</p>')
+ else:
+ f.write('<h3>Signal</h3>' + '<p>None.</p>')
+
+ s = get_chip_phenotype(condCstr, glb_chip_seq_info_dict)
+ f.write('<h3>Phenotype</h3>' + '<p>' + s + '</p>')
+
+ f.write('<p>%s</p>' % (make_link_string_for_cond(condCstr, 'chipseq')))
+ f.write('</body></html>')
+ f.close()
+
+
+def make_w2ui_table_page(fname, gene_str, download_str, dict_lst_regulates, dict_lst_regulatedby):
+ ''' each element in dict_lst_* must have the form {'strength': '', 'metric': '', 'geneid': '', 'genename': ''} '''
+ start_part = '''
+ <html>
+ <head>
+ <title>%s</title>
+ <script src="./jquery.min.for.w2ui.js"></script>
+ <script src="./w2ui.min.js"></script>
+ <link rel="stylesheet" type="text/css" href="./w2ui.min.css" />
+ <script>
+ $(function() {
+ ''' % (
+ gene_str)
+
+ # the first table showing targets of a TF
+ grid1 = '''
+ $('#grid1').w2grid({
+ name:'grid1',
+ header:'%s regulates',
+ show:{ footer:true, toolbar:true, header:true },
+ columns:[
+ { field:'recid', caption:'No.', size:'50px', sortable:true, resizable:true},
+ { field:'strength', caption:'Corr', size:'150px', sortable:true, resizable:true, searchable:true },
+ { field:'metric', caption:'Metric', size:'150px', sortable:true, resizable:true, searchable:true },
+ { field:'geneid', caption:'Gene ID', size:'150px', sortable:true, resizable:true, searchable:true },
+ { field:'genename', caption:'Gene name', size:'150px', sortable:true, resizable:true, searchable:true }
+ ],
+ records:
+ ''' % (
+ gene_str)
+
+ grid1 += '[\n'
+ i = 1
+ for d in dict_lst_regulates:
+ grid1 += ' {recid:%d, strength:\'%s\', metric:\'%s\', geneid:\'%s\', genename:\'%s\'},\n' % (i, d['strength'], d['metric'], d['geneid'], d['genename'])
+ i += 1
+ grid1 = grid1.rstrip('\n').rstrip(',')
+ grid1 += ']\n'
+ grid1 += '});\n'
+
+ # the second table showing TF's regulators
+ grid2 = '''
+ $('#grid2').w2grid({
+ name:'grid2',
+ header:'%s is regulated by',
+ show:{ footer:true, toolbar:true, header:true },
+ columns:[
+ { field:'recid', caption:'No.', size:'50px', sortable:true, resizable:true},
+ { field:'strength', caption:'Corr', size:'150px', sortable:true, resizable:true, searchable:true },
+ { field:'metric', caption:'Metric', size:'150px', sortable:true, resizable:true, searchable:true },
+ { field:'geneid', caption:'Gene ID', size:'150px', sortable:true, resizable:true, searchable:true },
+ { field:'genename', caption:'Gene name', size:'150px', sortable:true, resizable:true, searchable:true }
+ ],
+ records:
+ ''' % (
+ gene_str)
+
+ grid2 += '[\n'
+ i = 1
+ for d in dict_lst_regulatedby:
+ grid2 += ' {recid:%d, strength:\'%s\', metric:\'%s\', geneid:\'%s\', genename:\'%s\'},\n' % (i, d['strength'], d['metric'], d['geneid'], d['genename'])
+ i += 1
+ grid2 = grid2.rstrip('\n').rstrip(',')
+ grid2 += ']\n'
+ grid2 += '});\n'
+
+ end_part = '''
+ });
+ </script>
+ </head>
+ <body>
+ <div id="grid1" style="position:absolute; left:0px; width:49.9%%; height:99%%;">regulatee table</div>
+ <div id="grid2" style="position:absolute; right:0px; width:49.9%%; height:99%%;">regulator table</div>
+ <br/>
+ <div id="download">%s</div>
+ </body>
+ </html>
+ ''' % (
+ download_str)
+
+ result = start_part + grid1 + grid2 + end_part
+ # minify html
+ lst = re.split(r'\s{2,}', result)
+ result = ''.join(lst)
+ f = open(fname, 'w')
+ f.write(result)
+ f.close()
+
+
+def make_html_page(node, G, fname, agi2name_dict):
+ ''' Make html pages for node's successors and predecessors. '''
+ #f.write('<p><a href=%s>Go to index page</a></p>' % ('../summary.html'))
+ #download_str = '<a href=\'%s\'>Download all edges</a>' % ('./edges.txt.zip') add in future
+ download_str = ''
+ gname = get_name(node, agi2name_dict)
+ if node.strip() == gname.strip(): # id only
+ gene_str = node
+ else:
+ gene_str = '%s' % (node + ' ' + gname)
+ N = G.graph['max_rsubset_size']
+
+ predecessors = G.predecessors(node)
+ successors = G.successors(node)
+
+ d1 = {}
+ d2 = {}
+ for n in successors:
+ name = n.split()[0] + '.html'
+ d = G.get_edge_data(node, n) # n is node's target
+ for k in d.keys(): # can have multiple edges between two nodes
+ t = d[k]['action']
+ t = int(np.abs(d[k]['weight'])*10) * t # visualise edge strength by repeating the action symbol
+ R = ' '.join(d[k]['conditionR'])
+ C = ' '.join(d[k]['conditionC'])
+ RMSE = d[k]['rmse']
+ edge_date = d[k]['edge_date']
+ subset = d[k]['subset']
+ info_page = get_id(node) + '_' + get_id(n) + '_' + str(k) + '.html' # node is TF, n is target
+ info_page_path = os.path.join(os.path.dirname(fname), info_page)
+ tf_name = get_name(node, agi2name_dict)
+ target_name = get_name(n, agi2name_dict)
+ make_html_page_for_condition(info_page_path, tf_name, target_name, R, C, edge_date, subset) # ***
+
+ d1[info_page] = float(d[k]['metric'])
+ display_name = n + ' ' + ('' if target_name == n else target_name)
+ d2[info_page] = (t, name, display_name, RMSE)
+
+ # order edges by strength
+ regulatee_dict_lst = []
+ for tpl in sorted(d1.items(), key=operator.itemgetter(1), reverse=True):
+ k = tpl[0]
+ info_page = k
+ t = d2[k][0]
+ name = d2[k][1]
+ n = d2[k][2] # display name
+ RMSE = d2[k][3]
+ #s1 += '<a href=\'%s\' title=\'%s\'>%s</a> <a href=\'%s\'>%s</a><br/>' % (info_page, RMSE, t.rjust(12, '_'), name, n)
+ lst = n.split()
+ geneid = lst[0]
+ genename = '-'
+ if len(lst) > 1:
+ genename = lst[1]
+ regulatee_dict_lst.append({'strength': '<a href=%s title=%s>%s</a>' % (info_page, RMSE, t.rjust(12, '_')), 'geneid': '<a href=%s>%s</a>' % (name, geneid), 'genename': '%s' % (genename), 'metric': '%4.2f' % (d1[k])})
+
+
+ d1 = {}
+ d2 = {}
+ for n in predecessors:
+ name = n.split()[0] + '.html'
+ d = G.get_edge_data(n, node)
+ for k in d.keys():
+ t = d[k]['action']
+ t = int(np.abs(d[k]['weight'])*10) * t # visualise edge strength by repeating the action symbol
+ R = ' '.join(d[k]['conditionR'])
+ C = ' '.join(d[k]['conditionC'])
+ RMSE = d[k]['rmse']
+ edge_date = d[k]['edge_date']
+ subset = d[k]['subset']
+ info_page = get_id(n) + '_' + get_id(node) + '_' + str(k) + '.html' # n is TF, node is target
+ info_page_path = os.path.join(os.path.dirname(fname), info_page)
+ tf_name = get_name(n, agi2name_dict)
+ target_name = get_name(node, agi2name_dict)
+ #if not os.path.exists(info_page_path): # the tf->target page may already exist; if so, no need to make it again
+ make_html_page_for_condition(info_page_path, tf_name, target_name, R, C, edge_date, subset) # CHANGE ***
+
+ d1[info_page] = float(d[k]['metric'])
+ display_name = n + ' ' + ('' if tf_name == n else tf_name)
+ d2[info_page] = (t, name, display_name, RMSE)
+
+ # order edges by strength
+ regulator_dict_lst = []
+ for tpl in sorted(d1.items(), key=operator.itemgetter(1), reverse=True):
+ k = tpl[0]
+ info_page = k
+ t = d2[k][0]
+ name = d2[k][1]
+ n = d2[k][2]
+ RMSE = d2[k][3]
+ lst = n.split()
+ geneid = lst[0]
+ genename = '-'
+ if len(lst) > 1:
+ genename = lst[1]
+ regulator_dict_lst.append({'strength': '<a href=%s title=%s>%s</a>' % (info_page, RMSE, t.rjust(12, '_')), 'geneid': '<a href=%s>%s</a>' % (name, geneid), 'genename': '%s' % (genename), 'metric': '%4.2f' % (d1[k])})
+
+ make_w2ui_table_page(fname, gene_str, download_str, regulatee_dict_lst, regulator_dict_lst) # ***
+
+
+def num_lines(fname):
+ ''' Return number of lines in file fname. '''
+ f = open(fname)
+ n = len(f.readlines())
+ f.close()
+ return n
+
+
+## main program
+parser = OptionParser()
+parser.add_option('-f', '--file', dest='edge_file', help='edge file', metavar='FILE')
+parser.add_option('-r', '--rnaseqinfo', dest='rna_seq_info_file', default='', help='RNA-seq information file', metavar='FILE')
+parser.add_option('-c', '--chipseqinfo', dest='chip_seq_info_file', default='', help='ChIP-seq information file', metavar='FILE')
+parser.add_option('-n', '--networkpara', dest='network_para_file', default='', help='Network parameter file', metavar='FILE')
+parser.add_option('-i', '--includeedgetype', dest='include', default='all', help='include edge types')
+parser.add_option('-s', '--showcondition', dest='cond', action="store_true", default=False, help='show correlated conditions')
+(options, args) = parser.parse_args()
+
+glb_param_dict = make_global_param_dict(options.network_para_file)
+
+agi2name_dict = make_gene_name_AGI_map_dict(glb_param_dict['GENE_ID_AND_GENE_NAME'])
+
+total_num_edges = num_lines(options.edge_file)
+
+
+# Make summary.html page
+G = build_network_from_file(options.edge_file)
+if not os.path.isdir(DIR_NAME):
+ os.makedirs(DIR_NAME)
+
+# Make RNA-seq information page
+if options.rna_seq_info_file != '':
+ glb_rna_seq_info_dict = make_rna_seq_info_dict(options.rna_seq_info_file)
+ make_rna_seq_info_html_page(os.path.join(DIR_NAME, RNA_SEQ_INFO_HTML_PAGE), glb_rna_seq_info_dict)
+
+# Make ChIP-seq information page
+if options.chip_seq_info_file != '':
+ glb_chip_seq_info_dict = make_chip_seq_info_dict(options.chip_seq_info_file)
+ make_chip_seq_info_html_page(os.path.join(DIR_NAME, CHIP_SEQ_INFO_HTML_PAGE), glb_chip_seq_info_dict)
+
+# Fill in static index page
+findex = open(INDEX_PAGE, 'w')
+findex.write('<html><head><style> body {font-family:\"HelveticaNeue-Light\", \"Helvetica Neue Light\", \"Helvetica neue\"} table {table-layout: fixed; width: 800px;}</style></head><body>')
+curr_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+s = '<h2>All genes considered</h2>'
+s += '<p>Last updated at %s. A total of %d edges.</p>' % (curr_time, total_num_edges)
+for n in sorted(G.nodes()): # for each node in the network, find its neighbours.
+ t = n.split()[0] + '.html'
+ filepath = os.path.join(DIR_NAME, t)
+
+ successors = G.successors(n)
+ predecessors = G.predecessors(n)
+
+ s1 = ''
+ for sn in successors:
+ t1 = sn.split()[0] + '.html'
+ filepath1 = os.path.join(DIR_NAME.split('/')[-1], t1)
+ s1 += '<a href=\'%s\'>%s</a><br>' % (filepath1, sn)
+
+ s2 = ''
+ for pn in predecessors:
+ t2 = pn.split()[0] + '.html'
+ filepath2 = os.path.join(DIR_NAME.split('/')[-1], t2)
+ s2 += '<a href=\'%s\'>%s</a><br>' % (filepath2, pn)
+
+ s += '<p>Gene:<a href=\'%s\'>%s</a><br>' % (filepath, n)
+ s += '<table border=1><tr><td width=400px>Regulated by %d</td><td width=400px>Regulates %d</td></tr>' % (len(predecessors), len(successors))
+ s += '<tr> <td valign=\"top\">%s</td> <td valign=\"top\">%s</td></tr>' % (s2, s1)
+ s += '</table>'
+ s += '</p>'
+
+ make_html_page(n, G, filepath, agi2name_dict)
+
+findex.write(s)
+findex.write('</body></html>')
+findex.close()
+
+# copy auxiliary folders and files
+if os.path.isdir(JSON_DIR):
+ cmd = 'cp -r %s %s' % (JSON_DIR, DIR_NAME)
+ os.system(cmd)
+else:
+ print('[WARNING] html_network.py: JSON directory omitted (needed for displaying gene expression).')
+
+if os.path.isdir(JSON_DIR2):
+ cmd = 'cp -r %s %s' % (JSON_DIR2, DIR_NAME)
+ os.system(cmd)
+else:
+ print('[WARNING] html_network.py: JSON directory 2 omitted (needed for displaying binding).')
+
+if os.path.exists(RNA_SEQ_INFO_DATABASE_JSON):
+ cmd = 'cp %s %s' % (RNA_SEQ_INFO_DATABASE_JSON, DIR_NAME)
+ os.system(cmd)
+else:
+ print('[WARNING] html_network.py: %s does not exist. Scatterplots may not work properly.' % (RNA_SEQ_INFO_DATABASE_JSON))
+
+for fname in C3_FILES:
+ fpath = os.path.join(C3_DIR, fname)
+ if os.path.exists(fpath):
+ cmd = 'cp %s %s' % (fpath, DIR_NAME)
+ os.system(cmd)
+ else:
+ print('[WARNING] html_network.py: Omitted %s. Scatter plots may not work without this file.' % (fpath))
+
+for fname in W2UI_FILES:
+ fpath = os.path.join(W2UI_DIR, fname)
+ if os.path.exists(fpath):
+ cmd = 'cp %s %s' % (fpath, DIR_NAME)
+ os.system(cmd)
+ else:
+ print('[WARNING] html_network.py: Omitted %s. Tables may not work without this file.' % (fpath))
+
+#print('html_network.py done!')
diff --git a/Code/json_test.py b/Code/json_test.py
new file mode 100644
index 0000000..c4ca7bd
--- /dev/null
+++ b/Code/json_test.py
@@ -0,0 +1,8 @@
+import json
+old_json = '../Data/information/rnaseq_info_database.json' # generated by parse_ena_xml.py
+with open(old_json) as json_data:
+ json_dict = json.load(json_data)
+ for k in json_dict:
+ print(k)
+ # if k in tissue_dict:
+ # json_dict[k]['tissue'] = tissue_dict[k]
diff --git a/Code/knn_classify.R b/Code/knn_classify.R
new file mode 100644
index 0000000..46df992
--- /dev/null
+++ b/Code/knn_classify.R
@@ -0,0 +1,79 @@
+## Usage: change the file names in the section "# Parameters".
+## Purpose: Classify tissues using KNN, after first using tSNE to reduce each sample to 2 dimensions.
+##
+## 7 June 2017, slcu, hui
+## Last modified 20 June 2017, slcu, hui
+
+# Parameters
+TRAIN_DATA_FILE <- '../Data/history/expr/TPM.txt'
+TRAIN_CLASS_FILE <- '../Data/information/experiment.and.tissue.2.txt'
+
+K <- 1
+PERPLEXITY <- 50 # for tSNE
+
+# Load data
+#cat('Load TPM.txt ...\n')
+X <- read.table(TRAIN_DATA_FILE, header=TRUE, check.names=FALSE)
+all.id <- X$gene_id
+X$gene_id <- NULL # remove column gene_id
+row.names(X) <- all.id # add row names
+
+Z <- read.table(TRAIN_CLASS_FILE, header=TRUE, check.names=FALSE, sep='\t')
+labels <- as.vector(Z$suggested.tissue)
+
+unknown.index <- which(labels == "unknown") # remove unknowns
+
+X.2 <- X[, unknown.index] # test data (label unknown)
+labels <- labels[-unknown.index]
+X <- X[, -unknown.index]
+
+
+labels <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories
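+# e.g. a label such as "root.tip" (hypothetical) becomes "root"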
+sul <- sort(unique(labels)) # sorted unique labels
+colors <- rainbow(length(sul))
+names(colors) <- sul
+
+# Filter rows
+#cat('Filter ...\n')
+rowsum.tau <- dim(X)[2] # the gene's TPM value is at least 1 on average
+sd.val <- apply(X, 1, sd)
+sd.tau <- summary(sd.val)[3] # median sd; genes whose expression varies least are to be filtered (note: sd.tau itself is not used below)
+index <- rowSums(X) > rowsum.tau & sd.val > 10 # keep genes with mean TPM > 1 and sd > 10
+
+n.train <- dim(X)[2]
+X.3 <- log(cbind(X[index,], X.2[index,]) + 1)
+n.test <- dim(X.2)[2]
+n <- dim(X.3)[2]
+
+# Learn
+library(Rtsne)
+library(class)
+set.seed(100)
+#cat('Dimensionality reduction using tSNE ...\n')
+tsne <- Rtsne(t(X.3), check_duplicates=F, dims=2, perplexity=PERPLEXITY, theta=0.5, verbose=FALSE, max_iter=600) # dimensionality reduction
+train.data <- cbind(tsne$Y[1:n.train,1], tsne$Y[1:n.train,2])
+
+# Train and test on the same data
+cl <- factor(labels) # class labels
+result <- knn(train.data, train.data, cl, k=K, prob=TRUE)
+#cat(sprintf('Training accuracy: %4.3f.\n', sum(as.vector(cl) == as.vector(result))/length(cl)))
+
+# Cross-validation on training data
+result <- knn.cv(train.data, cl, k=K, prob = TRUE)
+#cat(sprintf('Test accuracy (leave-one-out cross-validation): %4.3f.\n', sum(as.vector(cl) == as.vector(result))/length(cl)))
+
+# If test data is available, make prediction.
+test.data <- cbind(tsne$Y[(n.train+1):n,1], tsne$Y[(n.train+1):n,2])
+result <- knn(train.data, test.data, cl, k=K, prob=TRUE)
+df <- data.frame(sample.name=colnames(X.2), predicted.tissue=result)
+write.table(df, '../Data/temp/predicted.label.txt', quote=F, sep='\t', row.names=F)
+
+# Plot
+#pdf('../Data/temp/fig.pdf')
+#par(mar=c(5.1, 4.1, 4.1, 8.8), xpd=TRUE)
+#plot(tsne$Y, xlab='tsne X', ylab='tsne Y', cex=.4, col='grey')
+#points(tsne$Y[1:n.train,1], tsne$Y[1:n.train,2], col=colors[labels], cex=.4)
+##text(tsne$Y[,1], tsne$Y[,2], labels, cex=0.1)
+#legend("topright", inset=c(-0.4,0), sul, col=colors, pch=16)
+#dev.off()
+
diff --git a/Code/local_network.py b/Code/local_network.py
new file mode 100644
index 0000000..216832f
--- /dev/null
+++ b/Code/local_network.py
@@ -0,0 +1,1114 @@
+# Usage: python local_network.py
+#
+# Put this file under directory Code
+# Prepare a parameter_for_buildCmatrix.txt and put it under directory Data/parameter
+# Execute the above command regularly.
+#
+# Edit Webapp/start_webapp.py: uncomment app.run(debug=True) and comment out the previous app.run(). To display the network, cd Webapp && python start_webapp.py.
+# Enter http://127.0.0.1:5000 in a web browser.
+#
+# Note:
+# Tested on a 32-core server at slcu running Ubuntu. The program may slow down a personal computer.
+# The program will check that the following required packages are installed.
+# Required python packages: numpy, networkx, flask.
+# Required R packages: rjson, mixtools, Rtsne.
+# To install a python package, use the command pip install numpy.
+# To install an R package, issue this command in R: install.packages('Rtsne', dependencies=TRUE, repos='http://cran.rstudio.com/')
+# In a Mac, use export PYTHONPATH="/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages" to make installed python packages usable.
+#
+# Purpose: make a gene regulatory network locally. Periodically (e.g., per day) run this script to see if the network needs update. If yes, update it.
+# You need to prepare and edit parameter_for_buildCmatrix.txt manually.
+#
+# Created 1 July 2017, hui.lan@slcu.cam.ac.uk, slcu
+# Last modified 4 July 2017, hui lan, slcu
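+#
+# Example (hypothetical) crontab entry to run the update daily at 02:00:
+#   0 2 * * * cd /path/to/Code && python local_network.py >> ../Data/log/cron.log 2>&1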
+
+
+import os, sys
+import numpy as np
+import glob
+import time
+import subprocess
+from datetime import datetime
+from param4net import make_global_param_dict, get_key_value
+
+FORCE_MAKE_EDGES = 'NO'
+
+CODE_DIR = os.getcwd() # Get current working directory. It is important that you execute this script under this directory.
+
+# DON'T CHANGE THE FOLLOWING PATHS AND NAMES
+HISTORY_DIR = '../Data/history/edges/many_targets' # each edge file contains edges for many targets
+HISTORY_DIR2 = '../Data/history/edges/one_target' # edges.txt.* files are here, all edge files have the name edges.txt.*, the leading string 'edges.txt' must be present.
+FILE_TIMESTAMP = '../Data/log/file_timestamp.txt'
+SAMPLE_SIZE_FILE = '../Data/log/total.samples.txt' # each line contains a date and the number of samples on and after that date
+TEMP_DIR = '../Data/temp'
+
+PARAMETER_FOR_BUILDCMATRIX = '../Data/parameter/parameter_for_buildCmatrix.txt'
+PARAMETER_FOR_BUILDRMATRIX = '../Data/parameter/parameter_for_buildRmatrix.txt'
+PARAMETER_FOR_NET = '../Data/parameter/parameter_for_net.txt'
+EDGE_FILE = '../Data/history/edges/edges.txt'
+BINDING_FILE = '../Data/history/bind/binding.txt'
+TPM_FILE = '../Data/history/expr/TPM.txt'
+LOG_FILE = '../Data/log/update.network.log.txt'
+NEW_OR_UPDATED_CHIP_FILE = '../Data/log/new.or.updated.chip.file.txt'
+RNA_SEQ_INFO_DATABASE = '../Data/information/rnaseq_info_database.txt'
+RNA_SEQ_INFO_DATABASE_JSON = '../Data/information/rnaseq_info_database.json'
+
+FILE_LIST_TO_CHECK = [PARAMETER_FOR_BUILDCMATRIX,
+ PARAMETER_FOR_BUILDRMATRIX,
+ PARAMETER_FOR_NET,
+ EDGE_FILE,
+ BINDING_FILE,
+ TPM_FILE]
+
+## help functions
+
+def ok_webapp_dir(para_for_net):
+ ''' we are now under Code '''
+ glb_param_dict = make_global_param_dict(para_for_net)
+ # genes.json is not here, create one
+ if not os.path.exists('../Webapp/static/json/genes.json'):
+ print('genes.json not here, make one')
+ cmd = 'python text2json.py %s > ../Webapp/static/json/genes.json' % (glb_param_dict['GENE_ID_AND_GENE_NAME'])
+ os.system(cmd)
+
+def make_paths(s):
+ if not os.path.isdir(s):
+ os.makedirs(s)
+
+def make_important_dirs():
+ make_paths('../Data/history/edges/many_targets')
+ make_paths('../Data/history/edges/one_target')
+ make_paths('../Data/log')
+ make_paths('../Data/information')
+ make_paths('../Data/temp')
+ make_paths('../Data/parameter')
+ make_paths('../Data/R/Mapped')
+ make_paths('../Data/R/Mapped/public')
+ make_paths('../Data/R/Mapped/inhouse')
+ make_paths('../Data/R/Mapped/other')
+ make_paths('../Data/R/Raw')
+ make_paths('../Data/C/Mapped')
+ make_paths('../Data/C/Mapped/Columns')
+ make_paths('../Data/C/Raw')
+ make_paths('../Data/history/edges')
+ make_paths('../Data/history/bind')
+ make_paths('../Data/history/expr')
+ make_paths('../Webapp/static/json')
+ make_paths('../Webapp/static/edges')
+ make_paths('../Webapp/templates')
+
+def num_line(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ return len(lines)
+
+def record_new_or_updated_chip_ids(lst, fname):
+ f = open(fname, 'a')
+ curr_time = datetime.now().strftime('%Y%m%d')
+ for x in lst:
+ f.write('%s\t%s\n' % (x, curr_time))
+ f.close()
+
+def write_log_file(s, fname):
+ f = open(fname, 'a')
+ print(s)
+ curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
+ s = '[' + curr_time + ']: ' + s
+ if not '\n' in s:
+ s += '\n'
+ f.write(s)
+ f.close()
+
+def write_sample_size_file(sample_size_file, curr_date, tpm_sample_size):
+ if not os.path.exists(sample_size_file):
+ f = open(sample_size_file, 'w')
+ else:
+ f = open(sample_size_file, 'a')
+ f.write('%s\t%s\n' % (curr_date, tpm_sample_size))
+ f.close()
+
+def age_of_file(fname):
+ st = os.stat(fname)
+ days = (time.time() - st.st_mtime)/(3600*24.0)
+ return days
+
+def hold_on(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines[:100]: # check the first 100 lines for HOLDON
+ line = line.strip()
+ if line.startswith('%%HOLDON=YES'):
+ return True
+ return False
+
+def all_files_present(lst):
+ missing_file_lst = []
+ for path in lst: # lst is a list of file names to check
+ if not os.path.exists(path):
+ if 'edges.txt' in path:
+ write_log_file('WARNING: must have %s to update network.' % (path), LOG_FILE)
+ missing_file_lst.append(path)
+ return missing_file_lst
+
+def record_file_time(lst):
+ f = open(FILE_TIMESTAMP, 'w')
+ s = ''
+ for x in lst:
+ if os.path.exists(x):
+ s += '%s\t%d\n' % (os.path.basename(x), int(os.stat(x).st_mtime))
+ else:
+ s += '%s\t%d\n' % (os.path.basename(x), 0)
+ f.write(s)
+ f.close()
+
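+# The timestamp file written above has one tab-separated line per tracked file,
+# e.g. (value hypothetical): TPM.txt<TAB>1499078400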
+
+def read_file_timestamp(ftimestamp):
+ d = {}
+ f = open(ftimestamp)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ fname = lst[0]
+ t = lst[1]
+ d[fname] = int(t)
+
+ f.close()
+ return d
+
+def file_updated(fname, d):
+ ft = int(os.stat(fname).st_mtime)
+ k = os.path.basename(fname)
+ return ft > d[k]
+
+
+def get_updated_files(lst, d):
+ result = []
+ for x in lst:
+ if file_updated(x, d):
+ result.append(os.path.basename(x))
+ return result
+
+
+def not_bad_line(s):
+ if s.strip() == '':
+ return False
+ if 'WARNING' in s:
+ return False
+ if 'number' in s:
+ return False
+ if 'Need' in s:
+ return False
+ if 'Error' in s:
+ return False
+ if 'Too' in s:
+ return False
+ if not s.startswith('AT'): # comment out this test if the organism is not Arabidopsis CHANGE
+ return False
+ return True
+
+
+def get_rcond_string(s):
+ s = s.strip()
+ if s.startswith('R0000DRR') or s.startswith('R0000ERR') or s.startswith('R0000SRR'):
+ s = s.replace('R0000DRR', 'R0DRR').replace('R0000SRR', 'R0SRR').replace('R0000ERR', 'R0ERR') # remove extra 0's introduced in earlier edges.txt
+ return s
+
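+# Example (ids hypothetical): get_rcond_string('R0000SRR1042 R0000ERR2001') returns
+# 'R0SRR1042 R0ERR2001', normalising the zero-padded ids used in older edges.txt files.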
+
+def compute_metric(d):
+ '''
+ d has the form {'freq':0, 'total_RNAseq_ID':0, 'sum_abs_R':0,
+ 'most_recent_date':'20161201'}.
+
+ The metric combines several quantities: the average absolute
+ correlation coefficient, the average number of RNA-seq samples used,
+ the frequency of this edge, and its recentness. It is used to rank
+ edges, so large correlation coefficients based on many RNA-seq
+ samples that appear frequently and recently are ranked on top.
+
+ Formula: avg.abs(r) * log10(avg.RN) * log2(F+1) * exp(-(CurrentDate-MostRecentDate)/S)
+
+ '''
+ avg_abs_r = 1.0 * d['sum_abs_R'] / d['freq']
+ log_avg_RN = np.log10(1.0 * d['total_RNAseq_ID'] / d['freq'])
+ log_freq = np.log2(d['freq'] + 1)
+ S = 200.0 # strength of memory, larger value means better memory
+ recentness = np.exp(1.0*(int(d['most_recent_date']) - int(datetime.now().strftime('%Y%m%d')))/S) # a monotonic decreasing function exp(-t/S), exponential nature of forgetting
+ return avg_abs_r * log_avg_RN * log_freq * recentness
+
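+# Worked example (numbers hypothetical): with freq=4, total_RNAseq_ID=4000, sum_abs_R=3.2
+# and a YYYYMMDD integer difference of 100 between today and most_recent_date:
+#   avg_abs_r = 0.8, log10(4000/4) = 3.0, log2(4+1) ~ 2.32, recentness = exp(-100/200) ~ 0.61,
+# giving a metric of roughly 0.8 * 3.0 * 2.32 * 0.61 ~ 3.4.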
+
+def make_sample_size_dict(fname):
+ if not os.path.exists(fname):
+ return {}, [] # keep the (dict, sorted keys) return contract even when the file is missing
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ for line in lines:
+ line = line.strip()
+ if line != '' and not line.startswith('#'):
+ lst = line.split('\t')
+ d[lst[0]] = int(lst[1])
+ return d, sorted(d.keys())
+
+
+def get_sample_size(d, sorted_keys, day):
+ if len(d) == 0:
+ return 1200 # a default sample size, CHANGE
+
+ for x in sorted_keys:
+ if x >= day:
+ return d[x]
+
+ k = sorted_keys[-1] # last key, latest date
+ return d[k]
+
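+# e.g. with d = {'20170101': 1000, '20170601': 1200} (hypothetical),
+# get_sample_size(d, sorted(d.keys()), '20161231') returns 1000 (first date >= day),
+# and get_sample_size(d, sorted(d.keys()), '20171201') falls back to the latest entry, 1200.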
+
+def merge_edges(file_lst, edge_fname, sample_size_file):
+
+ '''
+ Write to edge_fname.
+
+ file_lst -- a list of edges.txt.*, computed using different sets
+ of data. Each edges.txt file has the same format.
+
+ This function merges all edges together. Correlations based on
+ larger numbers of RNA-seq experiments are favoured. Each edge has
+ the following fields: target_gene, tf_gene, value, type,
+ RNA_experiments, ChIP_experiments, loglik and date; a new field,
+ metric, is computed and appended.
+
+ '''
+
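+ # Example input line (tab-separated; ids and values hypothetical):
+ #   AT1G01060<TAB>AT5G61380<TAB>0.82<TAB>mix<TAB>R0SRR1042 R0SRR2001<TAB>C0001100007100<TAB>-999.0<TAB>20170501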
+ sample_size_dict, sample_size_keys = make_sample_size_dict(sample_size_file)
+
+ ll_dict = { # maps log-likelihood codes to tissue or method
+ '-999.0':'seedling',
+ '-998.0':'meristem',
+ '-997.0':'root',
+ '-996.0':'leaf',
+ '-995.0':'flower',
+ '-994.0':'shoot',
+ '-993.0':'seed',
+ '-992.0':'stem',
+ '-990.0':'aerial',
+ '-991.0':'hclust',
+ '-1000.0':'wedge (post.translation.3)',
+ '-1001.0':'wedge (post.translation.4)',
+ '.':'all'
+ }
+
+ d = {} # holds edges; the best edge is kept for each of mix positive, mix negative and all
+ d_mix_pos = {} # for computing rank metric 'freq':0, 'total_RNAseq_ID':0, 'sum_abs_R':0, 'most_recent_date':'20161201'
+ d_mix_neg = {}
+ d_all = {}
+ d_tissue = {} # hold subset information (tissue or method)
+ max_rcond_size = 10
+ for fname in file_lst: # check each edge file
+ if not os.path.exists(fname):
+ write_log_file('%s missing.' % (fname), LOG_FILE)
+ continue
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t') # fields are separated by TABs
+ if not_bad_line(line) and len(lst) >= 7: # an edge line must have at least 7 fields
+
+ target_id = lst[0].split()[0] # AGI id only, no gene name
+ tf_id = lst[1].split()[0] # same as above
+ k = target_id + '_' + tf_id # key
+ date = '20161201' # edge creation date. if there's no date field, use 20161201
+ rcond_s = get_rcond_string(lst[4]) # remove extra 0's from old (obsolete) RNA-seq experiment ids
+ loglik = lst[6]
+ t = lst[3] # current line edge type, mix or all
+ if '=' in loglik:
+ loglik = loglik.split('=')[1]
+ if t == 'mix' and loglik == '.': # post-translation case, i.e., post.translation.3, see create_edges4.py
+ loglik = '-1000.0'
+
+ tissue_or_method_name = ll_dict[loglik] if loglik in ll_dict else 'MixReg'
+
+ if len(lst) == 8: # has a date field
+ date = lst[7]
+
+ # record for each edge its tissue or method information, e.g., the edge is based on a certain tissue or derived by a certain method; ll_dict contains the mapping.
+ if not k in d_tissue:
+ d_tissue[k] = [tissue_or_method_name]
+ else:
+ d_tissue[k].append(tissue_or_method_name)
+
+ if not k in d: # first edge to be added between two nodes: target_id and tf_id
+ d[k] = [{'target':lst[0], 'tf':lst[1], 'value':lst[2], 'type':lst[3], 'rcond':rcond_s, 'ccond':lst[5], 'loglik':loglik, 'date':date}] # a list of dictionaries, {target, tf, value, type, rcond, ccond, loglik, date}
+ else: # two nodes can have multiple edges. an edge already exists, determine whether or not add this new edge.
+ v = lst[2] # current line value
+ rcond = rcond_s
+ ccond = lst[5]
+ len_r = len(rcond.split()) # number of RNA-seq experiment ids in current line. if it is a dot, then length is 1
+ ignore = False # assume this edge is to be included; set to True below if it should be skipped
+
+ for i in range(len(d[k])): # search each of already added edges. If current one is better, replace the old one
+ xd = d[k][i] # xd is a dictionary, representing an edge {target, tf, type, value, rcond, ccond, loglik, date}
+ fv = float(v) # value (confidence) of the candidate edge
+ fx = float(xd['value'])
+ len_rx = len(xd['rcond'].split())
+ if xd['type'] == t and t == 'all': # edge type is all, use most recent result as the edge is based on all RNA-seq experiments.
+ ignore = True # either ignore it or replace the old with this one
+ if date > xd['date']:
+ d[k][i]['value'] = v
+ d[k][i]['date'] = date
+ break
+
+ if xd['type'] == t and t == 'mix': # current line represents a better edge, i.e., larger r value and based on more RNA-seq experiments
+ if fv*fx > 0 and abs(fv*np.log10(len_r)) > abs(fx*np.log10(len_rx)): # fv*fx > 0 means they have the same sign.
+ d[k][i]['value'] = v
+ d[k][i]['rcond'] = rcond
+ d[k][i]['ccond'] = ccond
+ d[k][i]['loglik'] = loglik
+ d[k][i]['date'] = date
+ ignore = True
+ break
+ elif fv*fx > 0: # current line has the same sign but is based on fewer RNA-seq experiments, so ignore it.
+ ignore = True
+ break
+
+ if v == xd['value'] and len_r == len_rx: # ChIPs are updated, but based on the same number of RNA-seq experiments
+ if xd['ccond'] != ccond:
+ merged_cond = xd['ccond'] + ' ' + ccond
+ merged_cond_str = ' '.join(sorted(list(set(merged_cond.split()))))
+ d[k][i]['ccond'] = merged_cond_str
+
+ if xd['date'] < date:
+ d[k][i]['date'] = date
+ d[k][i]['loglik'] = loglik
+ ignore = True
+ break
+
+ if ignore == False:
+ d[k].append({'target':lst[0], 'tf':lst[1], 'value':lst[2], 'type':lst[3], 'rcond':rcond_s, 'ccond':lst[5], 'loglik':loglik, 'date':date})
+
+ # fill d_mix_pos, d_mix_neg and d_all
+ curr_rcond_size = len(rcond_s.split())
+ if t == 'mix':
+ if float(lst[2]) >= 0: # lst[2] is value
+ if not k in d_mix_pos:
+ d_mix_pos[k] = {'freq':1, 'total_RNAseq_ID':curr_rcond_size, 'sum_abs_R':abs(float(lst[2])), 'most_recent_date':date}
+ else:
+ d_mix_pos[k]['freq'] += 1
+ d_mix_pos[k]['total_RNAseq_ID'] += curr_rcond_size
+ d_mix_pos[k]['sum_abs_R'] += abs(float(lst[2]))
+ if date > d_mix_pos[k]['most_recent_date']:
+ d_mix_pos[k]['most_recent_date'] = date
+ else:
+ if not k in d_mix_neg:
+ d_mix_neg[k] = {'freq':1, 'total_RNAseq_ID':curr_rcond_size, 'sum_abs_R':abs(float(lst[2])), 'most_recent_date':date}
+ else:
+ d_mix_neg[k]['freq'] += 1
+ d_mix_neg[k]['total_RNAseq_ID'] += curr_rcond_size
+ d_mix_neg[k]['sum_abs_R'] += abs(float(lst[2]))
+ if date > d_mix_neg[k]['most_recent_date']:
+ d_mix_neg[k]['most_recent_date'] = date
+
+
+ if curr_rcond_size > max_rcond_size:
+ max_rcond_size = curr_rcond_size
+
+ if t == 'all':
+ all_rcond_size = get_sample_size(sample_size_dict, sample_size_keys, date)
+ if not k in d_all:
+ d_all[k] = {'freq':1, 'total_RNAseq_ID':all_rcond_size, 'sum_abs_R':abs(float(lst[2])), 'most_recent_date':date}
+ else:
+ d_all[k]['freq'] += 1
+ d_all[k]['total_RNAseq_ID'] += all_rcond_size
+ d_all[k]['sum_abs_R'] += abs(float(lst[2]))
+ if date > d_all[k]['most_recent_date']:
+ d_all[k]['most_recent_date'] = date
+
+ # rewrite all total_RNAseq_ID values; without a sample-size file the accumulated count is only a rough estimate, so derive it from the edge frequency and the largest observed rcond size
+ if len(sample_size_dict) == 0: # sample size file is not available
+ for k in d_all:
+ d_all[k]['total_RNAseq_ID'] = d_all[k]['freq'] * max_rcond_size
+
+ f = open(edge_fname, 'w')
+ for k in sorted(d.keys()):
+ lst = d[k] # a list of dictionaries
+ tissue_or_method = ', '.join(list(set(d_tissue[k])))
+ for xd in lst:
+ if xd['type'] == 'all':
+ metric = '%4.2f' % ( compute_metric(d_all[k]) )
+ s = '\t'.join([xd['target'], xd['tf'], xd['value'], xd['type'], xd['rcond'], xd['ccond'], xd['loglik'], xd['date'], metric, tissue_or_method]) + '\n'
+ if xd['type'] == 'mix':
+ if float(xd['value']) >= 0:
+ metric = '%4.2f' % ( compute_metric(d_mix_pos[k]) )
+ else:
+ metric = '%4.2f' % ( compute_metric(d_mix_neg[k]) )
+ s = '\t'.join([xd['target'], xd['tf'], xd['value'], xd['type'], xd['rcond'], xd['ccond'], xd['loglik'], xd['date'], metric, tissue_or_method]) + '\n'
+ f.write(s)
+ f.close()
+
+def get_value(s, delimit):
+ lst = s.split(delimit, 1) # only split at the first delimit
+ return lst[1].strip()
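+# e.g. get_value('DESCRIPTION: seedling ChIP', ':') returns 'seedling ChIP' (field value hypothetical)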
+
+def make_data_dict(fname):
+ d = {} # keep a list of chip id's, such as C0001100007100
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#'):
+ continue
+ if line.startswith('@'):
+ s = line[line.rfind('@')+1:]
+ s = s.strip()
+ if s in d:
+ write_log_file('In make_data_dict: ID %s duplicated' % (s), LOG_FILE)
+ sys.exit()
+ d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+ elif line.startswith('PROTEN_NAME:'):
+ d[s]['PROTEN_NAME'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_ID:'):
+ d[s]['PROTEIN_ID'] = get_value(line, ':')
+ elif line.startswith('DATA_NAME:'):
+ d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+
+ return d
+
+
+def get_bad_chip_ids(d):
+ ''' A ChIP id is bad if its NOTE field contains 'obsolete'. '''
+ lst = []
+ for k in d:
+ note = d[k]['NOTE'].lower()
+ if 'obsolete' in note:
+ lst.append(k)
+ return lst
+
+def get_update_date_chip_ids(d):
+ ''' Return a list of ChIP ids with update in the NOTE: field. '''
+ ud = {}
+ for k in d:
+ note = d[k]['NOTE'].lower()
+ if 'update' in note:
+ if 'update:' in note: # has a specific date, e.g., 20170101
+ idx = note.find('update:')
+ udate = note[idx+7:idx+15] # get date string yyyymmdd
+ else: # if only update, but no specific date, assume it is 20161201
+ udate = '20161201'
+ ud[k] = udate
+ return ud
+
+
+def get_chip_ids_from_edge_file(fname):
+ lst = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ l = line.split('\t')
+ c = l[5] # ids for ChIP experiments
+ l2 = c.split()
+ lst.extend(l2)
+
+ f.close()
+ return list(set(lst))
+
+def get_nonexistent_chip_ids(d, fname):
+ ''' fname -- edges.txt. Get ids that are in fname, but not in d. '''
+ lst = []
+ ids = get_chip_ids_from_edge_file(fname)
+ for k in ids:
+ if not k in d:
+ lst.append(k)
+ return lst
+
+
+def created_after_update(x, created_date, update_date_chip_ids):
+ ''' the edge is created after the ChIP is updated.'''
+ if not x in update_date_chip_ids: # update_date_chip_ids does not contain id x, which means x has not been updated since the ChIP id was created
+ return True
+ else:
+ return created_date >= update_date_chip_ids[x] # check whether edge is created after x is updated.
+
+
+def rm_chip_ids_from_edge_file(fname, bad_id_lst, update_date_chip_ids):
+ ''' fname -- edges.txt '''
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ f = open(fname, 'w')
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ c = lst[5] # chip experiment ids
+ l = c.split() # list of chip experiment ids
+ created_date = lst[7]
+ l2 = [] # for keeping good chip experiment ids
+ for x in l:
+ if not x in bad_id_lst and created_after_update(x, created_date, update_date_chip_ids): # if an edge is created before this ChIP experiment is updated, then the binding information and thus the edge may be no longer valid. If this ChIP experiment is the sole evidence of binding, then this edge is ignored.
+ l2.append(x)
+ if l2 != []: # still have some chips
+ lst[5] = ' '.join(l2)
+ f.write('\t'.join(lst) + '\n')
+ f.close()
+
+def make_file(fname, s):
+ f = open(fname, 'w')
+ f.write(s)
+ f.close()
+
+
+def get_chip_ids(fname):
+ if not os.path.exists(fname):
+ print('ERROR: %s not exists.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ head_line = lines[0].strip()
+ lst = head_line.split('\t')
+ return lst[1:]
+
+
+def is_recent(note, ndays): # updated within the last ndays days
+ idx = note.find('update:')
+ if idx == -1: # not found
+ return False
+ udate = note[idx+7:idx+15] # get date string yyyymmdd
+ curr_date = datetime.now().strftime('%Y%m%d')
+ d1 = datetime.strptime(udate, "%Y%m%d")
+ d2 = datetime.strptime(curr_date, "%Y%m%d")
+ return (d2 - d1).days <= ndays
+
+
+def get_note_date(s):
+ ''' s has this format: NOTE: update:20170101 '''
+ if not 'update:' in s: # if people forgot to put an update date in Note:, then it is 20161201
+ return '20161201'
+ index = s.rfind('update:')
+ if len(s[index+7:]) < 8:
+ return '20161201'
+ result = s[index+7:index+15].strip()
+ if result <= '20170101' or len(result) < 8:
+ return '20161201'
+ return result
+
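+# e.g. get_note_date('NOTE: update:20170315') returns '20170315'; dates on or
+# before 20170101, or missing/malformed dates, fall back to '20161201'.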
+
+def is_recently_updated(note, chip_id, fname):
+ ''' fname keeps track of chip ids and their most recent update date '''
+ if not is_recent(note, 15): # if the update happened more than 15 days ago, ignore it; no further action will be taken for this chip id.
+ return False
+
+ note_date = get_note_date(note)
+
+ log_date = '20161201'
+ if os.path.exists(fname): # search for the update date of chip_id; if it has been incorporated, the log date should be no less than the note's update date
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ if line != '':
+ lst = line.split()
+ if len(lst) >= 2 and lst[0] == chip_id:
+ log_date = lst[1]
+ f.close()
+
+ return note_date > log_date
+
+def get_new_chip_ids(old_lst, para_c_dict, new_or_updated_chip_fname):
+ result = []
+ for k in para_c_dict:
+ note = para_c_dict[k]['NOTE'].lower()
+ if 'obsolete' in note:
+ continue
+ if (not 'obsolete' in note and not k in old_lst) or (k in old_lst and 'update:' in note and is_recently_updated(note, k, new_or_updated_chip_fname)): # First case: the ChIP-seq ID is not in the old list and not marked obsolete. Definitely add it. The second case is more subtle: the narrowPeak of an old ChIP-seq ID has recently been updated for whatever purpose. This update should have happened reasonably recently. Why is is_recently_updated important? Because we don't want to treat a ChIP-seq updated weeks ago as new. new_or_updated_chip_fname keeps track of newly updated ChIP-seq experiments and their update dates.
+ result.append(k)
+ return sorted(result)
+
+def edit_file(fname, line_starter, new_str):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ f = open(fname, 'w')
+ for line in lines:
+ if line.startswith(line_starter):
+ s = line_starter
+ s += new_str
+ f.write(s + '\n')
+ else:
+ f.write(line)
+ f.close()
+
+def number_rnaseq_id(tpm_file):
+ f = open(tpm_file)
+ first_line = f.readline() # only the header line is needed
+ f.close()
+ first_line = first_line.strip()
+ return len(first_line.split()) - 1
+
+def number_rnaseq_diff(para_file, tpm_file):
+ ''' count the number @ in para_file, and count the number of columns in tpm_file, return their difference '''
+ a = 0
+ f = open(para_file)
+ for line in f:
+ line = line.strip()
+ if line.startswith('@'):
+ a += 1
+ f.close()
+
+ b = number_rnaseq_id(tpm_file)
+
+ return a - b
+
+def get_key_value(s): # note: this local definition shadows get_key_value imported from param4net
+ lst = s.split('=')
+ k, v = lst[0], lst[1]
+ return (k.strip(), v.strip())
+
+def get_value(s, delimit): # note: duplicates the get_value defined earlier in this file
+ lst = s.split(delimit, 1)
+ return lst[1].strip()
+
+def validate_gene_file(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines: # check all lines
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) < 6:
+ print('Not enough fields in line: %s. Only %d are given. Each line must have gene_id, gene_name, chr, start, end, strand and an optional description. See prepare_gene_file.py in the documentation on how to prepare this file.' % (line, len(lst)))
+ sys.exit()
+
+def validate_parameter_for_buildcmatrix(fname):
+ # first the file must exist
+ if not os.path.exists(fname):
+ print('CANNOT FIND %s.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ location_count = 0
+ for line in lines:
+ line = line.strip()
+ if line.startswith('%%'):
+ k, v = get_key_value(line[2:])
+ d[k] = v
+ if k == 'GENE_FILE' or k == 'CHR_INFO':
+ if not os.path.exists(v):
+ print('%s does not exist.' % (v))
+ sys.exit()
+ if k == 'GENE_FILE':
+ validate_gene_file(v)
+ if k == 'DESTINATION':
+ if not os.path.isdir(v):
+ print('%s does not exist.' % (v))
+ sys.exit()
+ if k == 'TARGET_RANGE':
+ if int(v) <= 0:
+ print('Target range (%s) must be greater than 0.' % (v)) # v is a string, so format with %s
+ sys.exit()
+ if line.startswith('LOCATION:'):
+ v = get_value(line, ':')
+ location_count += 1
+ if not os.path.exists(v):
+ print('Location %s does not exist.' % (v))
+ sys.exit()
+
+ if not 'GENE_FILE' in d:
+ print('Must specify GENE_FILE.')
+ sys.exit()
+ if not 'DESTINATION' in d:
+ print('Must specify DESTINATION.')
+ sys.exit()
+ if not 'CHR_INFO' in d:
+ print('Must specify CHR_INFO.')
+ sys.exit()
+ if location_count == 0:
+ print('Must contain at least one ChIP-seq.')
+ sys.exit()
+
+
+def validate_parameter_for_buildrmatrix(fname):
+ # first the file must exist
+ if not os.path.exists(fname):
+ print('CANNOT FIND %s.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ location_count = 0
+ for line in lines:
+ line = line.strip()
+ if line.startswith('%%'):
+ k, v = get_key_value(line[2:])
+ d[k] = v
+ if k == 'GENE_LIST':
+ if not os.path.exists(v):
+ print('%s does not exist.' % (v))
+ sys.exit()
+ if line.startswith('LOCATION:'):
+ v = get_value(line, ':')
+ location_count += 1
+ if not os.path.exists(v):
+ print('Location %s does not exist.' % (v))
+ sys.exit()
+
+ if not 'GENE_LIST' in d:
+ print('Must specify GENE_LIST.')
+ sys.exit()
+ if location_count == 0:
+ print('Must contain at least one RNA-seq.')
+ sys.exit()
+
+def validate_parameter_for_net(fname):
+ # first the file must exist
+ if not os.path.exists(fname):
+ print('CANNOT FIND %s.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ location_count = 0
+ for line in lines:
+ line = line.strip()
+ if line.startswith('%%'):
+ k, v = get_key_value(line[2:])
+ d[k] = v
+ if k in ('GENE_LIST', 'GENE_ID_AND_GENE_NAME', 'BINDING_INFO', 'EXPRESSION_INFO'): # required input files
+ if not os.path.exists(v):
+ print('%s does not exist.' % (v))
+ sys.exit()
+ if k == 'BINDING_MATRIX':
+ if not os.path.exists(v):
+ print('%s does not exist.' % (v))
+ #print('Use python buildCmatrix.py parameter_for_buildCmatrix.txt > binding.txt to create binding.txt.')
+ if k == 'EXPRESSION_MATRIX':
+ if not os.path.exists(v):
+ print('%s does not exist.' % (v))
+ print('Use python buildRmatrix.py parameter_for_buildRmatrix.txt to create TPM.txt.')
+
+ if not 'GENE_LIST' in d:
+ print('Must specify GENE_LIST.')
+ sys.exit()
+ if not 'GENE_ID_AND_GENE_NAME' in d:
+ print('Must specify GENE_ID_AND_GENE_NAME.')
+ sys.exit()
+ if not 'BINDING_INFO' in d:
+ print('Must specify BINDING_INFO.')
+ sys.exit()
+ if not 'EXPRESSION_INFO' in d:
+ print('Must specify EXPRESSION_INFO.')
+ sys.exit()
+ if not 'BINDING_MATRIX' in d:
+ print('Must specify BINDING_MATRIX.')
+ print('Use python buildCmatrix.py parameter_for_buildCmatrix.txt > binding.txt to create binding.txt.')
+ if not 'EXPRESSION_MATRIX' in d:
+ print('Must specify EXPRESSION_MATRIX.')
+ print('Use python buildRmatrix.py parameter_for_buildRmatrix.txt to create TPM.txt.')
+
+
+def file_contains(fname, s):
+ if not os.path.exists(fname):
+ return False
+ found = False
+ f = open(fname)
+ for line in f:
+ if s in line:
+ found = True
+ break
+ f.close() # close the file even when s is found
+ return found
+
+def check_required_packages():
+ # mixtools, networkx, rjson, flask
+
+ # python libraries
+ try:
+ import numpy
+ except ImportError:
+ print('numpy not available. Install it via: pip install numpy')
+ sys.exit()
+
+ try:
+ import networkx
+ except ImportError:
+ print('networkx not available. Install it via: pip install networkx')
+ sys.exit()
+
+ try:
+ import flask
+ except ImportError:
+ print('flask not available. Install it via: pip install flask')
+ sys.exit()
+
+ # R libraries: probe each required package with a small Rscript
+ for pkg in ['mixtools', 'rjson', 'Rtsne']:
+ cmd = 'echo \"is.installed <- function(mypkg) is.element(mypkg, installed.packages()[,1]); is.installed(\'%s\')\" > ../Data/temp/check.r.library.R && Rscript ../Data/temp/check.r.library.R > ../Data/temp/check.r.library.result' % (pkg)
+ os.system(cmd)
+ if not file_contains('../Data/temp/check.r.library.result', 'TRUE'):
+ print('R package %s not available. Install it first.' % (pkg))
+ sys.exit()
+ os.remove('../Data/temp/check.r.library.result')
+
+## main
+## Shipped with this distribution: TPM.txt.gz, experiment.and.tissue.txt, rnaseq_info_database.json
+## BINDING_FILE='../Data/history/bind/binding.txt' is not given, as users should provide a BED file to create it.
+
+if not os.path.isdir(CODE_DIR): # make sure that CODE_DIR exists
+ print('ERROR: %s does not exist.' % (CODE_DIR))
+ sys.exit()
+os.chdir(CODE_DIR) # run this file at the Code directory
+if not os.path.isdir('../Data/C/Mapped'):
+ make_important_dirs() # make important directories (if non-existent) for holding all kinds of files; must be called after os.chdir(CODE_DIR)
+ write_log_file('Copy BED files to Data/C/Mapped. Update the LOCATION field in Data/parameter/parameter_for_buildCmatrix.txt. Run local_network.py again.' , LOG_FILE)
+ sys.exit()
+ok_webapp_dir(PARAMETER_FOR_NET) # make sure Webapp contains necessary files
+
+# BEFORE WE START, WE SHOULD CHECK THAT THE REQUIRED SOFTWARE IS INSTALLED.
+# For example, numpy, mixtools, networkx, rjson, flask (TBD)
+# make sure required packages are available
+check_required_packages()
+
+# check rnaseq_info_database.txt and rnaseq_info_database.json
+if not os.path.exists(RNA_SEQ_INFO_DATABASE):
+ write_log_file('NEED TO CREATE %s.' % (RNA_SEQ_INFO_DATABASE), LOG_FILE)
+ sys.exit()
+if not os.path.exists(RNA_SEQ_INFO_DATABASE_JSON):
+ write_log_file('NEED TO CREATE %s.' % (RNA_SEQ_INFO_DATABASE_JSON), LOG_FILE)
+ sys.exit()
+
+# make sure parameter files are present and valid (very rudimentary check but important)
+validate_parameter_for_buildcmatrix(PARAMETER_FOR_BUILDCMATRIX)
+#validate_parameter_for_buildrmatrix(PARAMETER_FOR_BUILDRMATRIX)
+validate_parameter_for_net(PARAMETER_FOR_NET)
+
+# remove binding file, if any
+if os.path.exists(BINDING_FILE):
+ os.remove(BINDING_FILE)
+
+# update edges.txt, a merged file from several sources, HISTORY_DIR and HISTORY_DIR2.
+edge_file_lst = [] # collect edge files.
+for fname in glob.glob(os.path.join(HISTORY_DIR, 'edges.txt.*')): # edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+for fname in glob.glob(os.path.join(HISTORY_DIR2, 'edges.txt.*')): # edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+
+# make sure all needed files are present, if not, make them if possible
+miss_lst = all_files_present(FILE_LIST_TO_CHECK) # check if any of them are missing
+if (miss_lst != [] and edge_file_lst == []) or FORCE_MAKE_EDGES == 'YES':
+ write_log_file('Cannot find these required files: %s. The program will prepare them.' % (', '.join(miss_lst)), LOG_FILE)
+
+ # initially, we only have three parameter files, but not binding.txt
+ important_miss_number = 0
+ if PARAMETER_FOR_BUILDCMATRIX in miss_lst:
+ print('Must prepare %s first' % (PARAMETER_FOR_BUILDCMATRIX))
+ important_miss_number += 1
+
+ if PARAMETER_FOR_BUILDRMATRIX in miss_lst:
+ print('Must prepare %s first' % (PARAMETER_FOR_BUILDRMATRIX))
+ important_miss_number += 1
+
+ if PARAMETER_FOR_NET in miss_lst:
+ print('Must prepare %s first' % (PARAMETER_FOR_NET))
+ important_miss_number += 1
+
+ if important_miss_number > 0:
+ sys.exit() # need to provide all the above three files; otherwise cannot proceed
+
+ target_tf_fname = '../Data/information/target_tf.txt'
+ if os.path.exists(target_tf_fname):
+ os.remove(target_tf_fname)
+
+ if BINDING_FILE in miss_lst:
+ write_log_file('Make initial binding.txt ...', LOG_FILE)
+ cmd = 'python get_binding.py %s' % (PARAMETER_FOR_BUILDCMATRIX)
+ os.system(cmd)
+ cmd = 'python buildCmatrix.py %s > %s' % (PARAMETER_FOR_BUILDCMATRIX, BINDING_FILE)
+ os.system(cmd)
+ #print('IMPORTANT: Now check BINDING_MATRIX in %s is set to %s and rerun local_network.py.' % (PARAMETER_FOR_NET, BINDING_FILE))
+ #sys.exit()
+
+ if TPM_FILE in miss_lst:
+ if not os.path.exists(TPM_FILE + '.gz'):
+ write_log_file('Cannot find %s. Try %s.' % (TPM_FILE + '.gz', TPM_FILE), LOG_FILE)
+ if not os.path.exists(TPM_FILE):
+ sys.exit()
+ else:
+ write_log_file('Unzip initial TPM.txt', LOG_FILE)
+ cmd = 'gunzip %s' % (TPM_FILE + '.gz')
+ os.system(cmd)
+
+ #print('IMPORTANT:Now check EXPRESSION_MATRIX in %s is set %s and rerun local_network.py.' % (PARAMETER_FOR_NET, TPM_FILE))
+ #sys.exit()
+
+ miss_lst2 = all_files_present(FILE_LIST_TO_CHECK) # check files again
+ if (len(miss_lst2) == 1 and miss_lst2[0] == EDGE_FILE) or (len(miss_lst2) == 0 and os.path.getmtime(EDGE_FILE) < os.path.getmtime(BINDING_FILE)): # all other files are ready except edges.txt, make one.
+ # should assert that the files needed by the following scripts are present
+ # a big correlation matrix may eat up the computer's memory, so the code may need modifying, or test the user's memory first.
+ print('Make some edges, wait ... Change MAX_PROCESS in Code/create_edges4.py to change the number of processes. Default=10.')
+ cmd = 'nohup python create_edges4.py %s &' % (PARAMETER_FOR_NET) # this will create target_tf.txt needed by the following scripts
+ #os.system(cmd)
+ time.sleep(10)
+
+ # wait or make target_tf.txt
+ wait_sec = 0
+ WAIT_SECONDS = 120
+ while not os.path.exists(target_tf_fname):
+ time.sleep(WAIT_SECONDS)
+ wait_sec += WAIT_SECONDS
+ if wait_sec > WAIT_SECONDS * 5:
+ write_log_file('Make Data/information/target_tf.txt', LOG_FILE)
+ cmd = 'python make_target_tf.py %s > %s' % (PARAMETER_FOR_NET , target_tf_fname) # make target_tf.txt CHANGE better to make a temporary copy for this program
+ os.system(cmd)
+ break
+
+ time.sleep(5)
+ write_log_file('Create group-specific (fixed) edges.txt using new TPM.txt (size=%d).' % (number_rnaseq_id(TPM_FILE)), LOG_FILE)
+ cmd = 'Rscript correlation_per_group_fixed_number.R &'
+ os.system(cmd)
+
+ time.sleep(5)
+ write_log_file('Create SIMPLE edges.txt using new TPM.txt (size=%d). SIMPLE means using all RNA-seq samples.' % (number_rnaseq_id(TPM_FILE)), LOG_FILE)
+ cmd = 'python create_edges0.py %s &' % (PARAMETER_FOR_NET) # use all samples in TPM.txt results will be written to history/edges.txt.simple.correlation.all.conditions
+ os.system(cmd)
+
+ time.sleep(5)
+ write_log_file('Create tissue-specific edges.txt using new TPM.txt (size=%d).' % (number_rnaseq_id(TPM_FILE)), LOG_FILE)
+ cmd = 'python create_edges0B.py %s &' % (PARAMETER_FOR_NET) # call correlation_per_tissue.R
+ os.system(cmd)
+
+ time.sleep(5)
+ write_log_file('Create group-specific edges.txt using new TPM.txt (size=%d).' % (number_rnaseq_id(TPM_FILE)), LOG_FILE)
+ cmd = 'Rscript correlation_per_group.R &'
+ os.system(cmd)
+
+ time.sleep(5)
+ write_log_file('Create wedge-shape edges.txt using new TPM.txt (size=%d).' % (number_rnaseq_id(TPM_FILE)), LOG_FILE)
+ cmd = 'Rscript wedge.R &'
+ os.system(cmd)
+
+
+# make json (sliced TPM.txt) and json2 (sliced binding.txt) if they don't exist
+if os.path.exists(TPM_FILE):
+ if not os.path.isdir('../Data/history/expr/json'):
+ write_log_file('Make directory ../Data/history/expr/json', LOG_FILE)
+ cmd = 'python slice_TPM_to_JSON.py %s' % (PARAMETER_FOR_NET)
+ os.system(cmd)
+
+if os.path.exists(BINDING_FILE):
+ if not os.path.isdir('../Data/history/bind/json2') or os.path.getmtime('../Data/history/bind/json2') < os.path.getmtime(BINDING_FILE):
+ write_log_file('Make directory ../Data/history/bind/json2', LOG_FILE)
+ cmd = 'python slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET)
+ os.system(cmd)
+
+# if the file timestamp does not exist, create one
+if not os.path.exists(FILE_TIMESTAMP):
+ record_file_time(FILE_LIST_TO_CHECK)
+
+# get update time of must-have files
+timestamp_dict = read_file_timestamp(FILE_TIMESTAMP)
+
+# update edges.txt, a merged file from several sources, HISTORY_DIR and HISTORY_DIR2.
+edge_file_lst = [] # collect edge files.
+for fname in glob.glob(os.path.join(HISTORY_DIR, 'edges.txt.*')): # edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+for fname in glob.glob(os.path.join(HISTORY_DIR2, 'edges.txt.*')): # edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+if edge_file_lst == []:
+ write_log_file('No files to merge. Run this script again a few hours later.', LOG_FILE)
+ sys.exit()
+
+
+
+# merge edge files
+write_log_file('Merge edge files ...', LOG_FILE)
+merge_edges(edge_file_lst, EDGE_FILE, SAMPLE_SIZE_FILE) # merge individual files to EDGE_FILE, so EDGE_FILE is always updated. A new field, metric, is appended.
+
+# delete edges if their supporting ChIP experiments become obsolete
+#write_log_file('Delete edges ...', LOG_FILE)
+para_c_dict = make_data_dict(PARAMETER_FOR_BUILDCMATRIX)
+bad_chip_ids = get_bad_chip_ids(para_c_dict) # e.g., those marked with obsolete in parameter_for_buildCmatrix.txt
+nonexistent_chip_ids = get_nonexistent_chip_ids(para_c_dict, EDGE_FILE) # in edges.txt but not in parameter_for_buildCmatrix.txt, either because it is removed or commented out
+bad_chip_ids.extend(nonexistent_chip_ids)
+bad_chip_ids = list(set(bad_chip_ids)) # unique elements
+update_date_chip_ids = get_update_date_chip_ids(para_c_dict) # a dictionary, get the update date of each ChIP experiment with update in Note field.
+rm_chip_ids_from_edge_file(EDGE_FILE, bad_chip_ids, update_date_chip_ids) # edges.txt is updated, with bad chip ids removed.
+nlines = num_line(EDGE_FILE)
+write_log_file('Number of total edges %d.' % (nlines), LOG_FILE)
+if nlines == 0:
+ write_log_file('Empty edges.txt.', LOG_FILE)
+ sys.exit()
+
+
+# get a list of updated files
+# updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
+updated_file_list = ['edges.txt']
+# check edges.txt, if updated, re-make static html summary
+if 'edges.txt' in updated_file_list: # if edges.txt is updated
+ write_log_file('Rebuild html pages ...', LOG_FILE)
+ cmd = 'python html_network.py -f %s -r %s -c %s -n %s' % (EDGE_FILE, PARAMETER_FOR_BUILDRMATRIX, PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_NET) # produce edges and summary.html
+ os.system(cmd)
+
+ # update webapp folder
+ cmd = 'cp %s ../Webapp/static/edges/' % (EDGE_FILE)
+ os.system(cmd)
+
+ # kill and restart start_webapp.py
+ # write_log_file('Terminate start_webapp ...', LOG_FILE)
+ # cmd = 'kill $(ps aux | grep \'[p]ython start_webapp\' | awk \'{print $2}\')'
+ # os.system(cmd)
+ # write_log_file('Restart start_webapp ...', LOG_FILE)
+ # os.chdir('../Webapp')
+ # os.system('python start_webapp.py &')
+ # os.chdir(CODE_DIR) # return to CODE_DIR
+
+
+# update time stamp file
+record_file_time(FILE_LIST_TO_CHECK)
+
+# remove .R files in Data/temp; find -mtime +1 deletes files modified more than one day ago
+cmd = 'find %s -mtime +1 -name \"*.R\" -delete' % (TEMP_DIR)
+os.system(cmd)
+
+write_log_file('Network update done at %s.\n\n' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S')), LOG_FILE)
+
+print('Creating edges... Wait about one hour for the edges to be ready.\nUncomment app.run(debug=True) in Webapp/start_webapp.py and comment out the previous app.run().\nTo display the network, cd Webapp && python start_webapp.py. Enter http://127.0.0.1:5000 in the address bar of your browser.')
diff --git a/Code/make_graphviz_file3B.py b/Code/make_graphviz_file3B.py
new file mode 100644
index 0000000..3ccd870
--- /dev/null
+++ b/Code/make_graphviz_file3B.py
@@ -0,0 +1,236 @@
+# Usage: python make_graphviz_file3B.py AT1G19850
+#
+# Make plot: python make_graphviz_file3B.py AT1G65480 && dot -Tpdf -o result.pdf result.gv
+# python make_graphviz_file3B.py AT1G65480 && neato -Goverlap=false -Tpdf -o result.pdf result.gv
+# (The script writes result.gv itself, so chain the commands with && rather than a pipe.)
+#
+# The plot is saved in result.pdf, and each little grey box contains a tissue name.
+# Change 'pdf' to 'svg' to get a vector image. The tissue name is in a yellow box. A double circle represents a gene that is both a regulator and a regulatee.
+# An egg shape represents a regulatee; an oval represents a regulator. A yellow arrow means the gene regulates the other; a red arrow means it is being regulated.
+#
+# Input file is specified in variable edge_file (result.skeleton.txt). This file is generated by test_network4.py.
+# The tissue name is contained in the lines starting with '##', e.g., '##TF skeleton size in shoot: 15735.' contains 'shoot'.
+# Edit the variable tissue_colour_dict and the list tissue_lst in function get_tissue_from_fname() to match the tissue names.
+#
+#
+# Purpose: Generate result.gv for Graphviz software dot. The single
+# parameter AT1G19850 is a TF. result.gv contains all edges from/to the TF
+# in each tissue. A tissue is a subgraph. We can
+# convert result.gv to a figure using 'dot -Tpdf -o result.pdf
+# result.gv'.
+#
+# Created 6 July 2017, hui, slcu
+# Last modified 11 July 2017, hui, slcu
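+#
+# Input sketch (values made up for illustration; fields are tab-separated).
+# A '##' tissue line must precede the edges it labels, because get_edge()
+# assigns each edge to the most recently seen tissue:
+#
+# ##TF skeleton size in shoot: 15735.
+# AT1G19850_x<TAB>AT1G65480_x<TAB>0.85
+#
+# Column 1 is the target, column 2 the TF, column 3 the correlation strength;
+# anything after the first '_' in a gene field is stripped.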
+
+import random
+import numpy as np
+import sys
+from geneid2name import make_gene_name_AGI_map_dict, get_gene_name
+
+NUM_TARGETS_CUTOFF = 5
+
+
+def get_tissue_from_fname(fname):
+ tissue_lst = [
+ 'seedling',
+ 'meristem',
+ 'flower',
+ 'aerial',
+ 'shoot',
+ 'seed',
+ 'leaf',
+ 'root',
+ 'stem']
+ for x in tissue_lst:
+ if x in fname:
+ return x
+ return 'unknown'
+
+
+def get_edge(fname):
+ ''' Return d = {'flower':{'tf':[target1,target2, ...]}, 'seed':{}} '''
+ d = {}
+ d2 = {} # the actual correlation coefficient, absolute value
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ if not line.startswith('#'):
+ lst = line.split('\t')
+ target = (lst[0].split('_'))[0]
+ tf = (lst[1].split('_'))[0]
+ if not tf in d[tissue]:
+ d[tissue][tf] = [target]
+ else:
+ d[tissue][tf].append(target)
+
+ strength = abs(float(lst[2]))
+ if not tf in d2[tissue]:
+ d2[tissue][tf] = {target:strength}
+ else:
+ d2[tissue][tf][target] = strength
+
+ else:
+ tissue = get_tissue_from_fname(line)
+ d[tissue] = {}
+ d2[tissue] = {}
+ f.close()
+ return d, d2
+
+
+def in_same_tissue(source, target, node_dict):
+ return node_dict[source] == node_dict[target]
+
+def make_label(a, b):
+ if b == '.':
+ return a
+ else:
+ lst = b.split(';')
+ return a + ' ' + lst[0]
+
+
+def has_predecessor(tf, d):
+ for k in d:
+ if tf in d[k] and k != tf:
+ return True
+ return False
+
+def get_num_successors(tf, d):
+ if not tf in d:
+ return 0
+ return len(d[tf])
+
+def get_shape(tf, d):
+ ''' d = {'tf':[target1, target2]} '''
+ p = has_predecessor(tf, d)
+ s = get_num_successors(tf, d)
+ if s > 0 and p: # tf is both a regulator and a regulatee
+ return 'doublecircle'
+ if s > 0 and not p: # a regulator
+ return 'oval' # regulator
+ if p and s == 0: # a regulatee
+ return 'egg' # regulatee
+ return 'point'
+
+def get_color(tf, edge_dict, tissue):
+ #colours = ['darkolivegreen1', 'darkolivegreen2', 'darkolivegreen3', 'darkolivegreen4', 'gold', 'gold1', 'gold2', 'gold3', 'gold4', 'darkgoldenrod', 'darkgoldenrod4']
+ #colours = ['snow', 'snow1', 'snow2', 'snow3', 'snow4', 'gold', 'gold1', 'gold2', 'gold3', 'gold4']
+ colours = ['springgreen', 'springgreen1', 'springgreen2', 'springgreen3', 'springgreen4', 'gold', 'gold1', 'gold2', 'gold3', 'gold4'] # darker colours means more important for that tissue
+ d = {}
+ total = 0
+ for k in edge_dict:
+ n = get_num_successors(tf, edge_dict[k])
+ d[k] = n
+ total += n
+ #print('%s %d' % (k, n))
+ if total == 0: # no successor
+ return 'azure'
+ return colours[min(int(10 * 1.0 * d[tissue] / total), len(colours)-1)]
+
+def write_graphviz_file(fname, edge_dict, colour_dict, agi2name_dict, query_tf):
+
+ f = open(fname, 'w')
+
+ graph_dict = {} # record for each tissue the graph
+ last_node = {} # record the last node added in each subgraph
+ for k in edge_dict:
+ graph_dict[k] = {'head':'', 'nodes':[], 'edges':[]}
+
+ for k in edge_dict: # k is tissue
+ node_added_dict = {} # make sure we don't add the same node twice
+ edge_added_dict = {} # make sure an edge is not added twice
+ tissue_node = '%s_node' % (k)
+ graph_dict[k]['head'] = ''
+ d = edge_dict[k] # d = {'tf1':[target1, target2, ...]}
+ tf_lst = d.keys()
+ for tf in tf_lst:
+ node_tf = tf + '_' + k
+ if tf == query_tf:
+ ll = make_label(tf, get_gene_name(tf, agi2name_dict))
+ shape = get_shape(tf, d)
+ color = get_color(tf, edge_dict, k) # shape's boundary colour
+ if not tf in node_added_dict:
+ graph_dict[k]['nodes'].append(' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_tf, ll, color, colour_dict[k], shape))
+ node_added_dict[tf] = 'YES'
+ for target in d[tf]:
+ ll = make_label(target, get_gene_name(target, agi2name_dict))
+ node_target = target + '_' + k
+ shape = get_shape(target, d)
+ color = get_color(target, edge_dict, k)
+ if not target in node_added_dict:
+ graph_dict[k]['nodes'].append(' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_target, ll, color, colour_dict[k], shape))
+ node_added_dict[target] = 'YES'
+ last_node[k] = node_target
+
+ edge_key = tf + target
+ if not edge_key in edge_added_dict:
+ graph_dict[k]['edges'].append(' \"%s\" -> \"%s\" [color=%s];\n' % (node_tf, node_target, 'gold')) # out-going edge
+ edge_added_dict[edge_key] = 'YES'
+
+ else: # check if tf is a target of another tf
+ for target in d[tf]:
+ if target == query_tf:
+ ll = make_label(tf, get_gene_name(tf, agi2name_dict))
+ node_tf = tf + '_' + k
+ shape = get_shape(tf, d)
+ color = get_color(tf, edge_dict, k)
+ node_target = target + '_' + k
+ if not tf in node_added_dict:
+ graph_dict[k]['nodes'].append(' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_tf, ll, color, colour_dict[k], shape))
+ node_added_dict[tf] = 'YES'
+ last_node[k] = node_target
+ edge_key = tf + target
+ if not edge_key in edge_added_dict:
+ graph_dict[k]['edges'].append(' \"%s\" -> \"%s\" [color=%s];\n' % (node_tf, node_target, 'red'))
+
+ if graph_dict[k]['nodes'] != []:
+ node_label = k + '_label_node'
+ graph_dict[k]['nodes'].append(' \"%s\" [label=\"%s\", shape=box, color=yellow, style=filled, height=0.8, width=1.6];\n' % (node_label, k.upper()))
+
+ # write graphviz file
+ s0 = 'digraph G {\n graph[splines=true, ranksep=2, fontname=Arial];\n node[fontname=Arial];\n'
+ s0 += ' {rank=sink; ' # move label node to bottom
+ for k in last_node:
+ if graph_dict[k]['nodes'] != []:
+ node_label = k + '_label_node'
+ s0 += '%s;' % (node_label)
+ s0 += '}\n'
+ for k in graph_dict:
+ s0 += graph_dict[k]['head']
+ node_label = k + '_label_node'
+ for x in graph_dict[k]['nodes']:
+ s0 += x
+ for x in graph_dict[k]['edges']:
+ s0 += x
+ if k in last_node:
+ s0 += ' \"%s\" -> \"%s\" [arrowhead=none, style=invis];\n' % (last_node[k], node_label)
+
+ s0 += '}\n'
+ f.write(s0)
+ f.close()
+
+
+# main
+
+GENE_ID_TO_GENE_NAME = '/home/hui/network/v03/Data/information/AGI-to-gene-names_v2.txt'
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)
+
+edge_file = 'result.skeleton.txt' # prepared by test_network4.py
+
+tissue_colour_dict = {
+ 'seedling':'greenyellow',
+ 'meristem':'skyblue4',
+ 'flower':'lightpink',
+ 'aerial':'cyan',
+ 'shoot':'forestgreen',
+ 'seed':'black',
+ 'leaf':'green',
+ 'root':'gold',
+ 'stem':'orange4'}
+
+if len(sys.argv) < 2:
+ print('Need to specify a gene ID, e.g., AT1G19850.')
+ sys.exit()
+else:
+ query_tf = sys.argv[1]
+
+edge_dict, edge_dict_r = get_edge(edge_file)
+write_graphviz_file('result.gv', edge_dict, tissue_colour_dict, agi2name_dict, query_tf)
diff --git a/Code/make_graphviz_file3C.py b/Code/make_graphviz_file3C.py
new file mode 100644
index 0000000..125c462
--- /dev/null
+++ b/Code/make_graphviz_file3C.py
@@ -0,0 +1,273 @@
+# Usage: python make_graphviz_file3C.py AT1G19850
+#
+# Make plot: python make_graphviz_file3C.py AT1G65480 && sfdp -Goverlap=false -Tpdf -o result.flower.pdf result.flower.gv
+# unflatten -f -l 3 result.flower.gv | dot -Tsvg -o result.flower.svg
+#
+# Purpose: Generate one result.<tissue>.gv file per tissue for the Graphviz
+# software dot. The single parameter AT1G19850 is a TF.
+# The neighbours of the query TF's neighbours are also shown.
+#
+# Created 10 July 2017, hui, slcu
+
+import random
+import numpy as np
+import sys
+from geneid2name import make_gene_name_AGI_map_dict, get_gene_name
+
+PERCENT = 1
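+# PERCENT is the fraction of candidate nodes/edges to keep: an item is drawn
+# only if random.uniform(0, 1) <= PERCENT, so 1 keeps everything and, say,
+# 0.3 would randomly subsample about 30% to unclutter a dense graph.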
+NUM_TARGETS_CUTOFF = 5
+
+def get_tf_tissue(fname):
+ d = {}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ head = lines[0].strip()
+ head_lst = head.split('\t')
+ head_lst = head_lst[1:] # remove TF
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split('\t')
+ lst2 = lst[1:]
+ lst3 = [int(x) for x in lst2]
+ lst4 = np.array(lst3)
+ median_val = np.median(lst4)
+ tissue = []
+ for i in range(len(lst2)):
+ if int(lst2[i]) >= max(median_val, 1) or int(lst2[i]) >= NUM_TARGETS_CUTOFF:
+ tissue.append(head_lst[i])
+ tf = (lst[0].split())[0]
+ d[tf] = tissue # tf is assigned a list of tissues; tissues with node degree no less than the median (or >= NUM_TARGETS_CUTOFF) are selected.
+ return d
+
+
+def get_tissue_from_fname(fname):
+ tissue_lst = [
+ 'seedling',
+ 'meristem',
+ 'flower',
+ 'aerial',
+ 'shoot',
+ 'seed',
+ 'leaf',
+ 'root',
+ 'stem']
+ for x in tissue_lst:
+ if x in fname:
+ return x
+ return 'unknown'
+
+
+def get_edge(fname):
+ ''' Return d = {'flower':{'tf':[target1,target2, ...]}, 'seed':{}} '''
+ d = {}
+ d2 = {} # the actual correlation coefficient, absolute value
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ if not line.startswith('#'):
+ lst = line.split('\t')
+ target = (lst[0].split('_'))[0]
+ tf = (lst[1].split('_'))[0]
+ if not tf in d[tissue]:
+ d[tissue][tf] = [target]
+ else:
+ d[tissue][tf].append(target)
+
+ strength = abs(float(lst[2]))
+ if not tf in d2[tissue]:
+ d2[tissue][tf] = {target:strength}
+ else:
+ d2[tissue][tf][target] = strength
+
+ else:
+ tissue = get_tissue_from_fname(line)
+ d[tissue] = {}
+ d2[tissue] = {}
+ f.close()
+ return d, d2
+
+
+def in_same_tissue(source, target, node_dict):
+ return node_dict[source] == node_dict[target]
+
+
+def make_label(a, b):
+ if b == '.':
+ return a
+ else:
+ lst = b.split(';')
+ return a + '_' + lst[0]
+
+
+def has_predecessor(tf, d):
+ for k in d:
+ if tf in d[k] and k != tf:
+ return True
+ return False
+
+def get_num_successors(tf, d):
+ if not tf in d:
+ return 0
+ return len(d[tf])
+
+def get_shape(tf, d):
+ ''' d = {'tf':[target1, target2]} '''
+ p = has_predecessor(tf, d)
+ s = get_num_successors(tf, d)
+ if s > 0 and p:
+ return 'doublecircle'
+ if s > 0:
+ return 'ellipse' # regulator
+ if p:
+ return 'egg' # regulatee
+ return 'point' # neither a regulator nor a regulatee; avoids returning None to Graphviz
+
+
+def get_color(tf, edge_dict, tissue):
+ #colours = ['darkolivegreen1', 'darkolivegreen2', 'darkolivegreen3', 'darkolivegreen4', 'gold', 'gold1', 'gold2', 'gold3', 'gold4', 'darkgoldenrod', 'darkgoldenrod4']
+ #colours = ['snow', 'snow1', 'snow2', 'snow3', 'snow4', 'gold', 'gold1', 'gold2', 'gold3', 'gold4']
+ colours = ['springgreen', 'springgreen1', 'springgreen2', 'springgreen3', 'springgreen4', 'gold', 'gold1', 'gold2', 'gold3', 'gold4']
+ d = {}
+ total = 0
+ for k in edge_dict:
+ n = get_num_successors(tf, edge_dict[k])
+ d[k] = n
+ total += n
+ #print('%s %d' % (k, n))
+ if total == 0:
+ return 'azure'
+ return colours[min(int(10 * 1.0 * d[tissue] / total), len(colours)-1)]
+
+
+def make_more_string(n, tissue, edge_dict, colour_dict, agi2name_dict, query_tf):
+ result = ''
+ d = edge_dict[tissue]
+ if n in d:
+ for target in d[n]:
+ ll = make_label(target, get_gene_name(target, agi2name_dict))
+ shape = get_shape(target, d)
+ color = get_color(target, edge_dict, tissue)
+ node_target = target + '_' + tissue + '.2'
+ if target != query_tf:
+ result += ' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_target, ll, color, colour_dict[tissue], shape)
+
+ node_n = n + '_' + tissue
+ if random.uniform(0, 1) <= PERCENT:
+ result += ' \"%s\" -> \"%s\" [color=%s];\n' % (node_n, node_target, 'gold')
+
+ for tf in d:
+ if tf != query_tf:
+ if n in d[tf]: # n is successor
+ ll = make_label(tf, get_gene_name(tf, agi2name_dict))
+ shape = get_shape(tf, d)
+ color = get_color(tf, edge_dict, tissue)
+ node_tf = tf + '_' + tissue + '.2'
+ result += ' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_tf, ll, color, colour_dict[tissue], shape)
+
+ node_n = n + '_' + tissue
+ if random.uniform(0, 1) <= PERCENT:
+ result += ' \"%s\" -> \"%s\" [color=%s];\n' % (node_tf, node_n, 'red')
+
+ return result
+
+
+def write_graphviz_file(fname, edge_dict, colour_dict, agi2name_dict, query_tf):
+
+ f = open(fname, 'w')
+
+ graph_dict = {}
+ more = {}
+ for k in edge_dict:
+ graph_dict[k] = {'head':'', 'nodes':[], 'edges':[]}
+
+ for k in edge_dict: # k is tissue
+ neighbours = []
+ node_added_dict = {}
+ tissue_node = '%s_node' % (k)
+ graph_dict[k]['head'] = ''
+ d = edge_dict[k]
+ tf_lst = d.keys()
+ for tf in tf_lst:
+ node_tf = tf + '_' + k
+ if tf == query_tf:
+ ll = make_label(tf, get_gene_name(tf, agi2name_dict))
+ shape = get_shape(tf, d)
+ color = get_color(tf, edge_dict, k)
+ if not tf in node_added_dict:
+ graph_dict[k]['nodes'].append(' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_tf, 'Query gene: '+ll, 'DeepSkyBlue', colour_dict[k], shape))
+ node_added_dict[tf] = 'YES'
+ for target in d[tf]:
+ ll = make_label(target, get_gene_name(target, agi2name_dict))
+ node_target = target + '_' + k
+ shape = get_shape(target, d)
+ color = get_color(target, edge_dict, k)
+ if random.uniform(0, 1) <= PERCENT:
+ if not target in node_added_dict:
+ neighbours.append(target)
+ graph_dict[k]['nodes'].append(' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_target, ll, color, colour_dict[k], shape))
+ node_added_dict[target] = 'YES'
+ graph_dict[k]['edges'].append(' \"%s\" -> \"%s\" [color=%s];\n' % (node_tf, node_target, 'gold'))
+ else: # check if tf is a target of another tf
+ for target in d[tf]:
+ if target == query_tf:
+ ll = make_label(tf, get_gene_name(tf, agi2name_dict))
+ node_tf = tf + '_' + k
+ shape = get_shape(tf, d)
+ color = get_color(tf, edge_dict, k)
+ node_target = target + '_' + k
+ if random.uniform(0, 1) <= PERCENT:
+ if not tf in node_added_dict:
+ neighbours.append(tf)
+ graph_dict[k]['nodes'].append(' \"%s\" [label=\"%s\", fillcolor=%s, color=%s, shape=%s, style=filled];\n' % (node_tf, ll, color, colour_dict[k], shape))
+ node_added_dict[tf] = 'YES'
+ graph_dict[k]['edges'].append(' \"%s\" -> \"%s\" [color=%s];\n' % (node_tf, node_target, 'red'))
+ neighbours = list(set(neighbours))
+ more[k] = ''
+ for n in neighbours:
+ more[k] += make_more_string(n, k, edge_dict, colour_dict, agi2name_dict, query_tf)
+
+
+ for k in graph_dict:
+ if graph_dict[k]['nodes'] != []:
+ f = open(fname + '.' + k + '.gv', 'w')
+ s0 = 'digraph G {\n graph[splines=true, ranksep=3, fontname=Arial];\n node[fontname=Arial];\n'
+ s0 += graph_dict[k]['head']
+ for x in graph_dict[k]['nodes']:
+ s0 += x
+ for x in graph_dict[k]['edges']:
+ s0 += x
+ if k in more:
+ s0 += more[k]
+ s0 += '}\n'
+ f.write(s0)
+ f.close()
+
+# main
+
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)
+
+edge_file = 'result.skeleton.txt' # prepared by test_network4.py
+node_file = 'result.out.txt' # prepared by test_network4.py
+
+tissue_colour_dict = {
+ 'seedling':'greenyellow',
+ 'meristem':'skyblue4',
+ 'flower':'lightpink',
+ 'aerial':'cyan',
+ 'shoot':'forestgreen',
+ 'seed':'black',
+ 'leaf':'green',
+ 'root':'gold',
+ 'stem':'orange4'}
+
+
+
+if len(sys.argv) < 2:
+ sys.exit()
+else:
+ query_tf = sys.argv[1]
+
+#tf_tissue_dict = get_tf_tissue(node_file )
+edge_dict, edge_dict_r = get_edge(edge_file)
+write_graphviz_file('result', edge_dict, tissue_colour_dict, agi2name_dict, query_tf)
diff --git a/Code/make_parameter_bw.py b/Code/make_parameter_bw.py
new file mode 100644
index 0000000..bfb7074
--- /dev/null
+++ b/Code/make_parameter_bw.py
@@ -0,0 +1,127 @@
+# Usage: python make_parameter_bw.py
+# Edit the variable BW_PATHS, which is a file containing (almost all) paths to in-house bw files. Edit PARENT_DIR, which
+# will be used to make full path to the bw files. Amend name_map_dict.
+# Purpose
+# -------
+# Make a parameter file for buildCmatrix.py. For example:
+#
+# @C0001100007141
+# PROTEIN_ID:
+# PROTEIN_NAME:HTR13
+# DATA_NAME:2_71C_10_IP-HTR13-17c
+# DATA_FORMAT:bw
+# DESCRIPTION:in house ChIP data
+# LOCATION:/media/pw_synology3/PW_HiSeq_data/ChIP-seq/Mapped_data/2_71C/20150603_H3_2nd_rep_mapped_bw/ChIP-seq_h31_h33_col_rep1_20150607_max_fragment_500_bw/2_71C_10_IP-HTR13-17c_raw_trimmo_paired_truseq3-PE-2_2_10_5_1_bowtie2_TAIR10_ensembl_nomixed_sorted_rmdup_picard_genomenorm.bw
+# NOTE:
+#
+# 29 NOV 2016, hui, home
+
+import sys, glob, os, operator
+from geneid2name import make_gene_name_AGI_map_dict
+
+BW_PATHS = 'bwfiles_unique.txt' # Ideally, the input file should be sorted by library number, then sample number.
+PARENT_DIR = '/media/pw_synology3/PW_HiSeq_data/ChIP-seq/Mapped_data'
+
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
+
+def get_library_id(s):
+ index = s.find('C') # in-house chip library id ends with a letter C
+ if index > 0:
+ return s[:index]
+ else:
+ return s
+
+def get_bw_file_name(lst):
+ for x in lst:
+ if '.bw' in x:
+ return x
+ return ''
+
+def get_sample_name(s):
+ index = s.find('_raw') # a bw file name contains _raw, get the part before _raw if _raw exists
+ if index > 0:
+ return s[:index]
+ else:
+ return s
+
+def get_sample_number(s):
+ index = s.find('_S') # the sample number is preceded by _S
+ if index > 0:
+ if s[index+2].isdigit() and s[index+3].isdigit(): # two-digit sample number
+ return s[index+2:index+4]
+ else: # one-digit sample number
+ return '0' + s[index+2]
+ return '25' # fallback sample number used when no _S pattern is found
+
+def convert_name(s, d):
+ ''' convert s to gene id '''
+ if s.upper() in d:
+ return d[s.upper()]
+ else:
+ return ' '
+
+def get_gene_id_and_name(s, d):
+ ''' Return (protein name, gene id) given a sample file name. '''
+ for k in sorted(d.keys(),reverse=True):
+ if k.isdigit() and k.lower() in s.lower(): # k is a number, e.g., 833
+ return (d[k], convert_name(d[k], agi2name_dict))
+ if k.lower() in s.lower():
+ return (k, convert_name(k, agi2name_dict))
+ return ('name_unknown', 'id_unknown')
+
+###
+
+# key is the name of a sample; value is either empty if the key is a protein name, or the protein name if the key is a database number, e.g., 833.
+# If the key is a protein name, the program tries to find its gene id (AT...). If the key is a database number, the program gets its protein
+# name and then tries to find its gene id. If not found, PROTEIN_ID or PROTEIN_NAME will be assigned *_unknown, so search for _unknown to
+# manually annotate PROTEIN_ID or PROTEIN_NAME.
+name_map_dict = {'HTA11':'', 'H3':'', 'HTR13':'', 'HSP70':'', 'KIN10':'', 'EPM1':'', 'ELF3':'', 'phyB':'', '833':'KIN10', 'hos':'', 'HOS':'', 'HOS1':'', 'LUX':'', 'YHB':'', 'HSF1':'', '3129':'HSP90', '838':'phyB', '1506':'HSF1', 'SUMO':'', 'TIR1':'', 'PIF4':'', 'PIF':'', 'H2A':'', 'H2AZ':'', 'MNase':'', '544':'ELF4', '834':'PIF4', '745':'ELF3', 'EC1_S1':'ELF3', 'EC2_S2':'ELF3', '1166':'LUX', '1167':'LUX', '3239':'REV', '1281':'MPK6', '1278':'SEP3', '1279':'FD', '1280':'FD-like', '1283':'HSP70', '1284':'DELLA', '1762':'FT', 'LFY':'', 'TFL1':''} # duplicate 'EC1_S1' key removed; in a dict literal the later value ('ELF3') wins anyway
+
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)
+f = open(BW_PATHS)
+
+d_cid = {}
+for line in f:
+ line = line.strip()
+ lst = line.split('/')
+ bwfile = get_bw_file_name(lst)
+ sample_name = get_sample_name(bwfile)
+ sample_no = get_sample_number(bwfile)
+ library_id = get_library_id(lst[0])
+ name, gid = get_gene_id_and_name(sample_name, name_map_dict)
+ path = os.path.join(PARENT_DIR, line)
+
+ if '2_' in library_id: # some in-house library id starts with 2_, indicating that it is a replicate.
+ index = library_id.find('_')
+ library_id = library_id[index+1:]
+ cid = '@C00011000%s%s' % (library_id.zfill(3), sample_no) # the highest digit of the last 9 digits is 1 to indicate that it is a replicate. library number, 3 digits. sample number, 2 digits.
+ else:
+ cid = '@C00010000%s%s' % (library_id.zfill(3), sample_no)
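+ # Illustration of the id layout (using the header example): library id
+ # '2_71C' reduces to '071' and sets the replicate flag (the 1 after C0001),
+ # so with sample number '41' the id becomes '@C0001100007141'.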
+
+ # handle duplicate library id
+ if not cid in d_cid:
+ d_cid[cid] = 1
+ else: # produce a new library id
+ d_cid[cid] += 1
+ count = 26
+ head = cid[-2:] # last two digits
+ while True:
+ new_cid = cid[:-2] + '%02d' % (count)
+ if count >= 100:
+ print('Error: count must be less than 100.')
+ sys.exit()
+ count += 1
+ if not new_cid in d_cid:
+ break
+ cid = new_cid
+ d_cid[cid] = 1
+
+ # print contents for the parameter file
+ print(cid)
+ print('PROTEIN_ID:%s' % (gid))
+ print('PROTEIN_NAME:%s' % (name))
+ print('DATA_NAME:%s' % (sample_name))
+ print('DATA_FORMAT:%s' % ('bw'))
+ print('DESCRIPTION:in house ChIP data')
+ print('LOCATION:%s' % (path))
+ print('NOTE:')
+ print('')
+
+f.close()
diff --git a/Code/make_parameter_dapseq2.py b/Code/make_parameter_dapseq2.py
new file mode 100644
index 0000000..946ab6f
--- /dev/null
+++ b/Code/make_parameter_dapseq2.py
@@ -0,0 +1,57 @@
+import sys, glob, os, operator
+from geneid2name import make_gene_name_AGI_map_dict
+
+DAPSEQ_DIR = '../Data/C/Mapped/dapseq/peaks'
+
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
+
+
+def make_dapseq_dictionary(dirname):
+
+ d = {}
+
+ files = glob.glob(os.path.join(dirname, '*/*/*/*.narrowPeak'))
+
+ for f in files:
+ lst = f.split('/')
+ tf_name = lst[-3]
+ if not tf_name in d:
+ d[tf_name] = f
+ else:
+ print('ERROR: transcription factor name not unique.')
+ sys.exit()
+
+ return d
+
+
+d = make_dapseq_dictionary(DAPSEQ_DIR)
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)
+
+
+count = 1
+for k, v in sorted(d.items(), key=operator.itemgetter(0)):
+ cid = 'C0002%09d' % (count)
+ count += 1
+ print('@%s' % (cid))
+ g = k.split('_')[0]
+ g = g.upper()
+
+
+ if g.startswith('AT'):
+ print('PROTEIN_ID:%s' % (g))
+ elif g in agi2name_dict and g != agi2name_dict[g]:
+ print('PROTEIN_ID:%s' % (agi2name_dict[g]))
+ else:
+ print('PROTEIN_ID:%s' % (g))
+
+ if g.startswith('AT') and g in agi2name_dict and g != agi2name_dict[g]:
+ print('PROTEIN_NAME:%s' % (agi2name_dict[g]))
+ else:
+ print('PROTEIN_NAME:%s' % (g))
+
+ print('DATA_NAME:%s' % (k))
+ print('DATA_FORMAT:%s' % ('narrowPeak'))
+ print('DESCRIPTION:dapseq')
+ print('LOCATION:%s' % (v))
+ print('NOTE:')
+ print('')
diff --git a/Code/make_parameter_dapseq3.py b/Code/make_parameter_dapseq3.py
new file mode 100644
index 0000000..405b3dc
--- /dev/null
+++ b/Code/make_parameter_dapseq3.py
@@ -0,0 +1,75 @@
+# Usage: python make_parameter_dapseq3.py
+# Because DAP-seq does not include all TFs, include other TFs as well. The idea is that TFs within the same family are highly conserved in their binding.
+#
+
+import sys, glob, os, operator
+from geneid2name import make_gene_name_AGI_map_dict
+
+DAPSEQ_DIR = '/home/hui/network/dapseq_merged'
+MAP_FILE = '/home/hui/network/dapseq_merged/tffamily.simple.txt'
+
+def get_name(s):
+ lst = s.split('_')
+ result = []
+ for x in lst:
+ if x != 'tnt':
+ result.append(x)
+ return '_'.join(result)
+
+def make_dapseq_dictionary(dirname):
+
+ d = {}
+
+ files = glob.glob(os.path.join(dirname, '*.narrowPeak'))
+
+ for f in files:
+ lst = f.split('/')
+ tf_name = lst[-1].split('.')[0]
+ tf_name = get_name(tf_name)
+ if not tf_name in d:
+ d[tf_name] = f
+ else:
+ print('ERROR: transcription factor name not unique.')
+ sys.exit()
+
+ return d
+
+
+d = make_dapseq_dictionary(DAPSEQ_DIR)
+
+f = open(MAP_FILE)
+lines = f.readlines()
+f.close()
+
+# since MAP_FILE contains duplicate lines, deduplicate by TF id
+d_family = {}
+for line in lines:
+ line = line.strip()
+ lst = line.split()
+ tf = lst[0].upper()
+ tf_name = lst[1]
+ family = lst[2]
+ if not tf in d_family:
+ d_family[tf] = (tf_name, family)
+ else:
+ if family != d_family[tf][1]:
+ print('WARNING: %s conflict [%s %s]!' % (tf, family, d_family[tf][1]))
+
+count = 1
+for k in sorted(d_family.keys()):
+ g = k
+ gname = d_family[k][0]
+ key = d_family[k][1]
+ if key in d:
+ cid = 'C0003%09d' % (count)
+ count += 1
+ print('@%s' % (cid))
+ print('PROTEIN_ID:%s' % (g))
+ print('PROTEIN_NAME:%s' % (gname))
+ print('DATA_NAME:%s' % (gname))
+ print('DATA_FORMAT:%s' % ('narrowPeak'))
+ print('DESCRIPTION:inferred from dapseq')
+ #print('LOCATION:%s' % (os.path.join(DAPSEQ_DIR, d[key])))
+ print('LOCATION:%s' % (d[key]))
+ print('NOTE:')
+ print('')
diff --git a/Code/make_parameter_rnaseq.py b/Code/make_parameter_rnaseq.py
new file mode 100644
index 0000000..1fe9c6e
--- /dev/null
+++ b/Code/make_parameter_rnaseq.py
@@ -0,0 +1,163 @@
+# Usage: python make_parameter_rnaseq.py [id-list.txt] > parameter_for_buildRmatrix.txt
+# Edit QUANT_PATH, set NON_ZERO_RATIO.
+#
+# Purpose: automatically generate parameter_for_buildRmatrix.txt
+#
+# Update: 26 Feb 2017, slcu, hui
+# Update: 19 Sep 2019, slcu, hui [add read_ena_data_info_json]
+
+import sys, os, glob, json
+import fnmatch, re
+from configure import RNA_SEQ_INFO_FILE
+
+NON_ZERO_RATIO = 0.2 # omit *_quant.txt files with too many zeros.
+QUANT_PATH = ['../Data/R/Mapped/public', '../Data/R/Mapped/inhouse', '../Data/R/Mapped/other'] # places where all _quant.txt reside. _quant.txt in sub-directories will also be used.
+
+def get_quant_file_list(fname):
+ f = open(fname)
+ result = []
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ result.append(lst[0])
+ f.close()
+ return result
+
+def extract_id(s, src):
+ if src == 'inhouse' or src == 'other':
+ dirname = os.path.dirname(s)
+ lst = dirname.split('/')
+ parent_dir = lst[-1]
+ libno = ''.join(filter(str.isdigit, parent_dir)) # extract the library number; ''.join keeps this working under Python 3, where filter returns an iterator
+ libno = libno.zfill(3)
+ sample_no = os.path.basename(s) # extract sample number
+ first_match = re.findall('_S\d+_', sample_no)[0]
+ sample_no = ''.join(filter(str.isdigit, first_match)).zfill(2)
+ return '0000' + libno + sample_no
+ if src == 'sra':
+ index = s.find('_quant')
+ if index > 0:
+ return s[:index]
+ else:
+ return 'NA'
+ return 'NA'
+
+def zfill2(s, n):
+ return s + 'X' * (n-len(s))
+
+def make_id(success_flag, lab_id, myid): # should also work for non-SRA id
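+ # Illustrative examples (made-up ids): make_id('0', 'SRR', '123456') returns
+ # 'R0SRR123456XXX', since zfill2 right-pads the 9-character field with 'X';
+ # make_id('0', '001', '000071410') returns 'R0001000071410'.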
+ if lab_id == 'SRR' or lab_id == 'ERR' or lab_id == 'DRR':
+ result = 'R' + success_flag + lab_id + zfill2(myid, 9)
+ else: # inhouse or other
+ result = 'R' + success_flag + lab_id + myid
+ return result
+
+def glob_files_include_path(directory, pattern):
+ ''' return all file names (with paths) given directory and pattern '''
+ result = []
+ for root, dirnames, filenames in os.walk(directory):
+ for filename in fnmatch.filter(filenames, pattern):
+ result.append(os.path.join(root, filename))
+ return result
+
+def glob_files_include_paths(directory_lst, pattern):
+ ''' return all file names (with paths) given directory and pattern '''
+ result = []
+ for directory in directory_lst:
+ result.extend(glob_files_include_path(os.path.abspath(directory), pattern))
+ return result
+
+def non_zero_ratio(fname):
+ non_zero_count = 0
+ total_count = 0
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split()
+ tpm = lst[3]
+ if not tpm == '0' and not 'nan' in tpm:
+ non_zero_count += 1
+ total_count += 1
+ return 1.0 * non_zero_count / total_count
+
+def get_rna_word_count(s):
+ ''' If s looks like description of an RNA-seq experiment, return 1 otherwise return 0. '''
+ count = 0
+ s = s.lower()
+ if 'rna-seq' in s or 'rnaseq' in s or 'transcriptome' in s or 'transcript' in s or 'mrna' in s:
+ count = 1
+ return count
+
+def read_ena_data_info_json(fname):
+ d = {}
+ with open(fname) as json_data:
+ json_dict = json.load(json_data)
+ for run_id in json_dict:
+ d[run_id] = 1
+ return d
+
+### main
+if not os.path.exists(RNA_SEQ_INFO_FILE):
+ print('make_parameter_rnaseq.py: you must provide %s. See parse_ena_xml.py on how to make it.' % (RNA_SEQ_INFO_FILE))
+ sys.exit()
+
+rna_data_info_dict = read_ena_data_info_json(RNA_SEQ_INFO_FILE)
+
+
+print('%%GENE_LIST=../Data/information/gene-list.txt\n%%HOLDON=NO\n') # head
+
+if len(sys.argv) > 1: # only get these samples specified in id-list.txt
+ quant_files = get_quant_file_list(sys.argv[1])
+else:
+ quant_files = glob_files_include_paths(QUANT_PATH, '*_quant.txt')
+
+
+include_count = 0
+total = 0
+already_added_dict = {}
+for fn in sorted(quant_files):
+ total += 1
+
+ nzr = non_zero_ratio(fn)
+ if nzr > NON_ZERO_RATIO: # files with too many zeros are ignored
+ fn2 = os.path.basename(fn)
+ data_name = 'None'
+ include_me = False # default, in case the path matches none of the three sources below
+ if 'inhouse' in fn:
+ myid = extract_id(fn, 'inhouse')
+ myid2 = make_id('0','001', myid) # lab id is 001, phil's lab
+ data_name = 'LAB001' + '_LIB' + myid[-5:]
+ desc = 'in-house'
+ include_me = True
+ elif 'public' in fn:
+ myid = extract_id(fn2, 'sra')
+ myid2 = make_id('0',myid[0:3], myid[3:])
+ data_name = myid
+ desc = 'SRA'
+ include_me = True if myid in rna_data_info_dict and rna_data_info_dict[myid] >= 0 else False # IMPORTANT
+ elif 'other' in fn:
+ myid = extract_id(fn, 'other')
+ # get lab id
+ dirname = os.path.dirname(fn)
+ lst = dirname.split('/')
+ lab_id = lst[-2]
+ myid2 = make_id('0', lab_id, myid) # lab id comes from the directory name
+ data_name = 'LAB' + lab_id + '_LIB' + myid[-5:]
+ desc = 'Other lab'
+ include_me = True
+ if include_me and not myid2 in already_added_dict:
+ print('@%s' % (myid2))
+ print('DATA_NAME:%s' % (data_name))
+ print('DATA_FORMAT:%s'% ('txt'))
+ print('DESCRIPTION:%s' % (desc))
+ print('LOCATION:%s' % (fn))
+ print('NOTE: non zero ratio is %4.2f' % (nzr))
+ print('')
+ include_count += 1
+ already_added_dict[myid2] = 'yes'
+ else:
+ #print('%s has too many zeroes. ignore.' % (fn))
+ pass
+
+print('#Done. Processed %d files. Included %d files.' % (total, include_count))
diff --git a/Code/make_target_tf.py b/Code/make_target_tf.py
new file mode 100644
index 0000000..1cb5147
--- /dev/null
+++ b/Code/make_target_tf.py
@@ -0,0 +1,290 @@
+# Usage: python make_target_tf.py parameter_for_net.txt > target_tf.txt
+#
+# Purpose: Make a target tfs file: each line is 'Target TF1 Condition.list'.
+# See ../Data/information/target_tf.txt for an example.
+#
+# Created on 17 JAN 2017, hui
+# Last modified on 16 Mar 2017, slcu, hui
+# Last modified on 5 Aug 2019, zjnu, hui
+# Last modified on 9 Oct 2019, zjnu, hui
+# Last modified on 22 Nov 2019, zjnu, hui [include binding information from two sources: target_tf.txt.20170629_143000 (results I made when I was at SLCU between 2016 and 2017) and target_tf_agris.txt]
+
+import sys, os, operator, itertools
+import numpy as np
+from param4net import make_global_param_dict
+
+####################################
+DATA_SYMBOL = '@'
+SIGNAL_INPUT_RATIO_TAU = 1.5
+####################################
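+# For BW (coverage) data, a gene is taken as bound when signal/input exceeds
+# SIGNAL_INPUT_RATIO_TAU and the input intensity lies in (0, 10000); see get_tf().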
+
+def read_matrix_data(fname):
+ '''
+ fname - a file, first line is head, first column is row name.
+ '''
+
+ lineno = 0
+ colid = []
+ rowid = []
+ d = {} # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
+ d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
+ d3 = {} # {gene1: [], gene2: [], ...}
+ d4 = {} # {cond1:[], cond2:[], ...}
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+
+ head_line = lines[0].strip()
+ lst = head_line.split()
+ colid = lst[1:]
+
+ for c in colid:
+ d2[c] = {}
+ d4[c] = []
+
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split()
+ g = lst[0]
+ rowid.append(g)
+ d[g] = {}
+ levels = lst[1:]
+ if len(levels) != len(colid):
+ print('Incomplete columns at row %s' % (g))
+ sys.exit()
+
+ d3[g] = []
+ for i in range(len(colid)):
+ c = colid[i]
+ d[g][c] = float(levels[i])
+ d2[c][g] = float(levels[i])
+ d3[g].append(float(levels[i]))
+ d4[c].append(float(levels[i]))
+ lineno += 1
+
+ d_return = {}
+ d_return['xy'] = d # first gene, then condition
+ d_return['yx'] = d2 # first condition, then gene
+ d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row
+ d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column
+ d_return['nrow'] = lineno - 1
+ d_return['ncol'] = len(colid)
+ d_return['rowid'] = rowid
+ d_return['colid'] = colid
+
+ d4_sorted = {}
+ for k in d4:
+ d4_sorted[k] = sorted(d4[k], reverse=True) # largest numbers on the top
+ d_return['yy_sorted'] = d4_sorted
+
+ return d_return
+
+
+def get_value(s, delimit):
+ lst = s.split(delimit)
+ return lst[1].strip()
+
+
+def read_info_data(fname):
+ ''' Read ChIP-seq data information '''
+
+ if not os.path.exists(fname):
+ print('%s does not exist.' % (fname))
+ sys.exit()
+
+ d = {'ID_LIST':[]}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#') or line.startswith('%'):
+ continue
+ if line.startswith(DATA_SYMBOL):
+ s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ s = s.strip()
+ if s in d:
+ print('make_target_tf: ID %s duplicate' % (s))
+ sys.exit()
+ d[s] = {'PROTEIN_ID':'', 'PROTEIN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ d['ID_LIST'].append(s)
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_NAME:'): # fixed misspelt key; parameter files write PROTEIN_NAME:
+ d[s]['PROTEIN_NAME'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_ID:'):
+ d[s]['PROTEIN_ID'] = get_value(line, ':')
+ elif line.startswith('DATA_NAME:'):
+ d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+
+ return d
+
+
+def get_gene_list(fname):
+ result = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ result.append(lst[0])
+ f.close()
+ return result
+
+
+def get_threshold2(lst, glb_param_dict):
+ x = np.array(lst)
+ x = x[x > 0]
+ max_num = int(glb_param_dict['MAX_NUM_TARGETS']) # max number of targets for a protein
+ percent = float(glb_param_dict['OVERFLOW_TARGETS_PERCENTAGE']) # if we have more targets than the max number, then include this percent of exceeding targets
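+ # Illustrative numbers (the real values come from parameter_for_net.txt):
+ # with MAX_NUM_TARGETS=2000, OVERFLOW_TARGETS_PERCENTAGE=0.1 and 3000
+ # positive scores, overflow=1000 and keep=100, so the returned threshold is
+ # the 2101st-largest score and roughly 2100 targets pass it.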
+ n = len(x)
+ if n < max_num:
+ return x[-1]
+ else: # include some overflowing targets, but not all
+ overflow = n - max_num
+ keep = int(overflow * percent)
+ index = keep + max_num
+ return x[index]
+
+
+def convert_dict(d):
+ '''
+ d = {tf:{cond1:[target1, target2], cond2:[...]}}
+ result = {target:{tf:[c1,c2], tf:[c2,c3]}, ... }
+ '''
+
+ result = {}
+ for k in d: # k is tf
+ vd = d[k] # vd is something like {cond1:[target1, target2], cond2:[...]}
+ for c in vd:
+ lst = vd[c] # a list of targets
+ for x in lst: # x is a target
+ if not x in result:
+ result[x] = {k:[c]}
+ else:
+ if not k in result[x]:
+ result[x][k] = [c]
+ else:
+ result[x][k].append(c)
+
+ return result
+
+
+def get_tf(bind_dict, info_dict, input_dict, glb_param_dict):
+
+ tf_dict = {} # key is TF, value is a list of targets:[target1, target2, target3, ...]
+ #input_cond = input_dict['colid']
+ if len(input_dict) > 0:
+ input_cond = input_dict['colid'][0] # use the first column of INPUT matrix as input (improve). INPUT is used for format BW.
+ else:
+ input_cond = 'NA'
+
+ gene_id = np.array( bind_dict['rowid'] )
+
+ for c in bind_dict['colid']: # check each column (protein) in binding matrix. Find the protein's targets.
+ #print(c)
+ g2 = info_dict[c]['PROTEIN_ID'] # g2 is TF
+ bind_val = np.array( bind_dict['yy'][c] ) # a column of values
+
+ if info_dict[c]['DATA_FORMAT'].upper() == 'BW': # require more consideration in the future
+ input_val = np.array( input_dict['yy'][input_cond] )
+ index = np.logical_and( np.logical_and(input_val > 0, input_val < 10000), (bind_val / input_val) > SIGNAL_INPUT_RATIO_TAU) # ignore intensities greater than 10000 as these are definitely noise
+
+ elif info_dict[c]['DATA_FORMAT'].upper() == 'NARROWPEAK':
+ tau = get_threshold2(bind_dict['yy_sorted'][c], glb_param_dict)
+ index = bind_val >= tau
+ else:
+ print('make_target_tf: Data format %s not recognised. Only bw and narrowPeak are valid.' % (info_dict[c]['DATA_FORMAT']))
+ sys.exit()
+
+ target_gene_id = gene_id[index]
+ if g2 != '' and g2 != 'id_unknown':
+ if not g2 in tf_dict:
+ tf_dict[g2] = {c:list(target_gene_id)}
+ else:
+ tf_dict[g2][c] = list(target_gene_id)
+
+ # tf_dict is a bit complicated: key is TF, value is a dictionary
+ # where its key is condition, and value is a list of target genes.
+ # It basically say this TF under condition c binds to a list of
+ # target genes.
+ d = convert_dict(tf_dict)
+ return d
+
+
+def augment_dict(d, target, tf, cond_lst):
+ ''' Enlarge d '''
+ if not target in d:
+ d[target] = {tf:cond_lst}
+ else:
+ if not tf in d[target]:
+ d[target][tf] = cond_lst
+ else:
+ cond_lst.extend(d[target][tf])
+ d[target][tf] = sorted(list(set(cond_lst)))
+
+
+def target_tf(bind_dict, bind_info_dict, input_dict, glb_param_dict):
+ ''' Print lines in this format: target TF ChIP-seq conditions, e.g., ../Data/information/target_tf.txt
+ For example, 'AT1G01270 AT3G46640 C0001000008426 C0001000008427 C0001000008428'
+ '''
+ d = get_tf(bind_dict, bind_info_dict, input_dict, glb_param_dict)
+ # d has the following format {target:{tf1:[c1,c2], tf2:[c2,c3]}, ... }
+
+ # augment d with information from ../Data/information/target_tf_agris.txt and ../Data/information/target_tf.txt.20170629_143000
+ if os.path.exists('../Data/information/target_tf_agris.txt'):
+ f = open('../Data/information/target_tf_agris.txt')
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) == 3:
+ target0 = lst[0]
+ tf0 = lst[1]
+ cond_lst0 = lst[2].split()
+ augment_dict(d, target0, tf0, cond_lst0)
+
+ if os.path.exists('../Data/information/target_tf.txt.20170629_143000'):
+ f = open('../Data/information/target_tf.txt.20170629_143000')
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) == 3:
+ target0 = lst[0]
+ tf0 = lst[1]
+ cond_lst0 = lst[2].split()
+ augment_dict(d, target0, tf0, cond_lst0)
+
+ for target in sorted(d.keys()):
+ tf_d = d[target]
+ if len(tf_d) > 0:
+ for tf in sorted(tf_d.keys()):
+ cond_lst = sorted(list(set(tf_d[tf])))
+ #if len(cond_lst) > 1 and 'C0000000000001' in cond_lst: # C0000000000001 is for binding evidence from agris
+ # cond_lst.remove('C0000000000001')
+ print('%s\t%s\t%s' % (target, tf, ' '.join(cond_lst) ) )
+
+
+########## main ##################################################
+param_file = sys.argv[1] # a single parameter file, parameter_for_net.txt
+glb_param_dict = make_global_param_dict(param_file)
+#print('Read binding data ...')
+bind_dict = read_matrix_data(glb_param_dict['BINDING_MATRIX'])
+bind_info_dict = read_info_data(glb_param_dict['BINDING_INFO'])
+
+if os.path.exists(glb_param_dict['INPUT_MATRIX']):
+ input_dict = read_matrix_data(glb_param_dict['INPUT_MATRIX']) # for comparing with bw files
+else:
+ input_dict = {}
+
+#print('Make target TF lines ...')
+target_tf(bind_dict, bind_info_dict, input_dict, glb_param_dict)
diff --git a/Code/make_target_tf_agris.py b/Code/make_target_tf_agris.py
new file mode 100644
index 0000000..c96c8fa
--- /dev/null
+++ b/Code/make_target_tf_agris.py
@@ -0,0 +1,39 @@
+# Make target_tf from AtRegNet.txt
+# Usage: python make_target_tf_agris.py > ../Data/information/target_tf_agris.txt
+
+import sys
+
+fname = '../Data/information/AtRegNet.txt'
+
+sample_id = 'C0000000000001'
+
+f = open(fname)
+lines = f.readlines()
+f.close()
+
+d = {}
+count = 2
+duplicate = 0
+for line in lines[1:]:
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) >= 5:
+ tf0 = lst[1].upper().strip()
+ target0 = lst[4].upper().strip()
+ tf_lst = tf0.split('/')
+ target_lst = target0.split('/')
+ for tf in tf_lst:
+ for target in target_lst:
+ if tf.startswith('AT') and target.startswith('AT'):
+ k = target + '.' + tf
+ if k in d:
+ #print('Warning at line %d ' % (count))
+ duplicate += 1
+ else:
+ d[k] = [target, tf, sample_id]
+ count += 1
+
+
+# report the summary on stderr so it does not end up in the redirected output file
+sys.stderr.write('pairs %d\n' % len(d))
+sys.stderr.write('duplicate %d\n' % (duplicate))
+for k in sorted(d.keys()):
+ print('\t'.join(d[k]))
+
diff --git a/Code/make_upload_chip_parameter.py b/Code/make_upload_chip_parameter.py
new file mode 100644
index 0000000..e6cc4a8
--- /dev/null
+++ b/Code/make_upload_chip_parameter.py
@@ -0,0 +1,233 @@
+# Usage: python make_upload_chip_parameter.py
+#
+# Purpose: make a part of parameter_for_buildCmatrix.txt given the uploaded files in UPLOAD_DIR.
+# Each unique uploaded file will be assigned an ID.
+# The assigned ID starts with C0000, followed by 9 digits.
+# The following cases are handled: (i) the same BED file is uploaded several times; the latest submission will be used.
+#
+# TBD: append to PARAMETER_FOR_BUILDCMATRIX
+# Created 20 July 2017, slcu, hui
+
+import os, sys, glob
+from datetime import datetime
+
+PARAMETER_FOR_BUILDCMATRIX = '../Data/upload/parameter_for_buildCmatrix.txt' # [change]
+UPLOAD_DIR = '../Data/upload/chipseq'
+
+INCLUDE_STAMP = 'BRAIN_HAS_INCLUDED_ME'
+
+def good_file(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line.startswith('#') and 'STATUS:' in line:
+ if 'SUCC' in line or 'UNKNOWN' in line:
+ return True
+ if 'FAIL' in line:
+ return False
+ return False
+
+def already_included(fname):
+ ''' If fname has already been processed, then its head line is marked with BRAIN_HAS_INCLUDED_ME. '''
+ f = open(fname)
+ lines = f.readlines()
+ n = len(lines)
+ f.close()
+ for line in lines[0:min(n, 5)]: # the first five lines should include INCLUDE_STAMP if this file is already included.
+ line = line.strip()
+ if line.startswith('#') and INCLUDE_STAMP in line:
+ return True
+ return False
+
+
+def same_content(f1, f2):
+ ''' Test whether two files, f1 and f2, have the same content. '''
+ if os.path.exists(f1) and not os.path.exists(f2):
+ return False
+ if not os.path.exists(f1) and os.path.exists(f2):
+ return False
+ if not os.path.exists(f1) and not os.path.exists(f2):
+ return False
+ if os.path.exists(f1) and os.path.exists(f2):
+ a = open(f1)
+ b = open(f2)
+ s1 = ''
+ for line in a:
+ line = line.strip()
+ if not line.startswith('#'): # don't include lines starting with '#'
+ s1 += line
+ s2 = ''
+ for line in b:
+ line = line.strip()
+ if not line.startswith('#'):
+ s2 += line
+ a.close()
+ b.close()
+ if s1 == s2:
+ return True
+ else:
+ return False
+
+def repeat(fname, d):
+ ''' Are there other files having the same content as fname? Return '' if no; otherwise return the conflicting file name. '''
+ for k in d:
+ if same_content(fname, d[k]['LOCATION']):
+ return k
+ return ''
+
+def update_dict(d, k, fname):
+ d[k] = make_chip_info_dict(fname)
+
+# def update_it(upload_dir, upload_dict):
+# id_lst = sorted(upload_dict.keys())
+# if id_lst != []:
+# last_id = id_lst[-1]
+# last_id_number = int(last_id[5:])
+# else:
+# last_id_number = 0
+# for fname in sorted(glob.glob(os.path.join(UPLOAD_DIR, '20*.*'))): # all uploaded BED files start with time stamp 20.....
+# if good_file(fname) and not already_included(fname):
+# #print(upload_dict)
+# k = repeat(fname, upload_dict)
+# if k == '':
+# k = '%d' % (last_id_number + 1)
+# k = 'C0000' + k.zfill(9)
+# upload_dict[k] = make_chip_info_dict(fname)
+# else:
+# update_dict(upload_dict, k, fname)
+# mark_it_as_included(fname)
+
+
+def make_chip_info_dict(fname):
+ ''' Return a dictionary given a user submitted file. '''
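+ # A hypothetical upload might begin with header lines such as:
+ # # PROTEIN_ID:AT1G19850
+ # # PROTEIN_NAME:PIF4
+ # # STATUS:SUCC
+ # Each '#' line is parsed as a key:value pair; STATUS is what good_file() checks.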
+ d = {'PROTEIN_ID':'', 'PROTEIN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'narrowPeak', 'DESCRIPTION':'user upload', 'LOCATION':'', 'NOTE':''}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line.startswith('#'):
+ s = line[(line.rfind('#')+1):]
+ s = s.strip()
+ lst = s.split(':')
+ k = lst[0].strip()
+ v = line[(line.find(':')+1):]
+ d[k] = v
+
+ d['DATA_NAME'] = os.path.basename(fname)
+ d['LOCATION'] = os.path.abspath(fname)
+ d['NOTE'] = 'update:%s' % datetime.now().strftime('%Y%m%d')
+ return d
+
+def mark_it_as_included(fname):
+ ''' Prepend a head line including INCLUDE_STAMP'''
+ f = open(fname)
+ s = f.read()
+ f.close()
+ f = open(fname, 'w')
+ curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
+ f.write('# %s %s\n' % (INCLUDE_STAMP, curr_time) + s)
+ f.close()
+
+def make_string(d):
+ s = ''
+ for k in sorted(d.keys()):
+ s += '@%s\n' % k
+ s += 'PROTEIN_ID:%s\n' % d[k]['PROTEIN_ID']
+ s += 'PROTEIN_NAME:%s\n' % d[k]['PROTEIN_NAME']
+ s += 'DATA_NAME:%s\n' % d[k]['DATA_NAME']
+ s += 'DATA_FORMAT:narrowPeak\n'
+ s += 'DESCRIPTION:%s\n' % d[k]['DESCRIPTION']
+ s += 'LOCATION:%s\n' % d[k]['LOCATION']
+ s += 'NOTE:%s\n\n' % d[k]['NOTE']
+ return s
+
+def md(fname):
+ ''' Return a dictionary containing the parameter information. '''
+ d = {}
+ if not os.path.exists(fname):
+ return {}
+ else:
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line != '' and line.startswith('@'):
+ k = line[1:]
+ d[k] = {}
+ elif line != '':
+ lst = line.split(':')
+ k2 = lst[0].strip()
+ v = line[(line.find(':')+1):]
+ d[k][k2] = v
+ return d
+
+def is_empty(fname):
+ ''' Return True if fname has no content. '''
+ if os.path.exists(fname):
+ f = open(fname)
+ s = f.read()
+ f.close()
+ return s.strip() == ''
+ return False
+
+def get_largest_upload_chip_id(fname):
+ lst = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ if line.startswith('@C0000'):
+ lst.append(int(line[2:]))
+ f.close()
+ if lst != []:
+ return max(lst)
+ else:
+ return 0
+
+def make_upload_dict(param_fname, included_path):
+ d = {}
+ i = get_largest_upload_chip_id(param_fname) + 1 # starting id
+ for fn in sorted(glob.glob(os.path.join(UPLOAD_DIR, '20*.*')), reverse=True): # file names start with a time stamp, so newer files are considered first and the latest duplicate wins
+ k = 'C0000' + ('%d' % (i)).zfill(9)
+ if good_file(fn) and not already_included(fn) and repeat(fn, d) == '':
+ d[k] = make_chip_info_dict(fn)
+ i += 1
+ if good_file(fn):
+ mark_it_as_included(fn)
+ cmd = 'mv %s %s' % (fn, included_path)
+ os.system(cmd)
+
+ return d
+
+def append_to_file(fname, s):
+ f = open(fname, 'a')
+ f.write('\n' + s + '\n')
+ f.close()
+
+def make_directory(my_dir):
+ if not os.path.exists(my_dir):
+ os.makedirs(my_dir)
+
+def make_copy(fname):
+ if os.path.exists(fname):
+ curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+ new_fname = fname + '.copy.%s' % (curr_time)
+ f = open(fname)
+ s = f.read()
+ f.close()
+ f = open(new_fname, 'w')
+ f.write(s)
+ f.close()
+
+## main
+included_path = os.path.join(UPLOAD_DIR, 'included')
+make_directory(included_path)
+upload_dict = make_upload_dict(PARAMETER_FOR_BUILDCMATRIX, included_path)
+s = make_string(upload_dict)
+if s != '':
+ # before changing PARAMETER_FOR_BUILDCMATRIX, make a copy of it
+ make_copy(PARAMETER_FOR_BUILDCMATRIX)
+ append_to_file(PARAMETER_FOR_BUILDCMATRIX, s)
diff --git a/Code/merge_edges.py b/Code/merge_edges.py
new file mode 100644
index 0000000..ef870fb
--- /dev/null
+++ b/Code/merge_edges.py
@@ -0,0 +1,190 @@
+# Purpose: When edges.txt contains multiple lines representing the
+# same edge, choose only one edge.
+#
+# Usage: python merge_edges.py
+#
+# This script is used to produce the edges.txt for the brain
+# web application. It searches in EDGE_POOL_DIR for edge files
+# (with 10 columns) from many sources, most likely with
+# duplicated edges. It removes duplication and computes
+# strength for each edge.
+#
+# Note: make sure fname is edges.txt. Rationale: to save space, I am no
+# longer going to use a full list of RNA-seq experiment IDs in the
+# fifth column; a number is used instead. This number is the count of
+# RNA-seq experiment IDs. If no IDs are available, this number is 1.
+# However, I am still going to keep a full list of ChIP-seq experiment
+# IDs (the sixth column).
+#
+# Created on 3 August 2019 by Hui Lan <lanhui@zjnu.edu.cn>
+# Last modified on 5 August 2019 by Hui Lan <lanhui@zjnu.edu.cn>
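+#
+# For reference, the ten tab-separated columns consumed below are: target, tf,
+# score, type_of_score, rids (RNA-seq ids or their count), cids (ChIP-seq ids),
+# ll, date (yyyymmdd), strength, method_or_tissue. A made-up example:
+# AT1G01270<TAB>AT3G46640<TAB>0.82<TAB>mix<TAB>200<TAB>C0001000008426<TAB>.<TAB>20190805<TAB>1.50<TAB>seedling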
+
+import os, operator, sys, math, datetime, glob
+from configure import EDGE_POOL_DIR, MERGED_EDGE_FILE
+
+def get_number_of_RNAseq_ids(s):
+ if s == '.':
+ return 1
+ if s.isdigit():
+ return int(s)
+ return len(s.split())
+
+
+def add_dashes_to_date(s):
+ return s[:4] + '-' + s[4:6] + '-' + s[6:]
+
+
+def split_id_and_name(s):
+ lst = s.split()
+ result = lst[0]
+ if lst[0] != lst[1]:
+ result = lst[0] + ' (' + lst[1] + ')'
+ return result
+
+
+def make_html_list(s):
+ if s.strip() == '':
+ return 'None'
+ result = '<ul>'
+ for method in s.split(','):
+ result += '<li>%s</li>' % (method)
+ result += '</ul>'
+ return result
+
+
+def make_html_page(lst, fname):
+ tf = lst[1].split()[0] # ID only, no name
+ target = lst[0].split()[0]
+
+ head = '<title>%s</title>\n' % ('Target is ' + lst[0] + ' and TF is ' + lst[1])
+ head += '<link href="./c3.min.css" rel="stylesheet" />\n<script src="./d3.min.js"></script>\n<script src="./c3.min.js"></script>\n<script src="./scatterplot.js"></script>'
+ s = '<html>'
+ s += '<head>%s</head>\n' % (head)
+ body = '<p>TF is %s. </p>\n' % (split_id_and_name(lst[1]))
+ body += '<p>Target is %s. </p>\n' % (split_id_and_name(lst[0]))
+ body += '<p>Association strength: %s.</p>\n' % (lst[8])
+ body += '<p>Edge made on %s. </p>\n' % (add_dashes_to_date(lst[7]))
+ body += '<p>Methods: %s</p>\n' % (make_html_list(lst[9]))
+ body += '<p>Evidence of binding: %s.</p>\n' % (lst[5] if lst[5] != '.' else 'TBA')
+ body += '<a id="myLink" href="javascript:void(0);" onclick="drawScatterPlot(\'json/%s.json\', \'json/%s.json\', \'rnaseq_info_database.json\', [\'.\']);">Click for gene expression scatter plot</a>\n' % (tf, target)
+ body += '<p>For more detailed analysis, <a href="gene-expression-level-scatterplot-by-XuMengqi.zip">download</a> our gene expression scatter plotting tool. No installation is required. Input data: <a href="json/%s.json">TF gene expression</a> <a href="json/%s.json">Target gene expression</a> <a href="rnaseq_info_database.json">RNA-seq annotation</a></p>\n' % (tf, target)
+ body += '<p id="chart"></p>\n'
+## if 'AT2G44304' in lst[0] and 'AT2G24700' in lst[1]:
+## print(lst)
+## sys.exit()
+
+ s += '<body>%s</body>\n' % (body)
+ s += '</html>'
+ f = open(fname, 'w')
+ f.write(s)
+ f.close()
+
+
+def compute_time_difference_in_days(t1, t2):
+ ''' t1 and t2 have this format: yyyymmdd. '''
+ if not (t1.isnumeric() and len(t1) == 8):
+ raise Exception('t1 format wrong in compute_time_difference_in_days.')
+ if not (t2.isnumeric() and len(t2) == 8):
+ raise Exception('t2 format wrong in compute_time_difference_in_days.')
+
+ t1 = datetime.date(int(t1[:4]), int(t1[4:6]), int(t1[6:]))
+ t2 = datetime.date(int(t2[:4]), int(t2[4:6]), int(t2[6:]))
+ return (t1 - t2).days
+
+
+def make_new_edge(lst_tuple):
+ lst = sorted(lst_tuple, reverse=True, key = lambda x: abs(float(x[2]))) # sort according to absolute value of score
+ best_edge = list( lst[0] )
+
+ # see section 'Ranking edges using frecency' in the brain documentation
+ F = len(lst_tuple)
+
+ RN_lst = []
+ r_lst = []
+ most_recent_edge_date = '00000000'
+ method_or_tissue = []
+ cids = ''
+ for t in lst:
+ r_lst.append( abs(float(t[2])) )
+ rids = t[4]
+ if t[5] > cids:
+ cids = t[5]
+ RN_lst.append( get_number_of_RNAseq_ids(rids) )
+ date = t[7]
+ if date > most_recent_edge_date:
+ most_recent_edge_date = date
+ method_or_tissue.append(t[9])
+ S = 365 * 10
+ curr_date = datetime.datetime.now().strftime('%Y%m%d')
+ #time_diff = int(most_recent_edge_date) - int(curr_date)
+ time_diff = compute_time_difference_in_days(most_recent_edge_date, curr_date)
+ strength = sum(r_lst)/len(r_lst) * math.log(sum(RN_lst)/len(RN_lst)+1, 10) * math.log(F+1, 2) * math.exp(time_diff/S)
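+ # For intuition (made-up numbers): two copies of an edge (F=2) with scores
+ # 0.8 and 0.6, RN_lst=[200, 100] and a most recent sighting 30 days ago
+ # (time_diff=-30) give 0.7 * log10(151) * log2(3) * exp(-30/3650) ~= 2.40,
+ # so fresher, better-supported, repeatedly seen edges score higher.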
+ best_edge[4] = '%d' % max(RN_lst)
+ best_edge[5] = cids
+ best_edge[8] = '%.2f' % strength
+ best_edge[9] = ','.join(sorted(list(set(method_or_tissue)))) # unique methods or tissues, in string format
+
+## if 'AT2G44304' in best_edge[0] and 'AT2G24700' in best_edge[1]:
+## print(strength)
+## print(best_edge)
+## sys.exit()
+
+ return best_edge
+
+
+
+##main
+
+d = {}
+duniq = {}
+for fname in sorted(glob.glob(os.path.join(EDGE_POOL_DIR, '*.*'))):
+ print('[merge_edges.py]: including %s.' % (fname))
+ f = open(fname)
+
+ for line in f:
+ line = line.strip()
+ if len(line.split('\t')) == 10 and not line in duniq:
+ duniq[line] = 1
+ lst = line.split('\t')
+ target = lst[0]
+ tf = lst[1]
+ score = lst[2]
+ type_of_score = lst[3]
+ rids = lst[4]
+ cids = lst[5]
+ ll = lst[6]
+ date = lst[7]
+ strength = lst[8]
+ method_or_tissue = lst[9]
+
+ key = target + tf
+ t = (target, tf, score, type_of_score, rids, cids, ll, date, strength, method_or_tissue)
+
+ if not key in d:
+ d[key] = [t]
+ else:
+ d[key].append(t)
+
+ f.close()
+
+
+
+# make html pages
+folder_path = '../Data/temp/html_edges'
+if not os.path.isdir(folder_path):
+ os.mkdir(folder_path)
+
+
+print('[merge_edges.py]: Make text edge file...')
+fout = open(MERGED_EDGE_FILE, 'w')
+for k in d:
+ lst = make_new_edge(d[k])
+ fout.write('\t'.join(lst) + '\n')
+fout.close()
+
+
+print('[merge_edges.py]: Make html edge files. May take a while...')
+for k in d:
+ lst = make_new_edge(d[k])
+ pagename = lst[1].split()[0] + '_' + lst[0].split()[0] + '_0.html' # TF_Target.html
+ make_html_page(lst, folder_path + '/' + pagename)
diff --git a/Code/param4net.py b/Code/param4net.py
new file mode 100644
index 0000000..a68e7e3
--- /dev/null
+++ b/Code/param4net.py
@@ -0,0 +1,25 @@
+# Purpose: refactoring.
+# Created on 10 Aug 2019 by Hui Lan <lanhui@zjnu.edu.cn>
+
+GLB_PARAM_SYMBOL = '%%'
+
+def get_key_value(s):
+ lst = s.split('=')
+ k, v = lst[0], lst[1]
+ return (k.strip(), v.strip())
+
+
+def make_global_param_dict(fname):
+ f = open(fname)
+ d = {'GENE_LIST':'', 'HIGH_PRIORITY_GENE':'', 'BINDING_MATRIX':'', 'INPUT_MATRIX':'', 'EXPRESSION_MATRIX':'', 'BINDING_INFO':'', 'EXPRESSION_INFO':'', 'RESEARCH_KEYWORDS':'', 'USER_CONDITION_LIST':[], 'LOOK_FOR_POS_CORRELATION':'NO', 'LOOK_FOR_NEG_CORRELATION':'NO', 'MAKE_PLOT':'NO', 'TWO_WAY':'YES', 'THREE_WAY':'NO', 'TARGET_RANGE':'1000', 'FC':'2.0', 'PVALUE':'0.0001', 'QVALUE':'0.01', 'CHRINFO':{'1':30427671, '2':19698289, '3':23459830, '4':18585056, '5':26975502, 'Mt':366924, 'Pt':154478}, 'SELECT_POINTS_DIAGONAL_MAX_DIFF':0.25} # change
+ for line in f:
+ line = line.strip()
+ if line.startswith(GLB_PARAM_SYMBOL):
+ s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
+ lst = s.split('\t') # separate items by TAB
+ for x in lst:
+ if x != '':
+ k, v = get_key_value(x)
+ d[k] = v
+ f.close()
+ return d
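+
+# Minimal usage sketch (hypothetical parameter file content, assuming the
+# %%KEY=VALUE convention parsed above, items separated by TABs):
+#   %%GENE_LIST=genes.txt<TAB>TWO_WAY=NO
+# d = make_global_param_dict('parameter_for_net.txt')
+# d['GENE_LIST'] -> 'genes.txt'; d['TWO_WAY'] -> 'NO'; other keys keep defaults.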
diff --git a/Code/parse_ena_xml.py b/Code/parse_ena_xml.py
new file mode 100644
index 0000000..b9d6905
--- /dev/null
+++ b/Code/parse_ena_xml.py
@@ -0,0 +1,364 @@
+# Usage: python parse_ena_xml.py > rnaseq_info_database.txt
+#
+# Search in this script for 'd_run', 'd_sample', 'd_experiment' and
+# 'd_study', and set their input files. The input files are generated
+# by download_ena_metadata.py (except for d_sample). It also
+# writes a json file, rnaseq_info_database.json, directly to
+# Data/information, for displaying experimental information in the
+# scatterplot. Also move rnaseq_info_database.txt (the redirected
+# stdout of this script) to Data/information. Both files are used by
+# html_network.py.
+#
+# Purpose: Get description for RNA-seq data, one for each SRA Run ID.
+# Make rnaseq_info_database.txt and rnaseq_info_database.json. Each
+# line in rnaseq_info_database.txt contains information for a run id.
+#
+# NOTE: you might encounter UnicodeEncodeError when running the
+# program. To avoid that, first type this command:
+# export PYTHONIOENCODING=UTF-8.
+#
+# 22 Feb 2017, slcu, hui
+# 12 Apr 2017, slcu, hui
+# 20 Apr 2017, slcu, hui
+# 30 May 2017, slcu, hui
+# 01 Jun 2017, slcu, hui [added a column sample_id]
+# 19 Jun 2017, slcu, hui [added SraRunTable_Ath_Tax3702.txt in d_run2. Search d_run2 for how to get SraRunTable_Ath_Tax3702.txt.]
+
+import os, json, re, operator
+import xml.etree.ElementTree
+import sys
+
+MAX_DESCRIPTION_LENGTH = 600 # max number of characters to keep in json file
+
+def parse_SraRunTable(fname):
+ d = {}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if not line.startswith('#') and not line.startswith('Assay_Type_s') and line.lower().startswith('rna-seq'):
+ lst = line.split('\t')
+ acc = lst[17]
+ if not acc in d:
+ d[acc] = {}
+ d[acc]['experiment_id'] = lst[6] if lst[6] != '' else '.'
+ d[acc]['sample_id'] = (lst[4] + '...' + lst[18] + ' ' + lst[20]) if lst[4] != '' else '.'
+ d[acc]['study_id'] = lst[19] if lst[19] != '' else '.'
+ d[acc]['study_id_PRJ'] = lst[3] if lst[3] != '' else '.'
+ d[acc]['alias'] = lst[11] if lst[11] != '' else '.'
+ d[acc]['title'] = lst[20] if lst[20] != '' else '.'
+ return d
+
+def parse_run(fname):
+ d = {}
+
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+ for c in root.findall('RUN'):
+ acc = c.get('accession')
+ d[acc] = {}
+
+ alias = c.get('alias')
+ d[acc]['alias'] = alias
+
+ experiment = c.find('EXPERIMENT_REF').get('accession')
+ d[acc]['experiment_id'] = experiment
+
+ title = c.find('TITLE').text
+ d[acc]['title'] = title
+
+ d[acc]['study_id'] = '.'
+ for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+ s = i.text
+ #print(s)
+ if 'RP' in s: # run project
+ d[acc]['study_id'] = s
+ break
+ d[acc]['sample_id'] = '.'
+ for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+ s = i.text
+            if 'RS' in s: # sample accession
+ d[acc]['sample_id'] = s
+ break
+
+ return d
+
+
+def parse_study(fname):
+ d = {}
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+
+ for c in root.findall('PROJECT'):
+ d2 = {}
+ acc = c.find('./IDENTIFIERS/SECONDARY_ID')
+ if acc != None:
+ d2['secondary_id'] = acc.text
+ else:
+ d2['secondary_id'] = '.'
+ d2['primary_id'] = c.get('accession')
+
+ desc = c.find('DESCRIPTION')
+ d2['description'] = 'None'
+ if desc != None:
+ d2['description'] = desc.text
+
+ title = c.find('TITLE')
+ d2['title'] = 'None'
+ if title != None:
+ d2['title'] = title.text
+
+ run_id = ''
+ for i in c.findall('./PROJECT_LINKS/PROJECT_LINK/XREF_LINK/ID'):
+ s = i.text
+ if 'RR' in s:
+                run_id = s
+ break
+ lst = run_id.split(',')
+ for x in lst:
+ lst2 = x.split('-')
+ if len(lst2) == 1 and lst2[0] != '':
+ k = lst2[0]
+ d[k] = d2 # k is run id, such as SRR, ERR or DRR
+ elif len(lst2) == 2:
+ ss = lst2[0]
+ ee = lst2[1]
+ first_three_letters = ss[0:3]
+ sz = len(ss) - 3
+ ss_t = int(ss[3:])
+ ee_t = int(ee[3:])
+ for j in range(ss_t, ee_t+1, 1):
+ k = first_three_letters + str(j).zfill(sz)
+ d[k] = d2
+ return d
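+
+# The run-range expansion block above is repeated in parse_sample and
+# parse_experiment below; a possible shared helper (sketch, not wired in):
+# def expand_run_range(run_id):
+#     ''' 'SRR000001-SRR000003' -> ['SRR000001', 'SRR000002', 'SRR000003'] '''
+#     keys = []
+#     for x in run_id.split(','):
+#         lst2 = x.split('-')
+#         if len(lst2) == 1 and lst2[0] != '':
+#             keys.append(lst2[0])
+#         elif len(lst2) == 2:
+#             prefix, sz = lst2[0][0:3], len(lst2[0]) - 3
+#             for j in range(int(lst2[0][3:]), int(lst2[1][3:]) + 1):
+#                 keys.append(prefix + str(j).zfill(sz))
+#     return keys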
+
+
+def parse_sample(fname):
+ d = {}
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+
+ for c in root.findall('SAMPLE'):
+ d2 = {}
+ acc = c.find('./IDENTIFIERS/EXTERNAL_ID')
+ if acc != None:
+ d2['external_id'] = acc.text
+ else:
+ d2['external_id'] = '.'
+ d2['primary_id'] = c.get('accession')
+
+ desc = c.find('DESCRIPTION')
+ d2['description'] = 'None'
+ if desc != None and desc.text != None:
+ d2['description'] = desc.text
+
+ title = c.find('TITLE')
+ d2['title'] = 'None'
+ if title != None and title.text != None:
+ d2['title'] = title.text
+
+ tissue_type = ''
+ for i in c.findall('./SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE/VALUE'):
+ if i != None and i.text != None:
+ tissue_type += i.text + ' '
+ d2['tissue'] = tissue_type.strip()
+
+ run_id = ''
+ for i in c.findall('./SAMPLE_LINKS/SAMPLE_LINK/XREF_LINK/ID'):
+ s = i.text
+ if 'RR' in s:
+                run_id = s
+ break
+ lst = run_id.split(',')
+ for x in lst:
+ lst2 = x.split('-') # e.g., SRR520490-SRR520491
+ if len(lst2) == 1 and lst2[0] != '':
+ k = lst2[0]
+ d[k] = d2 # k is run id, such as SRR, ERR or DRR
+ elif len(lst2) == 2:
+ ss = lst2[0]
+ ee = lst2[1]
+ first_three_letters = ss[0:3]
+ sz = len(ss) - 3
+ ss_t = int(ss[3:])
+ ee_t = int(ee[3:])
+ for j in range(ss_t, ee_t+1, 1):
+ k = first_three_letters + str(j).zfill(sz)
+ d[k] = d2
+ return d
+
+
+def parse_experiment(fname):
+ d = {}
+
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+ for c in root.findall('EXPERIMENT'):
+ d2 = {}
+ d2['primary_id'] = c.get('accession')
+
+ title = c.find('TITLE')
+ d2['title'] = 'None'
+ if title != None and title.text != None:
+ d2['title'] = title.text
+
+ desc = c.find('./DESIGN/DESIGN_DESCRIPTION')
+ d2['description'] = 'None'
+ if desc != None and desc.text != None:
+ d2['description'] = desc.text
+
+ run_id = ''
+ for i in c.findall('./EXPERIMENT_LINKS/EXPERIMENT_LINK/XREF_LINK/ID'):
+ s = i.text
+ if 'RR' in s:
+                run_id = s
+ break
+ lst = run_id.split(',')
+ for x in lst:
+ lst2 = x.split('-') # e.g., SRR520490-SRR520491
+ if len(lst2) == 1 and lst2[0] != '':
+ k = lst2[0]
+ d[k] = d2 # k is run id, such as SRR, ERR or DRR
+ elif len(lst2) == 2:
+ ss = lst2[0]
+ ee = lst2[1]
+ first_three_letters = ss[0:3]
+ sz = len(ss) - 3
+ ss_t = int(ss[3:])
+ ee_t = int(ee[3:])
+ for j in range(ss_t, ee_t+1, 1):
+ k = first_three_letters + str(j).zfill(sz)
+ d[k] = d2
+ return d
+
+
+def get_singular_form(w):
+    d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petal', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovules':'ovule', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum'}
+ if w in d:
+ return d[w]
+ return w
+
+def get_tissue(s):
+ ''' Extract tissue name from s. s may contain several tissue names, return them ordered by frequency. '''
+
+
+ lst = ['seedling', 'seedlings', 'root', 'roots', 'leaves', 'leaf', 'flower', 'flowers', 'floral', 'shoot', 'shoots', 'apex', 'apices', 'stamen', 'stem', 'stems', 'seed', 'seeds', 'petal', 'petals', 'sepal', 'sepals', 'embryo', 'embryos', 'embryonic', 'cotyledon', 'cotyledons', 'xylem', 'hair', 'hairs', 'phloem', 'pericycle', 'primordia', 'columella', 'cortex', 'meristem', 'meristems', 'cambium', 'epidermis', 'epidermal', 'phloem', 'mesophyll', 'apical', 'lateral', 'intercalary', 'parenchyma', 'collenchyma', 'sclerenchyma', 'bud', 'buds', 'endosperm', 'colletotrichum', 'stele', 'vacuoles', 'vacuole', 'vacuolar', 'tip', 'tips', 'pollen', 'hypocotyl', 'hypocotyls', 'tube', 'tubes', 'basal', 'stomatal', 'stomata', 'surface', 'progeny', 'ovules', 'carpel', 'carpels', 'gynoecium', 'pistil', 'pistils', 'anthers', 'anther', 'endodermis', 'dicotyledonous', 'hyphae', 'adabaxial', 'axial', 'cauline', 'rosette', 'pedicle', 'pedicel', 'inflorescence', 'petiole', 'lamina', 'vascular', 'bundle', 'sheath'] # possible tissue names, lower case. refer to /home/hui/network/test/rnaseq.word.count.txt for distinct words in rna seq. rnaseq.word.count.txt is generated by /home/hui/network/test/count_word.py
+
+ # build a count dictionary, where key is a word
+ d = {}
+ s = s.lower()
+    wlst = re.sub(r"[^\w]", " ", s).split() # a list of words in s. http://stackoverflow.com/questions/6181763/converting-a-string-to-a-list-of-words
+ for w in wlst:
+ if w in lst:
+ w2 = get_singular_form(w)
+ if not w2 in d:
+ d[w2] = 1
+ else:
+ d[w2] += 1
+ if len(d) == 0:
+ return 'unknown'
+
+ tlst = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
+ result = ''
+ for t in tlst:
+ result += '%s(%d);' % (t[0], t[1])
+ return result.rstrip(';')
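+
+# Example (hypothetical description): get_tissue('Root tips and root hairs of seedlings')
+# returns 'root(2);tip(1);hair(1);seedling(1)' -- singular forms ordered by count
+# (ties keep first-seen order because sorted() is stable).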
+
+
+## main
+
+# ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, EXPERIMENT.
+# So each of the following functions handles one type of xml file.
+d_sample = parse_sample('../Data/information/ena_sample.xml') # SAMPLE. We can download RUN, STUDY, EXPERIMENT using download_ena_metadata.py, but not SAMPLE (weird). So we need to manually download ena_sample.xml. Enter http://www.ebi.ac.uk/ena/data/search?query=arabidopsis%20thaliana, click Sample (31,042) in the left panel of the displayed page, then click the XML link in the right panel. The XML link is easy to miss, so take time to find it or search the page for XML.
+d_run = parse_run('../Data/information/ena_rnaseq_read_run.xml') # RUN
+d_run2 = parse_SraRunTable('../Data/information/SraRunTable_Ath_Tax3702.txt') # Go to https://www.ncbi.nlm.nih.gov/sra. Type (arabidopsis thaliana) AND "Arabidopsis thaliana"[orgn:__txid3702]. Click "Send results to Run selector". Click "RunInfo Table". Save SraRunTable.txt
+d_study = parse_study('../Data/information/ena_rnaseq_read_study.xml') # STUDY
+d_experiment = parse_experiment('../Data/information/ena_rnaseq_read_experiment.xml') # EXPERIMENT
+
+
+cmd = 'export PYTHONIOENCODING=UTF-8' # xml files contain non-ascii characters; note that os.system runs this in a child shell, so it does not affect the current process -- export it in your shell before running (see the NOTE at the top)
+os.system(cmd)
+
+print('%s' % ('\t'.join(['run_id', 'sample_id', 'experiment_id', 'study_id', 'study_id_PRJ', 'title', 'alias', 'description']))) # description comes from three sources, STUDY, SAMPLE and EXPERIMENT
+d_run_keys = list(d_run.keys())
+d_run_keys.extend(list(d_run2.keys()))
+d_run_keys = list(set(d_run_keys))
+for k in sorted(d_run_keys):
+ lst = []
+ lst.append(k)
+ if k in d_run:
+ if k in d_sample:
+ if d_sample[k]['external_id'] != '.':
+ lst.append(d_sample[k]['external_id'] + '...' + d_sample[k]['tissue'])
+ else:
+ lst.append(d_sample[k]['primary_id'] + '...' + d_sample[k]['tissue'])
+ else:
+ lst.append('.')
+ lst.append( d_run[k]['experiment_id'])
+ lst.append( d_run[k]['study_id'] )
+ if k in d_study:
+ lst.append( d_study[k]['primary_id'] )
+ else:
+ lst.append( '.' )
+ lst.append( d_run[k]['title'] )
+ lst.append( d_run[k]['alias'] )
+
+ s = '' # description string
+
+ if k in d_study:
+ s += ' <br><br>[Study title] ' + d_study[k]['title'] + ' <br><br>[Study description] ' + d_study[k]['description'] # <br> is used for breaking lines in html
+
+ if k in d_sample:
+ s += ' <br><br>[Sample title] ' + d_sample[k]['title'] + ' <br><br>[Sample description] ' + d_sample[k]['description']
+
+ if k in d_experiment:
+ s += ' <br><br>[Experiment title] ' + d_experiment[k]['title'] + ' <br><br>[Experiment description] ' + d_experiment[k]['description']
+
+ if s == '':
+ s = '.'
+
+ lst.append(s)
+ elif k in d_run2:
+ lst.append(d_run2[k]['sample_id'])
+ lst.append(d_run2[k]['experiment_id'])
+ lst.append(d_run2[k]['study_id'])
+ lst.append(d_run2[k]['study_id_PRJ'])
+ lst.append(d_run2[k]['title'])
+ lst.append(d_run2[k]['alias'])
+ lst.append('.')
+
+ print('%s' % ('\t'.join(lst)))
+
+# make a json file as well. this file is used to display rna-seq information in scatterplots.
+json_dict = {}
+for k in sorted(d_run_keys):
+ if k in d_run:
+ s = 'Title: ' + d_run[k]['title'] + '. Alias: ' + d_run[k]['alias'] + '. More info:'
+ if k in d_study:
+ s += ' ' + d_study[k]['title'] + ' ' + d_study[k]['description']
+ if k in d_sample:
+ s += ' ' + d_sample[k]['title'] + ' ' + d_sample[k]['description']
+ if k in d_experiment:
+ s += ' ' + d_experiment[k]['title'] + ' ' + d_experiment[k]['description']
+
+ s = s.strip()
+ d = {}
+ d['tissue'] = get_tissue(s)
+ d['detail'] = s[0:min(MAX_DESCRIPTION_LENGTH, len(s))] + ' ...'
+
+ elif k in d_run2:
+ s = d_run2[k]['title'] + ' ' + d_run2[k]['alias']
+ s = s.strip()
+ d = {}
+ d['tissue'] = get_tissue(s)
+ d['detail'] = s[0:min(MAX_DESCRIPTION_LENGTH, len(s))] + ' ...'
+
+ json_dict[k] = d
+
+fname = '../Data/information/rnaseq_info_database.json'
+with open(fname, 'w') as f:
+ json.dump(json_dict, f, indent=4)
+
+#sys.stderr.write('Check %s. Use this file to display RNA-seq information in the scatterplots. Copy it to Data/information and rename it to rnaseq_info_database.json.\n' % (fname))
diff --git a/Code/parse_ena_xml_test.py b/Code/parse_ena_xml_test.py
new file mode 100644
index 0000000..c12c580
--- /dev/null
+++ b/Code/parse_ena_xml_test.py
@@ -0,0 +1,307 @@
+# Usage: python parse_ena_xml_test.py > rnaseq_info_database.txt
+#
+# Search in this script for 'd_run', 'd_sample', 'd_experiment' and
+# 'd_study', and set their input files. The input files are generated
+# by download_ena_metadata.py (except for d_sample). It also
+# generates a json file called rnaseq_info_database.json in the
+# current directory, for displaying experimental information in the
+# scatterplot. Move it to Data/information. Also move
+# rnaseq_info_database.txt (the redirected stdout) to
+# Data/information. They are used by
+# html_network.py.
+#
+# Purpose: Get description for RNA-seq data, one for each SRA Run ID.
+# Make rnaseq_info_database.txt and rnaseq_info_database.json. Each
+# line in rnaseq_info_database.txt contains information for a run id.
+#
+# NOTE: you might encounter UnicodeEncodeError when running the
+# program. To avoid that, first type this command:
+# export PYTHONIOENCODING=UTF-8.
+#
+# 22 Feb 2017, slcu, hui
+# 12 Apr 2017, slcu, hui
+# 20 Apr 2017, slcu, hui
+# 30 May 2017, slcu, hui
+# 01 Jun 2017, slcu, hui [added a column sample_id]
+# 19 Jun 2017, slcu, hui [added SraRunTable_Ath_Tax3702.txt in d_run2. Search d_run2 for how to get SraRunTable_Ath_Tax3702.txt.]
+
+import os, json, re, operator
+import xml.etree.ElementTree
+import sys
+
+MAX_DESCRIPTION_LENGTH = 600 # max number of characters to keep in json file
+
+def parse_SraRunTable(fname):
+ d = {}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if not line.startswith('#') and not line.startswith('Assay_Type_s') and line.lower().startswith('rna-seq'):
+ lst = line.split('\t')
+ acc = lst[17]
+ if not acc in d:
+ d[acc] = {}
+ d[acc]['experiment_id'] = lst[6] if lst[6] != '' else '.'
+ d[acc]['sample_id'] = (lst[4] + '...' + lst[18] + ' ' + lst[20]) if lst[4] != '' else '.'
+ d[acc]['study_id'] = lst[19] if lst[19] != '' else '.'
+ d[acc]['study_id_PRJ'] = lst[3] if lst[3] != '' else '.'
+ d[acc]['alias'] = lst[11] if lst[11] != '' else '.'
+ d[acc]['title'] = lst[20] if lst[20] != '' else '.'
+ return d
+
+def parse_run(fname):
+ d = {}
+
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+ for c in root.findall('RUN'):
+ acc = c.get('accession')
+ d[acc] = {}
+
+ alias = c.get('alias')
+ d[acc]['alias'] = alias
+
+ experiment = c.find('EXPERIMENT_REF').get('accession')
+ d[acc]['experiment_id'] = experiment
+
+ title = c.find('TITLE').text
+ d[acc]['title'] = title
+
+ d[acc]['study_id'] = '.'
+ for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+ s = i.text
+ #print(s)
+ if 'RP' in s: # run project
+ d[acc]['study_id'] = s
+ break
+ d[acc]['sample_id'] = '.'
+ for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+ s = i.text
+            if 'RS' in s: # sample accession
+ d[acc]['sample_id'] = s
+ break
+
+ return d
+
+
+def parse_study(fname):
+ d = {}
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+
+ for c in root.findall('PROJECT'):
+ d2 = {}
+ acc = c.find('./IDENTIFIERS/SECONDARY_ID')
+ if acc != None:
+ d2['secondary_id'] = acc.text
+ else:
+ d2['secondary_id'] = '.'
+ d2['primary_id'] = c.get('accession')
+
+ desc = c.find('DESCRIPTION')
+ d2['description'] = 'None'
+ if desc != None:
+ d2['description'] = desc.text
+
+ title = c.find('TITLE')
+ d2['title'] = 'None'
+ if title != None:
+ d2['title'] = title.text
+
+ run_id = ''
+ for i in c.findall('./PROJECT_LINKS/PROJECT_LINK/XREF_LINK/ID'):
+ s = i.text
+ if 'RR' in s:
+                run_id = s
+ break
+ lst = run_id.split(',')
+ for x in lst:
+ lst2 = x.split('-')
+ if len(lst2) == 1 and lst2[0] != '':
+ k = lst2[0]
+ d[k] = d2 # k is run id, such as SRR, ERR or DRR
+ elif len(lst2) == 2:
+ ss = lst2[0]
+ ee = lst2[1]
+ first_three_letters = ss[0:3]
+ sz = len(ss) - 3
+ ss_t = int(ss[3:])
+ ee_t = int(ee[3:])
+ for j in range(ss_t, ee_t+1, 1):
+ k = first_three_letters + str(j).zfill(sz)
+ d[k] = d2
+ return d
+
+
+def parse_sample(fname):
+ d = {}
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+
+ for c in root.findall('SAMPLE'):
+ d2 = {}
+ acc = c.find('./IDENTIFIERS/EXTERNAL_ID')
+ if acc != None:
+ d2['external_id'] = acc.text
+ else:
+ d2['external_id'] = '.'
+ d2['primary_id'] = c.get('accession')
+
+ desc = c.find('DESCRIPTION')
+ d2['description'] = 'None'
+ if desc != None and desc.text != None:
+ d2['description'] = desc.text
+
+ title = c.find('TITLE')
+ d2['title'] = 'None'
+ if title != None and title.text != None:
+ d2['title'] = title.text
+
+ tissue_type = ''
+ for i in c.findall('./SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE/VALUE'):
+ if i != None and i.text != None:
+ tissue_type += i.text + ' '
+ d2['tissue'] = tissue_type.strip()
+
+ run_id = ''
+ for i in c.findall('./SAMPLE_LINKS/SAMPLE_LINK/XREF_LINK/ID'):
+ s = i.text
+ if 'RR' in s:
+                run_id = s
+ break
+ lst = run_id.split(',')
+ for x in lst:
+ lst2 = x.split('-') # e.g., SRR520490-SRR520491
+ if len(lst2) == 1 and lst2[0] != '':
+ k = lst2[0]
+ d[k] = d2 # k is run id, such as SRR, ERR or DRR
+ elif len(lst2) == 2:
+ ss = lst2[0]
+ ee = lst2[1]
+ first_three_letters = ss[0:3]
+ sz = len(ss) - 3
+ ss_t = int(ss[3:])
+ ee_t = int(ee[3:])
+ for j in range(ss_t, ee_t+1, 1):
+ k = first_three_letters + str(j).zfill(sz)
+ d[k] = d2
+ return d
+
+
+def parse_experiment(fname):
+ d = {}
+
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+ for c in root.findall('EXPERIMENT'):
+ d2 = {}
+ d2['primary_id'] = c.get('accession')
+
+ title = c.find('TITLE')
+ d2['title'] = 'None'
+ if title != None and title.text != None:
+ d2['title'] = title.text
+
+ desc = c.find('./DESIGN/DESIGN_DESCRIPTION')
+ d2['description'] = 'None'
+ if desc != None and desc.text != None:
+ d2['description'] = desc.text
+
+ run_id = ''
+ for i in c.findall('./EXPERIMENT_LINKS/EXPERIMENT_LINK/XREF_LINK/ID'):
+ s = i.text
+ if 'RR' in s:
+                run_id = s
+ break
+ lst = run_id.split(',')
+ for x in lst:
+ lst2 = x.split('-') # e.g., SRR520490-SRR520491
+ if len(lst2) == 1 and lst2[0] != '':
+ k = lst2[0]
+ d[k] = d2 # k is run id, such as SRR, ERR or DRR
+ elif len(lst2) == 2:
+ ss = lst2[0]
+ ee = lst2[1]
+ first_three_letters = ss[0:3]
+ sz = len(ss) - 3
+ ss_t = int(ss[3:])
+ ee_t = int(ee[3:])
+ for j in range(ss_t, ee_t+1, 1):
+ k = first_three_letters + str(j).zfill(sz)
+ d[k] = d2
+ return d
+
+
+def get_singular_form(w):
+    d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petal', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovules':'ovule', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum'}
+ if w in d:
+ return d[w]
+ return w
+
+def get_tissue(s):
+ ''' Extract tissue name from s. s may contain several tissue names, return them ordered by frequency. '''
+
+
+ lst = ['seedling', 'seedlings', 'root', 'roots', 'leaves', 'leaf', 'flower', 'flowers', 'floral', 'shoot', 'shoots', 'apex', 'apices', 'stamen', 'stem', 'stems', 'seed', 'seeds', 'petal', 'petals', 'sepal', 'sepals', 'embryo', 'embryos', 'embryonic', 'cotyledon', 'cotyledons', 'xylem', 'hair', 'hairs', 'phloem', 'pericycle', 'primordia', 'columella', 'cortex', 'meristem', 'meristems', 'cambium', 'epidermis', 'epidermal', 'phloem', 'mesophyll', 'apical', 'lateral', 'intercalary', 'parenchyma', 'collenchyma', 'sclerenchyma', 'bud', 'buds', 'endosperm', 'colletotrichum', 'stele', 'vacuoles', 'vacuole', 'vacuolar', 'tip', 'tips', 'pollen', 'hypocotyl', 'hypocotyls', 'tube', 'tubes', 'basal', 'stomatal', 'stomata', 'surface', 'progeny', 'ovules', 'carpel', 'carpels', 'gynoecium', 'pistil', 'pistils', 'anthers', 'anther', 'endodermis', 'dicotyledonous', 'hyphae', 'adabaxial', 'axial', 'cauline', 'rosette', 'pedicle', 'pedicel', 'inflorescence', 'petiole', 'lamina', 'vascular', 'bundle', 'sheath'] # possible tissue names, lower case. refer to /home/hui/network/test/rnaseq.word.count.txt for distinct words in rna seq. rnaseq.word.count.txt is generated by /home/hui/network/test/count_word.py
+
+ # build a count dictionary, where key is a word
+ d = {}
+ s = s.lower()
+    wlst = re.sub(r"[^\w]", " ", s).split() # a list of words in s. http://stackoverflow.com/questions/6181763/converting-a-string-to-a-list-of-words
+ for w in wlst:
+ if w in lst:
+ w2 = get_singular_form(w)
+ if not w2 in d:
+ d[w2] = 1
+ else:
+ d[w2] += 1
+ if len(d) == 0:
+ return 'unknown'
+
+ tlst = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
+ result = ''
+ for t in tlst:
+ result += '%s(%d);' % (t[0], t[1])
+ return result.rstrip(';')
+
+
+## main
+
+# ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, EXPERIMENT.
+# Each parse function handles one type of xml file; only RUN is parsed in this test script.
+d_run = parse_run('ena_rnaseq_read_run.xml') # RUN
+
+cmd = 'export PYTHONIOENCODING=UTF-8' # xml files contain non-ascii characters; note that os.system runs this in a child shell, so it does not affect the current process -- export it in your shell before running (see the NOTE at the top)
+os.system(cmd)
+
+d_run_keys = d_run.keys()
+d_run_keys = list(set(d_run_keys))
+for k in sorted(d_run_keys):
+ lst = []
+ lst.append(k)
+ if k in d_run and 'illumina hiseq' in d_run[k]['title'].lower() and 'rna-seq' in d_run[k]['title'].lower():
+ lst.append( d_run[k]['experiment_id'])
+ lst.append( d_run[k]['study_id'] )
+ lst.append( d_run[k]['title'] )
+ lst.append( d_run[k]['alias'] )
+ print('\t'.join(lst))
+
+# make a json file as well. this file is used to display rna-seq information in scatterplots.
+json_dict = {}
+for k in sorted(d_run_keys):
+ if k in d_run and 'illumina hiseq' in d_run[k]['title'].lower() and 'rna-seq' in d_run[k]['title'].lower():
+ s = 'Title: ' + d_run[k]['title'] + '. Alias: ' + d_run[k]['alias'] + '. More info:'
+ s = s.strip()
+ d = {}
+ d['tissue'] = get_tissue(s)
+ d['detail'] = s[0:min(MAX_DESCRIPTION_LENGTH, len(s))] + ' ...'
+
+ json_dict[k] = d
+
+fname = 'rnaseq_info_database.json'
+with open(fname, 'w') as f:
+ json.dump(json_dict, f, indent=4)
+
+#sys.stderr.write('Check %s. Use this file to display RNA-seq information in the scatterplots. Copy it to Data/information and rename it to rnaseq_info_database.json.\n' % (fname))
diff --git a/Code/prepare_gene_file.py b/Code/prepare_gene_file.py
new file mode 100644
index 0000000..febcbef
--- /dev/null
+++ b/Code/prepare_gene_file.py
@@ -0,0 +1,79 @@
+# Usage: python prepare_gene_file.py all-ath-gene-position.txt > gene_file.txt
+# all-ath-gene-position.txt contains all gene IDs in the genome, together with their positions, orientation, etc., in BED format.
+# See ../Data/information/all-ath-gene-position.txt
+# Purpose: get gene name and gene annotation
+# 2 JAN 2017 hui SLCU
+
+import sys
+import os
+
+###################################################################################
+GENE_DESCRIPTION = '../Data/information/gene_description_20140101.txt'
+AGI_TO_GENE_NAMES = '../Data/information/AGI-to-gene-names.txt'
+###################################################################################
+
+
+def get_description(x, d):
+ result = ''
+ if x in d:
+ result = '\t' + d[x]
+ else:
+ result = '\tNot Found'
+ return result
+
+
+def make_AGI_to_gene_name_dict(fname):
+ d = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ agi = lst[0]
+ name = lst[1]
+ if not agi in d:
+ d[agi] = name
+ else:
+ d[agi] += ';' + name
+ f.close()
+ return d
+
+
+## main
+
+# make a dictionary of gene description
+f0 = open(GENE_DESCRIPTION)
+d = {}
+for line in f0:
+ line = line.strip()
+ lst = line.split('\t')
+ id = lst[0]
+ id = id[0:9] # AGI id, omit .1, .2, .3, etc
+ s = '\t'.join(lst[1:])
+ if not id in d:
+ d[id] = s
+ else:
+ d[id] += '\t' + s
+
+f0.close()
+
+agi2genename_dict = make_AGI_to_gene_name_dict(AGI_TO_GENE_NAMES)
+
+locus_file = sys.argv[1] # location of genes
+f = open(locus_file) # see all-ath-gene-position.txt
+for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ x = lst[3]
+ c = lst[0]
+ ss = lst[1]
+ ee = lst[2]
+ strand = lst[5]
+ result = [x]
+ if x in agi2genename_dict and not x == agi2genename_dict[x]:
+ result.append(agi2genename_dict[x])
+ else:
+ result.append(' ') # if no gene name, use a space
+ result.extend([c, ss, ee, strand, get_description(x, d)])
+ print('\t'.join(result))
+
+f.close()
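+
+# Illustrative line flow (hypothetical BED line; AT1G01010 shown with its TAIR
+# coordinates, assuming AGI-to-gene-names.txt maps it to NAC001):
+#   1<TAB>3631<TAB>5899<TAB>AT1G01010<TAB>0<TAB>+
+# becomes
+#   AT1G01010<TAB>NAC001<TAB>1<TAB>3631<TAB>5899<TAB>+<TAB><TAB>description...
+# (get_description prepends its own TAB, hence the doubled TAB before the description).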
diff --git a/Code/process_3way_interaction.py b/Code/process_3way_interaction.py
new file mode 100644
index 0000000..0be9b59
--- /dev/null
+++ b/Code/process_3way_interaction.py
@@ -0,0 +1,48 @@
+# Purpose: convert results from three-way interaction analysis to edge format.
+# Usage: python process_3way_interaction.py output20160911.txt > edges.txt.interaction.wiggelab.timecourse
+# Created on 9 Aug 2019 by Hui Lan <lanhui@zjnu.edu.cn>
+
+from geneid2name import make_gene_name_AGI_map_dict, get_gene_name
+
+def get_2tf_1target_1score(s):
+ '''
+ s looks like 'AT1G73870_AT1G73870, AT5G10570_AT5G10570, AT2G05100_LHCB2.1 19.287 | 0.843 0.998 0.155 | -0.915 0.924 1.839 | 0.918 -0.419'
+ '''
+ lst = s.split()
+ tf1 = lst[0].split('_')[0]
+ tf2 = lst[1].split('_')[0]
+ target = lst[2].split('_')[0]
+ score = lst[3]
+ return (tf1, tf2, target, score)
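+
+# Example: for the docstring line above this returns
+#   ('AT1G73870', 'AT5G10570', 'AT2G05100', '19.287')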
+
+
+## main
+import sys
+from datetime import datetime
+
+f = open(sys.argv[1])
+lines = f.readlines()
+f.close()
+
+
+agi2name_dict = make_gene_name_AGI_map_dict('../Data/information/AGI-to-gene-names_v2.txt')
+
+result = ''
+for line in lines:
+ line = line.strip()
+ tf1, tf2, target, interaction_score_str = get_2tf_1target_1score(line)
+ target_str = target + ' ' + get_gene_name(target, agi2name_dict)
+ tf1_str = tf1 + ' ' + get_gene_name(tf1, agi2name_dict)
+ tf2_str = tf2 + ' ' + get_gene_name(tf2, agi2name_dict)
+ score_str = '0.5'
+ cond_str = '.'
+ curr_date = datetime.now().strftime('%Y%m%d')
+ method_or_tissue = 'interact.with.%s' % (tf2 + '(' + get_gene_name(tf2, agi2name_dict) + ')')
+ s = '\t'.join([target_str, tf1_str, score_str, 'mix', '15', cond_str, '.', curr_date, interaction_score_str.replace('-',''), method_or_tissue])
+ result += s + '\n'
+ method_or_tissue = 'interact.with.%s' % (tf1 + '(' + get_gene_name(tf1, agi2name_dict) + ')')
+ s = '\t'.join([target_str, tf2_str, score_str, 'mix', '15', cond_str, '.', curr_date, interaction_score_str.replace('-',''), method_or_tissue])
+ result += s + '\n'
+
+
+print(result)
diff --git a/Code/process_3way_interaction2.py b/Code/process_3way_interaction2.py
new file mode 100644
index 0000000..7ce26b3
--- /dev/null
+++ b/Code/process_3way_interaction2.py
@@ -0,0 +1,58 @@
+# Purpose: convert results from three-way interaction analysis to edge format.
+# Usage: python process_3way_interaction2.py ../Data/information/summary.txt > edges.txt.interaction.seeddata
+# Created on 9 Aug 2019 by Hui Lan <lanhui@zjnu.edu.cn>
+
+from geneid2name import make_gene_name_AGI_map_dict, get_gene_name
+
+def get_2tf_1target_1score(s):
+ '''
+ s looks like '398: ( AT3G10490;AT3G10480, AT1G03970, AT5G20910 ) 10.41 2'
+ '''
+
+ lst = s.split()
+ tf1_all = lst[2]
+ tf2_all = lst[3]
+ target = lst[4]
+ score = lst[6]
+ result = []
+ for tf1 in tf1_all.split(';'):
+ tf1 = tf1.replace(',', '')
+ for tf2 in tf2_all.split(';'):
+ tf2 = tf2.replace(',', '')
+ result.append((tf1, tf2, target, score))
+
+ return result
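+
+# Example: the docstring line above expands the ';'-grouped TF1 IDs into
+#   [('AT3G10490', 'AT1G03970', 'AT5G20910', '10.41'),
+#    ('AT3G10480', 'AT1G03970', 'AT5G20910', '10.41')]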
+
+## main
+import sys
+from datetime import datetime
+
+f = open(sys.argv[1])
+lines = f.readlines()
+f.close()
+
+
+agi2name_dict = make_gene_name_AGI_map_dict('../Data/information/AGI-to-gene-names_v2.txt')
+
+result = ''
+for line in lines[2:]:
+ line = line.strip()
+ lst = get_2tf_1target_1score(line)
+ for t in lst:
+ tf1, tf2, target, interaction_score_str = t
+ if tf1.startswith('AT') and tf2.startswith('AT') and target.startswith('AT'):
+ target_str = target + ' ' + get_gene_name(target, agi2name_dict)
+ tf1_str = tf1 + ' ' + get_gene_name(tf1, agi2name_dict)
+ tf2_str = tf2 + ' ' + get_gene_name(tf2, agi2name_dict)
+ score_str = '0.6'
+ cond_str = '.'
+ curr_date = datetime.now().strftime('%Y%m%d')
+ method_or_tissue = 'interact.with.%s' % (tf2 + '(' + get_gene_name(tf2, agi2name_dict) + ')')
+ s = '\t'.join([target_str, tf1_str, score_str, 'mix', '138', cond_str, '.', curr_date, interaction_score_str.replace('-',''), method_or_tissue])
+ result += s + '\n'
+ method_or_tissue = 'interact.with.%s' % (tf1 + '(' + get_gene_name(tf1, agi2name_dict) + ')')
+ s = '\t'.join([target_str, tf2_str, score_str, 'mix', '138', cond_str, '.', curr_date, interaction_score_str.replace('-',''), method_or_tissue])
+ result += s + '\n'
+
+
+print(result)
diff --git a/Code/refine_tissue.py b/Code/refine_tissue.py
new file mode 100644
index 0000000..8bc111c
--- /dev/null
+++ b/Code/refine_tissue.py
@@ -0,0 +1,302 @@
+# Usage: python refine_tissue.py > ../Data/information/experiment.and.tissue.2.txt
+# Set cmd (in main) if the input file ../Data/temp/experiment.and.tissue.1.txt lives elsewhere.
+#
+# Purpose: for each RNA-seq run in experiment.and.tissue.1.txt, add a column suggested.tissue as its tissue annotation.
+#
+# 2 June 2017, slcu, hui
+# Last modified 19 June 2017, slcu, hui
+
+import os, sys, operator
+import string
+
+
+def get_singular_form(w):
+    d = {'seedlings':'seedling', 'roots':'root', 'leaves':'leaf', 'flowers':'flower', 'floral':'flower', 'shoots':'shoot', 'apices':'apex', 'stems':'stem', 'seeds':'seed', 'petals':'petal', 'sepals':'sepal', 'embryos':'embryo', 'embryonic':'embryo', 'cotyledons':'cotyledon', 'hairs':'hair', 'meristems':'meristem', 'epidermal':'epidermis', 'apical':'apex', 'buds':'bud', 'vacuoles':'vacuole', 'vacuolar':'vacuole', 'tips':'tip', 'pollens':'pollen', 'hypocotyls':'hypocotyl', 'tubes':'tube', 'stomatal':'stomata', 'ovules':'ovule', 'pistils':'pistil', 'anthers':'anther', 'carpels':'carpel', 'pedicle':'pedicel', 'vascular':'vasculum', 'whole plant':'seedling', 'inflorescence':'flower.inflorescence', 'inflorescences':'flower.inflorescence', 'whole seedling':'seedling', 'whole rosette':'leaf.rosette', 'whole aerial seedling':'seedling.aerial', 'vegatative shoot apical meristem':'shoot.apical.meristem', 'inflorescence containing stage 8 and younger flowers':'flower.inflorescence', 'plant roots':'root', 'entire vegetative rosette':'leaf.rosette', 'fungal-colonized plant roots':'root.fungal.colonized', 'rosettes - 5 leaves stage':'leaf.rosette', '2-week old seedlings without roots':'seedling.no.roots', 'immature inflorescence':'inflorescence.immature', 'rosette leaves':'leaf.rosette', 'plant seedling':'seedling', 'entire aerial part':'aerial.tissue', '14-d-old entire seedlings':'seedling', 'rosette leaf':'leaf.rosette', 'whole seedlings':'seedling', 'etiolated 5d-old seedlings':'seedling.etiolated', 'root structure':'root', 'mature leaves':'leaf.mature', 'root tip':'root.tip', '10d-old seedling':'seedling','primary root tip':'root.tip',
+ 'epidermis including guard cells':'epidermis',
+ 'root tip tissue':'root.tip',
+ 'anther stage 4-7':'flower.anther',
+ 'anther':'flower.anther',
+ 'embryo':'seed.embryo',
+ 'etiolated seedlings':'seedling.etiolated',
+ '21 days-old seedlings':'seedling',
+ 'aerial tissue':'aerial.tissue',
+ 'endosperm':'seed.endosperm',
+ 'whole seed':'seed',
+ 'pistils pollinated for 8 hours':'flower.pistil.pollinated',
+ 'primary root':'root',
+ 'whole floral bud':'flower.bud',
+ 'whole seedling root':'seedling.root',
+ 'whole root':'seedling.root',
+ 'whole plants':'seedling',
+ 'aerial shoots':'shoot',
+ 'flower bud':'flower.bud',
+ 'aerial seedling':'seedling.aerial',
+ 'anthers at stage 4-7':'flower.anther',
+ 'carpels (collected manually from 15 developing inflorescences)':'flower.carpel',
+ 'ath_shoot_meristem_1':'shoot.meristem',
+ 'ath_whole_plant_1':'seedling',
+ 'ath_whole_plant_2':'seedling',
+ 'whole seeds':'seed',
+ '3-day-old root':'root',
+ 'unopened flower buds':'flower.bud',
+ 'first true leaf':'leaf',
+ '7 dag seedlings':'seedling',
+ 'facs-sorted protoplasts from aerial tissue of 10-day old seedlings':'seedling.protoplasts',
+ 'root tip':'root.tip',
+ 'inflorescences and siliques':'inflorescences.and.siliques',
+ 'Epidermis including guard cells epidermis including guard cells':'leaf.stomata.epidermis',
+ 'base stem':'stem',
+ 'siliques':'silique',
+ 'whole organism':'seedling',
+ 'seedling shoot':'seedling.shoot',
+ '10-day-old seedlings and inflorescences from 25-day-old plants':'seedling.and.inflorescence',
+ 'shoot apical meristem':'shoot.apical.meristem',
+ 'expanded mature leaves from 28 day old plants':'leaf',
+ 'aerial tissues of 15 day seedlings': 'aerial.tissue',
+ 'whole parts':'seedling',
+ 'aerial organs':'aerial.tissue',
+ 'lower stem':'stem',
+ 'upper stem':'stem',
+ 'rosette':'leaf.rosette',
+ 'root and shoot':'root.and.shoot',
+ 'cell culture':'cell.culture',
+ 'aerial part':'aerial.tissue',
+ 'aerial':'aerial.tissue',
+ 'whole plantlet without root':'seedling',
+ 'sorted endodermis (facs)':'endodermis.facs-sorted',
+                 'whole root':'root', # note: this later entry overrides 'whole root':'seedling.root' above
+ 'siluge without seeds':'seed',
+ 'first internode':'stem',
+ 'rosettes':'leaf.rosette',
+ 'hypocotyl':'seedling.hypocotyl',
+ 'somatic embryo':'seed.embryo'
+ }
+ if w in d:
+ return d[w]
+ return w
+
+def remove_parenthese(s):
+ if '(' in s:
+ return s[:s.find('(')]
+ return s
+
+
+
+def make_singular(lst):
+ result = []
+ # map plural to singular
+ d = {'roots':'root', 'shoots':'shoot',
+ 'leaves':'leaf', 'flowers':'flower',
+ 'anthers':'anther', 'hairs':'hair',
+ 'seedlings':'seedling', 'apices':'apex',
+ 'buds':'bud', 'siliques':'silique',
+ 'rosettes':'rosette', 'meristems':'meristem',
+ 'sepals':'sepal', 'petals':'petal',
+ 'inflorescences':'inflorescence', 'carpels':'carpel',
+ 'seeds':'seed', 'pistils':'pistil',
+ 'stamens':'stamen', 'ovules':'ovule',
+ 'tissues':'tissue', 'ovaries':'ovary',
+ 'veins':'vein', 'nodes':'node',
+ 'internodes':'internode', 'fibres':'fibre',
+ 'hypocotyls':'hypocotyl', 'cotyledons':'cotyledon',
+ 'plants':'plant', 'embryos':'embryo'}
+
+ for x in lst:
+ if x in d:
+ result.append(d[x])
+ else:
+ result.append(x)
+ return result
+
+def map_tissue(s):
+ ''' given a string s, if all words in a key of d are in s, then the corresponding value is a likely tissue. '''
+ d = {
+ 'hypocotyl':'seedling.hypocotyl',
+ 'hypocotyl seedling':'seedling.hypocotyl',
+ 'leaf':'leaf',
+ 'leaf petiole':'leaf.petiole',
+ 'petiole':'leaf.petiole',
+ 'leaf blade':'leaf.blade',
+ 'leaf first true':'leaf',
+ 'leaf stomata':'leaf.stomata',
+ 'stomata':'leaf.stomata',
+ 'chlorophyll':'leaf.chlorophyll',
+ 'vein':'leaf.vein',
+ 'leaf vein':'leaf.vein',
+ 'leaf lamina':'leaf.lamina',
+        'leaf rosette':'leaf.rosette',
+ 'rosette':'leaf.rosette',
+ 'rosette leaf':'leaf.rosette',
+ 'shoot':'shoot',
+ 'aerial shoot':'aerial.shoot',
+ 'shoot apex':'shoot.apex',
+ 'shoot tip':'shoot.apex',
+ 'flower':'flower',
+ 'flower petal':'flower.petal',
+ 'flower sepal':'flower.sepal',
+ 'flower stamen':'flower.stamen',
+ 'flower anther':'flower.anther',
+ 'flower carpel':'flower.carpel',
+ 'flower pistil':'flower.pistil',
+ 'flower inflorescence':'flower.inflorescence',
+ 'stigma':'flower.stigma',
+ 'filament':'flower.filament',
+ 'style':'flower.style',
+ 'anther':'flower.anther',
+ 'petal':'flower.petal',
+ 'sepal':'flower.sepal',
+ 'stamen':'flower.stamen',
+ 'carpel':'flower.carpel',
+ 'pistil':'flower.pistil',
+ 'ovary':'flower.ovary',
+ 'pedicel':'flower.pedicel',
+ 'ovule':'flower.ovule',
+ 'inflorescence':'flower.inflorescence',
+ 'seed':'seed',
+ 'epicotyl':'seed.epicotyl',
+ 'radicle':'seed.radicle',
+ 'embryo':'seed.embryo',
+ 'endosperm':'seed.endosperm',
+ 'endodermis':'endodermis',
+ 'stem':'stem',
+ 'pith':'pith',
+ 'protoxylem':'protoxylem',
+ 'xylem':'xylem',
+ 'phloem':'phloem',
+ 'sclerenchyma':'sclerenchyma',
+ 'bast fibre':'bast.fibre',
+ 'cortex':'cortex',
+ 'parenchyma':'parenchyma',
+ 'mesophyll':'leaf.mesophyll',
+ 'shoot apical meristem':'meristem.shoot.apical',
+        'root apical meristem':'meristem.root.apical',
+ 'apical meristem':'meristem.apical',
+ 'floral meristem':'meristem.floral',
+ 'inflorescence meristem':'meristem.inflorescence',
+ 'meristem':'meristem',
+ 'meristem shoot':'meristem.shoot',
+ 'cotyledon':'cotyledon',
+ 'apical':'apical',
+ 'basal':'basal',
+ 'root':'root',
+ 'root apex':'root.apex',
+ 'root tip':'root.tip',
+ 'root primary tip':'root.primary.tip',
+ 'root cap':'root.cap',
+ 'root lateral':'root.lateral',
+ 'root primary':'root.primary',
+ 'root hair':'root.hairs',
+ 'bud':'bud',
+ 'bud axillary':'bud.axillary',
+ 'bud lateral':'bud.axillary',
+ 'bud apical':'bud.apical',
+ 'bud floral':'bud.flower',
+ 'bud flower':'bud.flower',
+ 'bud meristem':'bud.meristem',
+ 'internode':'stem.internode',
+ 'node':'stem.node',
+ 'vascular':'vasculum',
+ 'epidermis':'epidermis',
+ 'seedling':'seedling',
+ 'plant':'seedling',
+ 'whole plant':'seedling',
+ 'whole':'seedling',
+ 'whole parts':'seedling',
+ 'whole root':'root',
+ 'seedling root':'seedling.root',
+ 'seedling shoot':'seedling.shoot',
+ 'seedling etiolated':'seedling.etiolated',
+ 'aerial':'aerial',
+ 'aerial tissue':'aerial.tissue',
+ 'aerial seedling':'seedling.aerial',
+ 'silique':'silique',
+ 'unknown':'unknown',
+ 'siluge':'seed',
+ 'bundle sheath':'leaf'
+ }
+ result = [] # a list of tuples, (tissue, word count)
+ s = s.lower()
+ slst = s.split()
+ slst2 = make_singular(slst)
+ for k in d: # search each key in d
+ klst = k.split()
+ count = 0
+ exact_count = 0
+ for x in klst:
+ count += slst2.count(x)
+ if x in slst2:
+ exact_count += 1
+ if count >= len(klst) and exact_count == len(klst):
+ result.append((d[k], count))
+ if result == []:
+ return 'unknown'
+ else:
+ sresult = sorted(result, key=operator.itemgetter(1), reverse=True)
+ return sresult[0][0]
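+
+# Example (hypothetical annotation): map_tissue('primary root tip') matches
+# 'root' (count 1), 'root tip' (2), 'root primary' (2) and 'root primary tip' (3);
+# the highest-count match wins, so 'root.primary.tip' is returned.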
+
+
+def repeat_words(s):
+ ''' s in the form of meristem(2) '''
+ s = s.strip()
+ index = s.find('(')
+ if index < 0:
+ return s
+ index2 = s.find(')')
+ word = s[:index]
+ n = s[(index+1):index2]
+ n = int(n)
+ return ' '.join(n*[word])
+
+def get_words(s):
+    ''' s in the form meristem(2);leaf(2);bud(1) or shoot.meristem '''
+    lst = s.split(';')
+    result = []
+    for x in lst:
+        index = x.find('(')
+        if index >= 0:
+            t = repeat_words(x)
+            result.append(t)
+        else:
+            t = x
+            if '.' in t:
+                for y in t.split('.'):
+                    result.append(y)
+            else:
+                result.append(t) # keep plain words; they were silently dropped before
+    return ' '.join(result)
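+
+# Examples: repeat_words('meristem(2)') -> 'meristem meristem';
+# get_words('meristem(2);leaf(1)') -> 'meristem meristem leaf';
+# get_words('shoot.meristem;root') -> 'shoot meristem root'.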
+
+def remove_punctuation(s):
+ return s.replace('_', ' ')
+
+# main
+
+if os.path.exists('../Data/temp/experiment.and.tissue.1.txt'):
+ cmd = 'cut -f 1-4 ../Data/temp/experiment.and.tissue.1.txt > ../Data/temp/a.txt' # generated by python assign_tissue.py
+ os.system(cmd)
+else:
+ print('Run python assign_tissue.py > ../Data/temp/experiment.and.tissue.1.txt first.')
+ sys.exit()
+
+f = open('../Data/temp/a.txt')
+print('run.id\tinferred.tissue\tbiosample.tissue\tbiosample.id\tsuggested.tissue')
+for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+
+ if lst[2] != 'part_unknown' and lst[2] != '.':
+ s = lst[2].lower()
+ s = map_tissue(remove_punctuation(s))
+ # elif lst[2] == '.':
+ # s = lst[1]
+ # s = remove_parenthese(s)
+ else:
+ s = lst[1]
+ if not ';' in s:
+ s = remove_parenthese(s)
+ else:
+ s = get_words(s)
+
+ s = map_tissue(remove_punctuation(s))
+
+ print(line + '\t' + s)
+f.close()
diff --git a/Code/rnaseq_or_chipseq.py b/Code/rnaseq_or_chipseq.py
new file mode 100644
index 0000000..7d955eb
--- /dev/null
+++ b/Code/rnaseq_or_chipseq.py
@@ -0,0 +1,92 @@
+# Usage: python rnaseq_or_chipseq.py all.short.reads.txt > table.txt
+#
+# Purpose: check if a RUN id in all.short.reads.txt is RNA-seq or ChIP-seq or other. Adapted from parse_ena_xml.py.
+#
+# Note: to avoid encoding error while writing to output.txt, first type this command: export PYTHONIOENCODING=UTF-8.
+#
+# 13 Apr 2017, slcu, hui
+
+import os, json, re, operator, sys
+import xml.etree.ElementTree
+
+MAX_DESCRIPTION_LENGTH = 600 # max number of characters to keep in json file
+
+def parse_run(fname):
+ d = {}
+
+ root = xml.etree.ElementTree.parse(fname).getroot()
+
+ for c in root.findall('RUN'):
+ acc = c.get('accession')
+ d[acc] = {}
+
+ alias = c.get('alias')
+ d[acc]['alias'] = alias
+
+ experiment = c.find('EXPERIMENT_REF').get('accession')
+ d[acc]['experiment_id'] = experiment
+
+ title = c.find('TITLE').text
+ d[acc]['title'] = title
+
+ d[acc]['study_id'] = '.'
+ for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+ s = i.text
+ #print(s)
+ if 'RP' in s: # run project
+ d[acc]['study_id'] = s
+ break
+ d[acc]['sample_id'] = '.'
+ for i in c.findall('./RUN_LINKS/RUN_LINK/XREF_LINK/ID'):
+ s = i.text
+            if 'RS' in s: # sample accession
+ d[acc]['sample_id'] = s
+ break
+
+ return d
+
+def get_key(s):
+    if '_' in s:
+        return s[:s.find('_')]
+    if '.' in s:
+        return s[:s.find('.')]
+    return s # no delimiter: find('.') would return -1 and wrongly drop the last character
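+
+# Examples (hypothetical file names): get_key('SRR1019436_1.fastq.gz') -> 'SRR1019436';
+# get_key('SRR1019436.fastq.gz') -> 'SRR1019436'; get_key('SRR1019436') -> 'SRR1019436'.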
+
+def make_downloaded_dict(fname):
+ f = open(fname)
+ d = {}
+ for line in f:
+ line = line.strip()
+ if line != '' and not line.startswith('#'):
+ if line.startswith('-'):
+ fn = line.split(' ')[-1]
+ else:
+ fn = os.path.basename(line)
+ k = get_key(fn)
+ if not k in d:
+ d[k] = [fn]
+ else:
+ d[k].append(fn)
+ d[k] = sorted(list(set(d[k])))
+ return d
+
+## main
+
+cmd = 'export PYTHONIOENCODING=UTF-8' # xml files contain non-ascii characters; note that os.system runs this in a child shell, so it does not affect the current process -- export it in your shell before running (see the Note at the top)
+os.system(cmd)
+
+d = make_downloaded_dict(sys.argv[1])
+
+# ENA xml meta files do not differentiate between different types of Seq, but are organised by RUN, STUDY, EXPERIMENT.
+# Each parse_run call below reads the RUN xml for one assay type.
+d_rnaseq_run = parse_run('../Data/information/ena_rnaseq_read_run.xml') # RUN
+d_chipseq_run = parse_run('../Data/information/ena_ChIP-Seq_read_run.xml') # RUN
+
+for k in sorted(d.keys()):
+ s = k + '\t' + ' '.join(d[k])
+ if k in d_rnaseq_run:
+ s += '\t' + 'RNA-seq'
+ elif k in d_chipseq_run:
+ s += '\t' + 'ChIP-seq'
+ else:
+ s += '\t' + '.'
+ print(s)
diff --git a/Code/slice_TPM_to_JSON.py b/Code/slice_TPM_to_JSON.py
new file mode 100644
index 0000000..e597b78
--- /dev/null
+++ b/Code/slice_TPM_to_JSON.py
@@ -0,0 +1,164 @@
+# Usage: python slice_TPM_to_JSON.py parameter_for_net.txt
+#
+# Purpose: Given the matrix TPM.txt, make logarithmised gene
+# expression in json format for each gene. Put the results in
+# JSON_DIR. The results are used for displaying scatterplots in
+# Webapp.
+#
+# Last modified 24 Apr 2017, slcu, hui [use R to do the job, faster]
+
+import sys, os, operator, itertools
+import numpy as np
+import json
+
+JSON_DIR = '../Data/history/expr/json' # contains json for all genes, one json file for each gene. Each json file has the following format {"R0ERR046550XXX": 2.8148097376737438, "R0ERR031542XXX": 2.5193080765053328, ...}
+
+GLB_PARAM_SYMBOL = '%%'
+DATA_SYMBOL = '@'
+
+# read expression TPM
+def read_matrix_data(fname):
+ '''
+ fname - a file, first line is head, first column is row name.
+ '''
+
+ lineno = 0
+ colid = []
+ rowid = []
+ d = {} # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
+ d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
+ d3 = {} # {gene1: [], gene2: [], ...}
+ d4 = {} # {cond1:[], cond2:[], ...}
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+
+ head_line = lines[0].strip()
+ lst = head_line.split()
+ colid = lst[1:]
+
+ for c in colid:
+ d2[c] = {}
+ d4[c] = []
+
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split()
+ g = lst[0]
+ rowid.append(g)
+ d[g] = {}
+ levels = lst[1:]
+ if len(levels) != len(colid):
+ print('Incomplete columns at row %s' % (g))
+ sys.exit()
+
+ d3[g] = []
+ for i in range(len(colid)):
+ c = colid[i]
+ d[g][c] = float(levels[i])
+ d2[c][g] = float(levels[i])
+ d3[g].append(float(levels[i]))
+ d4[c].append(float(levels[i]))
+ lineno += 1
+
+ d_return = {}
+ d_return['xy'] = d # first gene, then condition
+ d_return['yx'] = d2 # first condition, then gene
+ d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row
+ d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column
+    d_return['nrow'] = lineno # lineno equals the number of data rows (the head line is excluded)
+ d_return['ncol'] = len(colid)
+ d_return['rowid'] = rowid
+ d_return['colid'] = colid
+
+ # d4_sorted = {}
+ # for k in d4:
+ # d4_sorted[k] = sorted(d4[k], reverse=True)
+ # d_return['yy_sorted'] = d4_sorted
+
+ return d_return
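+
+# Shape of the returned dict for a tiny (hypothetical) matrix with header
+# 'gene R001 R002' and one data row 'g1 1.0 2.0':
+#   'xy' = {'g1': {'R001': 1.0, 'R002': 2.0}}           # gene -> condition -> value
+#   'yx' = {'R001': {'g1': 1.0}, 'R002': {'g1': 2.0}}   # condition -> gene -> value
+#   'xx' = {'g1': [1.0, 2.0]}, 'yy' = {'R001': [1.0], 'R002': [2.0]}
+#   'rowid' = ['g1'], 'colid' = ['R001', 'R002'], 'nrow' = 1, 'ncol' = 2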
+
+
+def get_key_value(s):
+ lst = s.split('=')
+ k, v = lst[0], lst[1]
+ return (k.strip(), v.strip())
+
+
+def make_global_param_dict(fname):
+ f = open(fname)
+ d = {}
+ for line in f:
+ line = line.strip()
+ if line.startswith(GLB_PARAM_SYMBOL):
+ s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
+ lst = s.split('\t') # separate items by TAB
+ for x in lst:
+ if x != '':
+ k, v = get_key_value(x)
+ d[k] = v
+ f.close()
+ return d
+
+
+def take_log(x):
+ return np.log(x+1)
+
+
+def make_json_file(expr_dict, dir_name, glb_param_dict):
+ if not os.path.isdir(dir_name): # create the directory if not exist
+ os.makedirs(dir_name)
+
+ d = expr_dict['xy']
+ col_name_lst = expr_dict['colid']
+ row_name_lst = expr_dict['rowid']
+ for g in row_name_lst:
+ #print(g)
+ d2 = d[g]
+ if glb_param_dict['LOGRITHMIZE'].upper() == 'YES':
+ d3 = {k: take_log(v) for k, v in d2.items()}
+ else:
+ d3 = d2
+ filename = os.path.join(dir_name, g + '.json')
+ with open(filename, 'w') as f:
+ json.dump(d3, f)
+
+
+def make_json_file_using_r(dir_name, glb_param_dict): # use r script to make it faster
+ r_code = '''
+ library(rjson)
+ dir.name <- '%s'
+ tpm.file <- '%s'
+ take.log <- '%s'
+ X <- read.table(tpm.file, header=T, check.names=FALSE, sep="\\t")
+ gene.id <- as.vector(X[,1])
+ X[,1] <- NULL # remove first column
+ if (take.log == 'YES') {
+ X <- log(X+1)
+ }
+ if (!dir.exists(dir.name)) {
+ dir.create(dir.name)
+ }
+ for (i in 1:dim(X)[1]) {
+ y <- toJSON(X[i,])
+ file.name = paste(dir.name, paste(gene.id[i], 'json', sep='.'), sep='/')
+ cat(y, file=file.name)
+ }
+ ''' % (
+ dir_name,
+ glb_param_dict['EXPRESSION_MATRIX'],
+ glb_param_dict['LOGRITHMIZE'].upper())
+    f = open('slice_TPM_to_JSON.R', 'w') # make an R script
+ f.write('\n'.join([line.lstrip('\t') for line in r_code.split('\n')]))
+ f.close()
+ os.system('Rscript slice_TPM_to_JSON.R')
+ os.system('rm -f slice_TPM_to_JSON.R')
+
+
+## main
+param_file = sys.argv[1] # a single parameter file
+glb_param_dict = make_global_param_dict(param_file)
+#expr_dict = read_matrix_data(glb_param_dict['EXPRESSION_MATRIX'])
+#make_json_file(expr_dict, JSON_DIR, glb_param_dict) # slower version
+make_json_file_using_r(JSON_DIR, glb_param_dict) # faster version
diff --git a/Code/slice_binding_to_JSON.py b/Code/slice_binding_to_JSON.py
new file mode 100644
index 0000000..6421fed
--- /dev/null
+++ b/Code/slice_binding_to_JSON.py
@@ -0,0 +1,172 @@
+# Usage: python slice_binding_to_JSON.py parameter_for_net.txt
+import sys, os, operator, itertools
+import numpy as np
+import json
+
+JSON_DIR = '../Data/history/bind/json2' # contains json for all genes
+
+GLB_PARAM_SYMBOL = '%%'
+DATA_SYMBOL = '@'
+
+def read_matrix_data(fname):
+ '''
+ fname - a file, first line is head, first column is row name.
+ '''
+
+ lineno = 0
+ colid = []
+ rowid = []
+ d = {} # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...}
+ d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...}
+ d3 = {} # {gene1: [], gene2: [], ...}
+ d4 = {} # {cond1:[], cond2:[], ...}
+
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+
+ head_line = lines[0].strip()
+ lst = head_line.split()
+ colid = lst[1:]
+
+ for c in colid:
+ d2[c] = {}
+ d4[c] = []
+
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split()
+ g = lst[0]
+ rowid.append(g)
+ d[g] = {}
+ levels = lst[1:]
+ if len(levels) != len(colid):
+ print('Incomplete columns at row %s' % (g))
+ sys.exit()
+
+ d3[g] = []
+ for i in range(len(colid)):
+ c = colid[i]
+ d[g][c] = float(levels[i])
+ d2[c][g] = float(levels[i])
+ d3[g].append(float(levels[i]))
+ d4[c].append(float(levels[i]))
+ lineno += 1
+
+ d_return = {}
+ d_return['xy'] = d # first gene, then condition
+ d_return['yx'] = d2 # first condition, then gene
+ d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row
+ d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column
+    d_return['nrow'] = lineno # lineno equals the number of data rows (the head line is excluded)
+ d_return['ncol'] = len(colid)
+ d_return['rowid'] = rowid
+ d_return['colid'] = colid
+
+ d4_sorted = {}
+ for k in d4:
+ d4_sorted[k] = sorted(d4[k], reverse=True)
+ d_return['yy_sorted'] = d4_sorted
+
+ return d_return
+
+# read parameters
+
+
+def get_key_value(s):
+ lst = s.split('=')
+ k, v = lst[0], lst[1]
+ return (k.strip(), v.strip())
+
+
+def get_value(s, delimit):
+ lst = s.split(delimit)
+ return lst[1].strip()
+
+def read_info_data(fname):
+ ''' Read chip-seq data information '''
+
+ if not os.path.exists(fname):
+        print('%s does not exist.' % (fname))
+ sys.exit()
+
+ d = {'ID_LIST':[]}
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ if line == '' or line.startswith('#') or line.startswith('%'):
+ continue
+ if line.startswith(DATA_SYMBOL):
+ s = line[line.rfind(DATA_SYMBOL[-1])+1:]
+ s = s.strip()
+ if s in d:
+ print('ID %s duplicate' % (s))
+ sys.exit()
+ d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''}
+ d['ID_LIST'].append(s)
+ if line.startswith('DESCRIPTION:'):
+ d[s]['DESCRIPTION'] = get_value(line, ':')
+ elif line.startswith('PROTEN_NAME:'):
+ d[s]['PROTEN_NAME'] = get_value(line, ':')
+ elif line.startswith('PROTEIN_ID:'):
+ d[s]['PROTEIN_ID'] = get_value(line, ':')
+ elif line.startswith('DATA_NAME:'):
+ d[s]['DATA_NAME'] = get_value(line, ':')
+ elif line.startswith('DATA_FORMAT:'):
+ d[s]['DATA_FORMAT'] = get_value(line, ':')
+ elif line.startswith('LOCATION:'):
+ d[s]['LOCATION'] = get_value(line, ':')
+ elif line.startswith('NOTE:'):
+ d[s]['NOTE'] = get_value(line, ':')
+
+ return d
+
+
+def make_global_param_dict(fname):
+ f = open(fname)
+ d = {}
+ for line in f:
+ line = line.strip()
+ if line.startswith(GLB_PARAM_SYMBOL):
+ s = line[line.rfind(GLB_PARAM_SYMBOL[-1])+1:]
+ lst = s.split('\t') # separate items by TAB
+ for x in lst:
+ if x != '':
+ k, v = get_key_value(x)
+ d[k] = v
+ f.close()
+ return d
+
+
+def make_json_file(bind_dict, bind_info_dict, dir_name, glb_param_dict):
+ if not os.path.isdir(dir_name): # create the directory if not exist
+ os.makedirs(dir_name)
+
+ d = bind_dict['xy']
+ col_name_lst = bind_dict['colid']
+ row_name_lst = bind_dict['rowid']
+ for g in row_name_lst:
+ #print(g)
+ d2 = d[g]
+ d3 = {}
+ for k in sorted(d2.keys()):
+ data_type = bind_info_dict[k]['DATA_FORMAT'].upper()
+ if data_type == 'NARROWPEAK':
+ data_type = 'NP' # short name for narrowPeak
+ value = d2[k]
+ d3[k] = {'v':value, 't':data_type}
+ filename = os.path.join(dir_name, g + '.json')
+ with open(filename, 'w') as f:
+ json.dump(d3, f)
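+
+# Example output json for one gene (hypothetical ChIP column 'C0001' whose
+# DATA_FORMAT is narrowPeak, abbreviated to 'NP'):
+#   {"C0001": {"v": 5.2, "t": "NP"}}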
+
+
+### main
+param_file = sys.argv[1] # a single parameter file
+glb_param_dict = make_global_param_dict(param_file)
+#print('Read binding matrix ...')
+binding_dict = read_matrix_data(glb_param_dict['BINDING_MATRIX'])
+bind_info_dict = read_info_data(glb_param_dict['BINDING_INFO'])
+#print('Make json files ...')
+make_json_file(binding_dict, bind_info_dict, JSON_DIR, glb_param_dict)
diff --git a/Code/test_network4.py b/Code/test_network4.py
new file mode 100644
index 0000000..44ce492
--- /dev/null
+++ b/Code/test_network4.py
@@ -0,0 +1,205 @@
+# Make tissue-specific networks
+
+import os, sys
+from geneid2name import make_gene_name_AGI_map_dict
+
+def get_tfs(fname_lst):
+ d = {}
+ for fname in fname_lst:
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ tf = lst[1].split()[0]
+ if not tf in d:
+ d[tf] = 1
+ else:
+ d[tf] += 1
+ f.close()
+ return d
+
+def get_tissue_from_fname(fname):
+ tissue_lst = [
+ 'seedling',
+ 'meristem',
+ 'flower',
+ 'aerial',
+ 'shoot',
+ 'seed',
+ 'leaf',
+ 'root',
+ 'stem']
+ for x in tissue_lst:
+ if x in fname:
+ return x
+ return 'unknown'
+
+def get_edges_consisting_of_tfs(fname_lst, tf_dict):
+ d = {}
+ for fname in fname_lst:
+ kt = get_tissue_from_fname(fname)
+ d[kt] = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0].split()[0].strip()
+ tf = lst[1].split()[0].strip()
+ k = target + '_' + tf
+ score = float(lst[2])
+ if tf in tf_dict and target in tf_dict:
+ if not k in d[kt]:
+ d[kt][k] = [(lst[0], lst[1], score)]
+ else:
+ d[kt][k].append((lst[0], lst[1], score))
+ f.close()
+ return d
+
+def get_degree(fname_lst, tf_dict):
+ d_out = {}
+ d_in = {}
+ d_all = {}
+ for fname in fname_lst:
+ kt = get_tissue_from_fname(fname)
+ d_out[kt] = {}
+ d_in[kt] = {}
+ d_all[kt] = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0].split()[0].strip()
+ tf = lst[1].split()[0].strip()
+            if True or tf in tf_dict and target in tf_dict: # NOTE: 'True or ...' makes this condition always true; the tf_dict filter is effectively disabled
+ if not tf in d_out[kt]:
+ d_out[kt][tf] = 1
+ else:
+ d_out[kt][tf] += 1
+
+ if not target in d_in[kt]:
+ d_in[kt][target] = 1
+ else:
+ d_in[kt][target] += 1
+
+ if not target in d_all[kt]:
+ d_all[kt][target] = 1
+ else:
+ d_all[kt][target] += 1
+
+ if not tf in d_all[kt]:
+ d_all[kt][tf] = 1
+ else:
+ d_all[kt][tf] += 1
+
+ f.close()
+ return d_all, d_out, d_in
+
+
+def simplify(s):
+ result = ''
+ lst = s.split('\t')
+ a = (lst[0].split()[1]).split(';')[0]
+ if a == '.':
+ a = lst[0].split()[0]
+ else:
+ a = lst[0].split()[0] + '_' + (lst[0].split()[1]).split(';')[0]
+ b = (lst[1].split()[1]).split(';')[0]
+ if b == '.':
+ b = lst[1].split()[0]
+ else:
+ b = lst[1].split()[0] + '_' + (lst[1].split()[1]).split(';')[0]
+ return '%s\t%s\t%s' % (a, b, lst[2])
+
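+# simplify() sketch (the name pairings are hypothetical): an edge line whose
+# first two fields look like 'AT1G01060 LHY;desc' and 'AT2G46830 CCA1;desc'
+# becomes 'AT1G01060_LHY\tAT2G46830_CCA1\t<score>'; a '.' in the name field
+# leaves the bare AGI identifier.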
+# main
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME)
+
+edge_file_lst = [
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.seedling.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.meristem.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.flower.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.aerial.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.shoot.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.seed.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.leaf.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.root.txt.20170629_203729',
+ '/home/hui/network/v03/Data/history/edges/many_targets/edges.txt.simple.correlation.stem.txt.20170629_203729'
+]
+
+tf_dict = get_tfs(edge_file_lst)
+
+f = open('result.skeleton.txt', 'w')
+print('Total number of TFs: %d' % (len(tf_dict)))
+d0 = get_edges_consisting_of_tfs(edge_file_lst, tf_dict)
+for kt in d0: # kt is tissue; d0 looks like {'shoot':{'target_tf':[...]}, 'flower':{...}, ...}
+    f.write('##TF skeleton size in %s: %d.\n' % (kt, len(d0[kt])))
+    d = d0[kt]
+    for k in d:
+        lst = d[k]
+        # keep, for each edge, the instance with the largest absolute score
+        max_score = -9
+        s = ''
+        for x in lst:
+            if abs(x[2]) > max_score:
+                s = '%s\t%s\t%4.2f' % (x[0], x[1], x[2])
+                max_score = abs(x[2])
+        f.write(simplify(s) + '\n')
+f.close()
+
+# for each TF, get its out-degree and in-degree in each tissue
+dd_all, dd_out, dd_in = get_degree(edge_file_lst, tf_dict)
+# Write one table per degree type; the three tables share the same layout.
+for dd, out_name in ((dd_out, 'result.out.txt'),
+                     (dd_in, 'result.in.txt'),
+                     (dd_all, 'result.all.txt')):
+    f = open(out_name, 'w')
+    head_lst = ['TF']
+    for k in dd:
+        head_lst.append(k)
+    f.write('%s\n' % ('\t'.join(head_lst)))
+    for tf in tf_dict:
+        s = tf
+        name = '.'
+        if tf in agi2name_dict and agi2name_dict[tf] != tf:
+            name = agi2name_dict[tf]
+        s += ' ' + name
+        for k in dd:
+            if tf in dd[k]:
+                s += '\t%d' % (dd[k][tf])
+            else:
+                s += '\t0'
+        f.write(s + '\n')
+    f.close()
diff --git a/Code/text2json.py b/Code/text2json.py
new file mode 100644
index 0000000..dcfb699
--- /dev/null
+++ b/Code/text2json.py
@@ -0,0 +1,19 @@
+# Usage: python text2json.py AGI-to-gene-names_v2.txt > genes.json
+# Purpose: convert AGI-to-gene-names_v2.txt to genes.json for the brain main page. Put genes.json under ../Webapp/static/json.
+import sys
+
+f = open(sys.argv[1])
+
+count = 0
+s = '{'
+for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ s += '\"%s\":' % ('label'+str(count))
+ s += '\"%s\",' % (lst[0]+' '+lst[1])
+ count += 1
+
+f.close()
+s = s[:-1]
+s += '}'
+print(s)
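+# An equivalent sketch using the json module (handles escaping automatically;
+# assumes the same two-column, tab-separated input):
+#
+#   import json, sys
+#   d = {}
+#   with open(sys.argv[1]) as fh:
+#       for i, line in enumerate(fh):
+#           lst = line.strip().split('\t')
+#           d['label%d' % i] = lst[0] + ' ' + lst[1]
+#   print(json.dumps(d))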
diff --git a/Code/update_network.py b/Code/update_network.py
new file mode 100755
index 0000000..e29eac1
--- /dev/null
+++ b/Code/update_network.py
@@ -0,0 +1,895 @@
+#! /usr/bin/python3
+# Usage: python3 update_network.py
+# Put this script under the directory Code/.
+# IMPORTANT: run this script from within the directory Code/.
+# Execute the above command regularly, or add a cron job so that it runs every day at 5am:
+#
+# 1. crontab -e
+# 2. Add this line: 01 05 * * * cd /home/hui/network/v03/Code && python3 update_network.py
+#
+# Purpose: run this script periodically (e.g., weekly) to see if the network needs updating; if so, update it.
+#
+# Set HOLDON=NO in parameter_for_buildCmatrix.txt,
+# parameter_for_buildRmatrix.txt and parameter_for_net.txt to make
+# changes in these files effective.
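+# (The flag is a line of the form '%%HOLDON=YES' within the first 100 lines
+# of a parameter file; see hold_on() below.)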
+#
+# parameter_for_buildRmatrix.txt will be updated automatically (via
+# make_parameter_rnaseq.py; see below). However, parameter_for_buildCmatrix.txt
+# still needs to be updated manually.
+#
+# Revision history:
+#
+# Last modified: 26 Feb 2017
+# Last modified: 17 Mar 2017
+# Last modified: 04 Apr 2017
+# Last modified: 05 Apr 2017
+# Last modified: 10 Apr 2017
+# Last modified: 19 Apr 2017
+# Last modified: 20 Apr 2017 [added create_edges0B.py which calls correlation_per_tissue.R]
+# Last modified: 21 Jun 2017 [added correlation_per_group.R and wedge.R]
+# Last modified: 30 Jun 2017 [added get_sample_size so that we have sample size for correlations of type all, added in ll_dict ]
+# Last modified: 23 Jan 2018 [edited a few print-out messages]
+# Last modified: 25 Jan 2018 [updated function compute_metric(), set S=365.0 and modified return statement]
+# Last modified: 24 Aug 2018 [updated function from get_sample_size(d, sorted_keys, day) to get_sample_size(d, sorted_keys, day, rcond_string)]
+# Last modified: 03 Feb 2019
+# Last modified: 08 Aug 2019, hui
+# Last modified: 10 Aug 2019, hui <lanhui@zjnu.edu.cn>
+# Last modified: 23 Aug 2019, hui <lanhui@zjnu.edu.cn> [correlation_mixtools(num_component)]
+# Last modified: 10 Sep 2019, hui <lanhui@zjnu.edu.cn> [correlation_mixtools, check the previous R session has finished before starting a new one.]
+
+import os, sys
+import numpy as np
+import glob
+import time
+import subprocess
+from datetime import datetime
+from param4net import make_global_param_dict, get_key_value
+from configure import HISTORY_DIR, HISTORY_DIR2, FILE_TIMESTAMP, SAMPLE_SIZE_FILE, TEMP_DIR, \
+ PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_BUILDRMATRIX, \
+ PARAMETER_FOR_NET, PARAMETER_FOR_NET_TRAVADB_STRESS, PARAMETER_FOR_NET_TRAVADB_MAP, PARAMETER_FOR_NET_MILD_DROUGHT, PARAMETER_FOR_NET_WIGGELAB_DIURNAL, \
+ BINDING_FILE, TPM_FILE, \
+ PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL, MIN_RNA_SEQ_INCREASE, UPDATE_NETWORK_LOG_FILE, NEW_OR_UPDATED_CHIP_FILE, \
+ RNA_SEQ_INFO_DATABASE, RNA_SEQ_INFO_DATABASE_JSON, GENE_ID_FIRST_TWO_LETTERS, MEMORY_STRENGTH, \
+ MAPPED_RDATA_DIR, MAPPED_CDATA_DIR, \
+ EDGE_POOL_DIR, MERGED_EDGE_FILE, \
+ TARGET_TF_FILE
+
+
+
+## Helper functions
+
+def get_value(s, delimit):
+    lst = s.split(delimit, 1) # only split at the first delimiter
+ return lst[1].strip()
+
+
+def validate_webapp_dir(para_for_net):
+ ''' Make sure this function is executed under the directory Code. '''
+ glb_param_dict = make_global_param_dict(para_for_net)
+ # if genes.json is not present, create one
+ if not os.path.exists('../Webapp/static/json/genes.json'):
+ print('[update_network.py]: cannot find genes.json, make one ...')
+ cmd = 'python3 text2json.py %s > ../Webapp/static/json/genes.json' % (glb_param_dict['GENE_ID_AND_GENE_NAME'])
+ os.system(cmd)
+
+
+def make_paths(s):
+ if not os.path.isdir(s):
+ os.makedirs(s)
+
+
+def make_important_dirs():
+ make_paths('../Data/history/edges/many_targets')
+ make_paths('../Data/history/edges/one_target')
+ make_paths('../Data/log')
+ make_paths('../Data/information')
+ make_paths('../Data/temp')
+ make_paths('../Data/upload')
+ make_paths('../Data/parameter')
+ make_paths('../Data/R/Mapped')
+ make_paths('../Data/R/Mapped/public')
+ make_paths('../Data/R/Mapped/inhouse')
+ make_paths('../Data/R/Mapped/other')
+ make_paths('../Data/R/Raw')
+ make_paths('../Data/C/Mapped')
+ make_paths('../Data/C/Raw')
+ make_paths('../Data/history/edges')
+ make_paths('../Data/history/edge_pool')
+ make_paths('../Data/history/bind')
+ make_paths('../Data/history/expr')
+ make_paths('../Webapp/static/json')
+ make_paths('../Webapp/static/edges')
+ make_paths('../Webapp/templates')
+
+
+def num_line(fname):
+ ''' Return number of lines in file fname. '''
+ if not os.path.exists(fname):
+ return 0
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ return len(lines)
+
+
+def num_ids(fname):
+ ''' Return number of IDs in fname. '''
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ return len(lines[0].split('\t')) - 1
+
+
+def write_log_file(s, fname):
+ f = open(fname, 'a')
+ curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
+ s = '[' + curr_time + ']: ' + s
+ if not '\n' in s:
+ s += '\n'
+ f.write(s)
+ f.close()
+ print('Log: %s' % (s.strip()))
+
+
+def write_sample_size_file(sample_size_file, curr_date, tpm_sample_size):
+ if not os.path.exists(sample_size_file):
+ f = open(sample_size_file, 'w')
+ else:
+ f = open(sample_size_file, 'a')
+ f.write('%s\t%s\n' % (curr_date, tpm_sample_size))
+ f.close()
+
+
+def age_of_file_in_days(fname):
+ ''' Return age of fname in days. '''
+ st = os.stat(fname)
+ days = (time.time() - st.st_mtime)/(3600*24.0)
+ return days
+
+
+def age_of_file_in_seconds(fname):
+    ''' Return age of fname in seconds. '''
+ st = os.stat(fname)
+ seconds = time.time() - st.st_mtime
+ return seconds
+
+
+def hold_on(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines[:100]: # check the first 100 lines for HOLDON
+ line = line.strip()
+ if line.startswith('%%HOLDON=YES'):
+ return True
+ return False
+
+
+def all_files_present(lst):
+ missing_file_lst = []
+ for path in lst: # lst is a list of file names to check
+ if not os.path.exists(path):
+ if 'edges.txt' in path:
+ write_log_file('[update_network.py] WARNING: must have %s to update network. Call create_edges*.py to create edge files.' % (path), UPDATE_NETWORK_LOG_FILE)
+ missing_file_lst.append(path)
+ return missing_file_lst
+
+
+def record_file_time(lst, fname):
+ '''
+ lst - a list of files
+ fname - a recorder file
+ '''
+ f = open(fname, 'w')
+ s = ''
+ for x in lst:
+ if os.path.exists(x):
+ s += '%s\t%d\n' % (os.path.basename(x), int(os.stat(x).st_mtime))
+ else:
+ s += '%s\t%d\n' % (os.path.basename(x), 0)
+ f.write(s)
+ f.close()
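+# Each line of the recorder file is '<basename>\t<mtime>', e.g. (sketch)
+# 'TPM.txt\t1565000000'; files that do not exist are recorded with time 0.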
+
+
+def read_file_timestamp(ftimestamp):
+ d = {}
+ f = open(ftimestamp)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ fname = lst[0]
+ t = lst[1]
+ d[fname] = int(t)
+
+ f.close()
+ return d
+
+
+def file_updated(fname, d):
+ ft = int(os.stat(fname).st_mtime)
+ k = os.path.basename(fname)
+ return ft > d[k]
+
+
+def get_updated_files(lst, d):
+ result = []
+ for x in lst:
+ if file_updated(x, d):
+ result.append(os.path.basename(x))
+ return result
+
+
+def get_sample_size(d, sorted_keys, day, rcond_string):
+
+ if rcond_string.isdigit():
+ return int(rcond_string)
+
+ if len(d) == 0:
+        return 1200 # a default sample size; CHANGE as appropriate
+
+ for x in sorted_keys:
+ if x >= day:
+ return d[x]
+
+ k = sorted_keys[-1] # last key, latest date
+ return d[k]
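+# get_sample_size() sketch (dates are hypothetical): with a non-numeric
+# rcond_string and records d = {'20190101': 1200, '20190301': 1450},
+# day = '20190215' returns 1450, the first record on or after that date;
+# if every record predates day, the latest record is returned.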
+
+
+def number_rnaseq_id(tpm_file):
+ f = open(tpm_file)
+ first_line = f.readlines()[0]
+ f.close()
+ first_line = first_line.strip()
+ return len(first_line.split()) - 1
+
+
+def number_rnaseq_diff(para_file, tpm_file):
+ ''' count the number @ in para_file, and count the number of columns in tpm_file, return their difference '''
+ a = 0
+ f = open(para_file)
+ for line in f:
+ line = line.strip()
+ if line.startswith('@'):
+ a += 1
+ f.close()
+
+ b = number_rnaseq_id(tpm_file)
+
+ return a - b
+
+
+def validate_gene_file(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines: # check all lines
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) < 6:
+            print('[update_network.py]:Not enough fields: %s. Only %d are given. Each line must have gene_id, gene_name, chr, start, end, strand, description (optional). See prepare_gene_file.py in the documentation on how to prepare this file.' % (line, len(lst)))
+ sys.exit()
+
+
+def validate_parameter_for_buildcmatrix(fname):
+ # first the file must exist
+ if not os.path.exists(fname):
+ print('[update_network.py]:CANNOT FIND %s.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ location_count = 0
+ for line in lines:
+ line = line.strip()
+ if line.startswith('%%'):
+ k, v = get_key_value(line[2:])
+ d[k] = v
+ if k == 'GENE_FILE' or k == 'CHR_INFO':
+ if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+ sys.exit()
+ if k == 'GENE_FILE':
+ validate_gene_file(v)
+ if k == 'DESTINATION':
+ if not os.path.isdir(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+ sys.exit()
+ if k == 'TARGET_RANGE':
+ if int(v) <= 0:
+                    print('[update_network.py]:Target range (%s) must be greater than 0.' % (v))
+ sys.exit()
+ if line.startswith('LOCATION:'):
+ v = get_value(line, ':')
+ location_count += 1
+ if not os.path.exists(v):
+                print('[Warning] update_network.py: Location %s does not exist.' % (v))
+ #sys.exit()
+
+ if not 'GENE_FILE' in d:
+ print('[update_network.py]:Must specify GENE_FILE.')
+ sys.exit()
+ if not 'DESTINATION' in d:
+ print('[update_network.py]:Must specify DESTINATION.')
+ sys.exit()
+ if not 'CHR_INFO' in d:
+ print('[update_network.py]:Must specify CHR_INFO.')
+ sys.exit()
+ if location_count == 0:
+ print('[update_network.py]:Must contain at least one ChIP-seq.')
+ sys.exit()
+
+
+def validate_parameter_for_buildrmatrix(fname):
+ # first the file must exist
+ if not os.path.exists(fname):
+ print('[update_network.py]:CANNOT FIND %s.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ location_count = 0
+ for line in lines:
+ line = line.strip()
+ if line.startswith('%%'):
+ k, v = get_key_value(line[2:])
+ d[k] = v
+ if k == 'GENE_LIST':
+ if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+ sys.exit()
+ if line.startswith('LOCATION:'):
+ v = get_value(line, ':')
+ location_count += 1
+ if not os.path.exists(v):
+                print('[update_network.py]:Location %s does not exist.' % (v))
+ #sys.exit()
+
+ if not 'GENE_LIST' in d:
+ print('[update_network.py]:Must specify GENE_LIST.')
+ sys.exit()
+ if location_count == 0:
+ print('[update_network.py]:Must contain at least one RNA-seq.')
+ sys.exit()
+
+
+def validate_parameter_for_net(fname):
+ # first the file must exist
+ if not os.path.exists(fname):
+ print('[update_network.py]:CANNOT FIND %s.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ location_count = 0
+ for line in lines:
+ line = line.strip()
+ if line.startswith('%%'):
+ k, v = get_key_value(line[2:])
+ d[k] = v
+            if k == 'GENE_LIST':
+                if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+                    sys.exit()
+            if k == 'GENE_ID_AND_GENE_NAME':
+                if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+                    sys.exit()
+            if k == 'BINDING_INFO':
+                if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+                    sys.exit()
+            if k == 'EXPRESSION_INFO':
+                if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+                    sys.exit()
+            if k == 'BINDING_MATRIX':
+                if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+                    print('[update_network.py]:Use python3 buildCmatrix.py parameter_for_buildCmatrix.txt > binding.txt to create binding.txt.')
+            if k == 'EXPRESSION_MATRIX':
+                if not os.path.exists(v):
+                    print('[update_network.py]:%s does not exist.' % (v))
+                    print('[update_network.py]:Use python3 buildRmatrix.py parameter_for_buildRmatrix.txt to create TPM.txt.')
+
+    if not 'GENE_LIST' in d:
+        print('[update_network.py]:Must specify GENE_LIST.')
+        sys.exit()
+    if not 'GENE_ID_AND_GENE_NAME' in d:
+        print('[update_network.py]:Must specify GENE_ID_AND_GENE_NAME.')
+        sys.exit()
+    if not 'BINDING_INFO' in d:
+        print('[update_network.py]:Must specify BINDING_INFO.')
+        sys.exit()
+    if not 'EXPRESSION_INFO' in d:
+        print('[update_network.py]:Must specify EXPRESSION_INFO.')
+        sys.exit()
+    if not 'BINDING_MATRIX' in d:
+        print('[update_network.py]:Must specify BINDING_MATRIX.')
+        print('[update_network.py]:Use python3 buildCmatrix.py parameter_for_buildCmatrix.txt > binding.txt to create binding.txt.')
+    if not 'EXPRESSION_MATRIX' in d:
+        print('[update_network.py]:Must specify EXPRESSION_MATRIX.')
+        print('[update_network.py]:Use python3 buildRmatrix.py parameter_for_buildRmatrix.txt to create TPM.txt.')
+
+
+
+def need_update_parameter_file(param_file, dirs):
+ ''' Make sure param_file is consistent with dirs (a list of directories to check against). '''
+ result = []
+
+ files_in_parameter = {}
+ f = open(param_file)
+ for line in f:
+ line = line.strip()
+ if line.startswith('LOCATION:'):
+ lst = line.split(':')
+ k = os.path.abspath(lst[1])
+ files_in_parameter[k] = 1
+ f.close()
+ param_modification_time = os.path.getmtime(param_file)
+
+ files_in_dirs = {}
+ for directory in dirs:
+ for root, dirnames, filenames in os.walk(os.path.abspath(directory)):
+ for filename in filenames:
+ k = os.path.join(root, filename)
+ files_in_dirs[k] = 1
+ if 'narrowPeak' in k or '_quant' in k:
+ if not k in files_in_parameter and os.path.getmtime(k) > param_modification_time:
+ result.append('%s is not in %s' % (k, param_file))
+
+ return result
+
+
+
+def validate_binding_file(fname):
+ f = open(fname)
+ lines = f.readlines()
+ for line in lines:
+ line = line.strip()
+ if 'buildCmatrix: ChIP-seq ID list is empty.' in line:
+ return False
+ f.close()
+ return True
+
+
+def lines_with_10_fields(s):
+ result = []
+ for line in s.split('\n'):
+ line = line.strip()
+ if len(line.split('\t')) == 10:
+ result.append(line)
+ return result
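+# For reference (judging from the sprintf in wedge.R; treat this as a sketch),
+# the 10 tab-separated fields of an edge line are: target, TF, score, edge
+# type, RNA-seq sample count, condition, log-likelihood, date, score again,
+# and method.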
+
+
+def concatenate_edge_files(fname_lst, fname_out):
+ fout = open(fname_out, 'w')
+ for fname in fname_lst:
+ f = open(fname)
+ s = f.read()
+ f.close()
+ # Make sure each edge has 10 fields before writing.
+ lines = lines_with_10_fields(s)
+ if lines != []:
+ write_log_file('[update_network.py] In function concatenate_edge_files. File %s has %d rows with 10 columns.' % (fname, len(lines)), UPDATE_NETWORK_LOG_FILE)
+ fout.write('\n'.join(lines) + '\n')
+ else:
+ write_log_file('[update_network.py] In function concatenate_edge_files. Check file %s. It has no rows with 10 fields.' % (fname), UPDATE_NETWORK_LOG_FILE)
+ fout.close()
+
+
+def delete_edge_files(fname_lst):
+ for fname in fname_lst:
+ # Before we delete, we should make sure it is not being written. Make sure it is old enough. Otherwise, don't delete.
+        if age_of_file_in_seconds(fname) > 12*60*60: # 12 hours
+ os.remove(fname)
+ else:
+ write_log_file('[update_network.py] In function delete_edge_files. Check file %s. It is probably still being written. So I don\'t delete it.' % (fname), UPDATE_NETWORK_LOG_FILE)
+
+
+def create_edges0():
+ if os.path.exists(PARAMETER_FOR_NET):
+ write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET)
+ os.system(cmd)
+
+    # The following runs are optional. For example, a user running the pipeline locally does not have to provide these extra TPM tables.
+ if os.path.exists(PARAMETER_FOR_NET_TRAVADB_STRESS):
+ write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_TRAVADB_STRESS), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_TRAVADB_STRESS)
+ #os.system(cmd)
+
+ if os.path.exists(PARAMETER_FOR_NET_TRAVADB_MAP):
+ write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_TRAVADB_MAP), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_TRAVADB_MAP)
+ #os.system(cmd)
+
+ if os.path.exists(PARAMETER_FOR_NET_MILD_DROUGHT):
+ write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_MILD_DROUGHT), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_MILD_DROUGHT)
+ #os.system(cmd)
+
+ if os.path.exists(PARAMETER_FOR_NET_WIGGELAB_DIURNAL):
+ write_log_file('[update_network.py] Create simple edges.txt using create_edges0.py with %s' % (PARAMETER_FOR_NET_WIGGELAB_DIURNAL), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 create_edges0.py %s' % (PARAMETER_FOR_NET_WIGGELAB_DIURNAL)
+ #os.system(cmd)
+
+
+def create_edges0B():
+ if os.path.exists(PARAMETER_FOR_NET):
+ write_log_file('[update_network.py] Create tissue-specific edges.txt using new binding.txt (size=%d). create_edges0B.py' % (num_ids(BINDING_FILE)), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 create_edges0B.py %s' % (PARAMETER_FOR_NET) # call correlation_per_tissue.R
+ os.system(cmd)
+
+
+def wedge():
+ if os.path.exists(PARAMETER_FOR_NET):
+ write_log_file('[update_network.py] Create edges using wedge shapes. wedge.R', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'Rscript wedge.R'
+ os.system(cmd)
+
+
+def correlation_per_group():
+ # For 3,130 RNA-seq samples and 30,000 pairs, need at least 10 hours.
+ if os.path.exists(PARAMETER_FOR_NET):
+ write_log_file('[update_network.py] Create group-specific edges.txt using new TPM.txt (size=%d). correlation_per_group.R' % (number_rnaseq_id(TPM_FILE)), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'Rscript correlation_per_group.R'
+ os.system(cmd)
+
+
+def correlation_per_group_fixed_number():
+ if os.path.exists(PARAMETER_FOR_NET):
+ write_log_file('[update_network.py] Create group-specific (fixed) edges.txt using new TPM.txt (size=%d). correlation_per_group_fixed_number.R' % (number_rnaseq_id(TPM_FILE)), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'Rscript correlation_per_group_fixed_number.R'
+ os.system(cmd)
+
+
+def correlation_mixtools(num_component):
+    if os.system('pidof R') != 0: # create_edges_mixtool.R can take several days to run, so make sure the previous R session has finished before starting a new one; os.system('pidof R') returns 0 if an R process is running
+ write_log_file('[update_network.py] Create edges.txt using TPM.txt (size=%d). create_edges_mixtool.R with %d components.' % (number_rnaseq_id(TPM_FILE), num_component), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'Rscript create_edges_mixtool.R %d' % (num_component)
+ os.system(cmd)
+
+
+def check_rnaseq_info():
+ # check rnaseq_info_database.txt and rnaseq_info_database.json, if they are outdated, then remind us to update it in log file.
+ if os.path.exists(RNA_SEQ_INFO_DATABASE):
+        if age_of_file_in_days(RNA_SEQ_INFO_DATABASE) > 90: # older than 90 days
+ write_log_file('[update_network.py] Need update %s. It is %d days old.' % (RNA_SEQ_INFO_DATABASE, age_of_file_in_days(RNA_SEQ_INFO_DATABASE)), UPDATE_NETWORK_LOG_FILE)
+ else:
+ write_log_file('[update_network.py] [MISSING] Must create %s.' % (RNA_SEQ_INFO_DATABASE), UPDATE_NETWORK_LOG_FILE)
+
+ if os.path.exists(RNA_SEQ_INFO_DATABASE_JSON):
+ if age_of_file_in_days(RNA_SEQ_INFO_DATABASE_JSON) > 90:
+ write_log_file('[update_network.py] Need update %s. It is %d days old.' % (RNA_SEQ_INFO_DATABASE_JSON, age_of_file_in_days(RNA_SEQ_INFO_DATABASE_JSON)), UPDATE_NETWORK_LOG_FILE)
+ else:
+ write_log_file('[update_network.py] [MISSING] Must create %s.' % (RNA_SEQ_INFO_DATABASE_JSON), UPDATE_NETWORK_LOG_FILE)
+
+
+# def check_process(name):
+# ''' If a process name exists, return 1; otherwise return 0.'''
+# os.system('ps -eF | grep \'%s\' > ../Data/running_processes.txt' % (name))
+# f = open('../Data/running_processes.txt')
+# lines = f.readlines()
+# f.close()
+# for line in lines:
+# line = line.strip()
+# lst = line.split()
+# if 'python' in lst[-2] and name in lst[-1]:
+# return 1
+# return 0
+
+
+
+## main
+
+# if check_process('update_network.py') == 1: # the old update_network.py is running
+# write_log_file('[update_network.py] update_network.py has not finished yet.', UPDATE_NETWORK_LOG_FILE)
+# sys.exit()
+
+
+FILE_LIST_TO_CHECK = [PARAMETER_FOR_BUILDCMATRIX, PARAMETER_FOR_BUILDRMATRIX, PARAMETER_FOR_NET, \
+ MERGED_EDGE_FILE, BINDING_FILE, TPM_FILE] # a list of very important files
+
+make_important_dirs() # make important directories (if non-existent) for holding various kinds of files; paths are relative, so run this from within Code/
+#validate_webapp_dir(PARAMETER_FOR_NET) # make sure the directory Webapp contains necessary files, e.g., genes.json.
+
+check_rnaseq_info() # RNA-seq information is useful for displaying scatterplots
+
+# Make sure all necessary files are present, if not, make them if possible
+miss_lst = all_files_present(FILE_LIST_TO_CHECK) # check if any of them are missing
+if miss_lst != []: # miss_lst is non-empty in the beginning.
+ print('These mandatory files are missing: %s.\nPrepare them first.' % (' '.join(miss_lst)))
+ write_log_file('[update_network.py] Cannot find these required files:%s' % (' '.join(miss_lst)), UPDATE_NETWORK_LOG_FILE)
+
+ # initially, we (at most) only have three parameter files, no binding.txt, TPM.txt or edges.txt ...
+ important_miss_number = 0
+ if PARAMETER_FOR_BUILDCMATRIX in miss_lst:
+ print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_BUILDCMATRIX))
+ important_miss_number += 1
+
+ if PARAMETER_FOR_BUILDRMATRIX in miss_lst:
+ print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_BUILDRMATRIX))
+ important_miss_number += 1
+
+ if PARAMETER_FOR_NET in miss_lst:
+ print('[update_network.py]: must prepare %s first.' % (PARAMETER_FOR_NET))
+ important_miss_number += 1
+
+ if important_miss_number > 0:
+ sys.exit() # need to provide all the above three files; otherwise cannot proceed
+
+ if BINDING_FILE in miss_lst:
+ print('[update_network.py]: make initial binding.txt ... wait')
+ write_log_file('[update_network.py] Make initial binding.txt', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 get_binding.py %s' % (PARAMETER_FOR_BUILDCMATRIX)
+ #os.system(cmd)
+ cmd = 'python3 buildCmatrix.py %s > %s' % (PARAMETER_FOR_BUILDCMATRIX, BINDING_FILE)
+ #os.system(cmd)
+        print('[update_network.py]: IMPORTANT: make sure BINDING_MATRIX in %s is set to %s, then rerun update_network.py.' % (PARAMETER_FOR_NET, BINDING_FILE))
+ sys.exit()
+
+ if TPM_FILE in miss_lst:
+ print('[update_network.py]: make initial TPM.txt ... wait')
+ write_log_file('[update_network.py] Make initial TPM.txt', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 buildRmatrix.py %s' % (PARAMETER_FOR_BUILDRMATRIX) # produce TPM.txt
+ #os.system(cmd)
+        print('[update_network.py]:IMPORTANT: make sure EXPRESSION_MATRIX in %s is set to %s, then rerun update_network.py.' % (PARAMETER_FOR_NET, TPM_FILE))
+ sys.exit()
+
+ miss_lst2 = all_files_present(FILE_LIST_TO_CHECK) # check files again
+ if len(miss_lst2) == 1 and miss_lst2[0] == MERGED_EDGE_FILE: # all other files are ready except edges.txt, make one.
+ print('[update_network.py]: make initial edges.txt ... wait')
+        create_edges0()
+
+
+# Make json2 (sliced binding.txt) if it does not exist. Copy json2 to
+# the web application folder static/edges [manual] for displaying
+# binding strength plots.
+if not os.path.isdir('../Data/history/bind/json2') and os.path.exists(BINDING_FILE):
+ write_log_file('Make directory ../Data/history/bind/json2. Don\'t forget to copy json2 to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET)
+ os.system(cmd)
+
+
+# Make json (sliced TPM.txt) if it does not exist. Copy json to the
+# web application folder static/edges [manual] for displaying gene
+# expression scatterplots.
+if not os.path.isdir('../Data/history/expr/json') and os.path.exists(TPM_FILE):
+ write_log_file('Make directory ../Data/history/expr/json. Don\'t forget to copy json to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 slice_TPM_to_JSON.py %s' % (PARAMETER_FOR_NET)
+ os.system(cmd)
+
+
+# Make sure parameter files are present and valid (rudimentary check but important)
+validate_parameter_for_buildcmatrix(PARAMETER_FOR_BUILDCMATRIX)
+validate_parameter_for_buildrmatrix(PARAMETER_FOR_BUILDRMATRIX)
+validate_parameter_for_net(PARAMETER_FOR_NET)
+
+
+# If the file timestamp does not exist, create one
+if not os.path.exists(FILE_TIMESTAMP):
+ record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)
+
+# get update time of mandatory files
+timestamp_dict = read_file_timestamp(FILE_TIMESTAMP)
+
+
+
+################## binding.txt stuff #####################################
+# Check parameter_for_buildCmatrix.txt
+updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
+if 'parameter_for_buildCmatrix.txt' in updated_file_list and not hold_on(PARAMETER_FOR_BUILDCMATRIX):
+ write_log_file('[update_network.py] Parameter file %s has been updated.' % (PARAMETER_FOR_BUILDCMATRIX), UPDATE_NETWORK_LOG_FILE)
+ write_log_file('[update_network.py] Make binding column files', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 get_binding.py %s' % (PARAMETER_FOR_BUILDCMATRIX) # won't re-compute existing binding columns unless updated
+ os.system(cmd)
+
+
+ # # We will only consider ChIP-seq IDs that are less than 7 days
+ # # old. Make sure put 'update:yymmdd' in the 'NOTE:' field in
+ # # parameter_for_buildCmatrix.txt for each newly added ChIP-seq
+ # # data.
+ # write_log_file('[update_network.py] Build binding matrix from recently added/modified ChIP-seq data.', UPDATE_NETWORK_LOG_FILE)
+ # TEMP_BINDING_FILE = BINDING_FILE + '.temp'
+ # cmd = 'python3 buildCmatrix.py %s > %s' % (PARAMETER_FOR_BUILDCMATRIX, TEMP_BINDING_FILE)
+ # os.system(cmd)
+
+ # # If someone just touched prameter_for_buildCmatrix.txt without
+ # # adding any new ChIP-seq data, we should do nothing.
+ # if validate_binding_file(TEMP_BINDING_FILE):
+ # write_log_file('[update_network.py] Overwrite binding.txt.', UPDATE_NETWORK_LOG_FILE)
+    # cmd = 'mv %s %s' % (TEMP_BINDING_FILE, BINDING_FILE) # Overwrite binding.txt. Make it formal.
+ # os.system(cmd)
+ # write_log_file('[update_network.py] binding.txt is updated. Number of columns in %s = %d.' % (BINDING_FILE, num_ids(BINDING_FILE)), UPDATE_NETWORK_LOG_FILE)
+
+ # write_log_file('[update_network.py] Update target tf file %s.' % (TARGET_TF_FILE), UPDATE_NETWORK_LOG_FILE)
+ # cmd = 'python3 make_target_tf.py %s > %s' % (PARAMETER_FOR_NET, TARGET_TF_FILE)
+ # os.system(cmd)
+ # else:
+ # write_log_file('[update_network.py] [WARNING] Invalid binding matrix.', UPDATE_NETWORK_LOG_FILE)
+ # os.remove(TEMP_BINDING_FILE)
+
+
+updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
+if 'binding.txt' in updated_file_list:
+ write_log_file('[update_network.py] binding.txt has been updated. This update will take effect next time TPM.txt is updated.', UPDATE_NETWORK_LOG_FILE)
+ # create_edges0()
+ # create_edges0B()
+ # wedge()
+ # correlation_per_group()
+ # correlation_per_group_fixed_number()
+ # correlation_mixtools(2)
+ # correlation_mixtools(3)
+
+ ## TODO mixtool stuff, forget it for now.
+ #cmd = 'nohup python3 create_edges4.py %s &' % (temp_file_name)
+ #os.system(cmd)
+
+
+
+
+################## TPM.txt stuff #####################################
+
+# update parameter_for_buildRmatrix.txt periodically and automatically.
+if datetime.now().day % PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL == 0: # periodically (every PARAMETER_FOR_BUILDRMATRIX_RENEW_INTERVAL days of the month) check whether parameter_for_buildRmatrix.txt needs updating
+ curr_time = datetime.now().strftime('%Y%m%d%H%M')
+ new_parameter_file = '../Data/temp/parameter_for_buildRmatrix.%s' % (curr_time)
+ cmd = 'python3 make_parameter_rnaseq.py > %s' % (new_parameter_file) # new_parameter_file will not be updated unless download_and_map.py has finished.
+ os.system(cmd)
+ num = number_rnaseq_diff(new_parameter_file, TPM_FILE)
+ if num >= MIN_RNA_SEQ_INCREASE: # sufficient number of RNA-seq samples have been added
+ write_log_file('[update_network.py] Update %s' % (PARAMETER_FOR_BUILDRMATRIX), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'cp %s %s' % (new_parameter_file, PARAMETER_FOR_BUILDRMATRIX)
+ os.system(cmd)
+
+ # Before we rewrite TPM.txt, we should backup the old TPM.txt
+ # write_log_file('[update_network.py] Backup %s' % (TPM_FILE), UPDATE_NETWORK_LOG_FILE)
+ # cmd = 'cp %s %s' % (TPM_FILE, TPM_FILE + '.backup.at.' + curr_time)
+ # os.system(cmd)
+
+ # write_log_file('[update_network.py] Rebuild %s' % (TPM_FILE), UPDATE_NETWORK_LOG_FILE)
+ # cmd = 'python3 buildRmatrix.py ../Data/parameter/parameter_for_buildRmatrix.txt'
+ # os.system(cmd)
+
+ else:
+ write_log_file('[update_network.py] You have downloaded %d RNA-seq since last build of TPM.txt. TPM.txt will be rebuilt if this number reaches %d.' % (num, MIN_RNA_SEQ_INCREASE), UPDATE_NETWORK_LOG_FILE)
+
+
+# Check if parameter_for_buildRmatrix.txt has been updated
+updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
+# TODO To simplify things, I will provide TPM.txt directly. So set the
+# HOLDON option to YES in parameter_for_buildRmatrix.txt to prevent
+# the following from being True.
+if 'parameter_for_buildRmatrix.txt' in updated_file_list and not hold_on(PARAMETER_FOR_BUILDRMATRIX):
+ write_log_file('[update_network.py] Parameter file %s has been updated.' % (PARAMETER_FOR_BUILDRMATRIX), UPDATE_NETWORK_LOG_FILE)
+ write_log_file('[update_network.py] Rebuild TPM.txt ...', UPDATE_NETWORK_LOG_FILE)
+ curr_time = datetime.now().strftime('%Y%m%d%H%M%S')
+ if os.path.exists(TPM_FILE):
+ backup_file_name = '../Data/history/expr/TPM.txt.backup.at.%s' % (curr_time)
+ cmd = 'cp %s %s' % (TPM_FILE, backup_file_name)
+ os.system(cmd)
+ cmd = 'gzip %s' % (backup_file_name)
+ os.system(cmd)
+
+    cmd = 'python3 buildRmatrix.py %s' % (PARAMETER_FOR_BUILDRMATRIX) # produce TPM.txt, the location of which is specified by TPM_TABLE in buildRmatrix.py
+ os.system(cmd)
+
+ curr_date = datetime.now().strftime('%Y%m%d')
+ tpm_sample_size = number_rnaseq_id(TPM_FILE)
+ write_sample_size_file(SAMPLE_SIZE_FILE, curr_date, tpm_sample_size)
+
+
+
+# Create edges using all RNA-seq experiments
+updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
+if 'TPM.txt' in updated_file_list: # we could touch TPM.txt to make it recent. We will recompute edges using the full binding.txt.
+ # Make a full binding.txt since we are going to use the new TPM.txt to recompute all edges
+ write_log_file('[update_network.py] Build full binding matrix for the new TPM.txt.', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 buildCmatrix.py %s include-all > %s' % (PARAMETER_FOR_BUILDCMATRIX, BINDING_FILE) # include all ChIP-seq IDs. Pay attention to include-all in the command-line argument.
+ os.system(cmd)
+
+ # target_tf.txt
+ write_log_file('[update_network.py] Make target_tf.txt.', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 make_target_tf.py %s > %s' % (PARAMETER_FOR_NET, TARGET_TF_FILE)
+ os.system(cmd)
+
+ write_log_file('[update_network.py] Update ../Data/history/expr/json using the new TPM.txt. Don\'t forget to update the static/edges/json folder in the web application.', UPDATE_NETWORK_LOG_FILE)
+ ## json -- make/renew json directory for displaying scatterplots
+ cmd = 'python3 slice_TPM_to_JSON.py %s' % (PARAMETER_FOR_NET)
+ ## os.system(cmd) # turn this on if we are going to use this TPM.txt for displaying scatterplots
+ write_log_file('[update_network.py] Update directory ../Data/history/bind/json2. Don\'t forget to copy json2 to static/edges in the web application.', UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 slice_binding_to_JSON.py %s' % (PARAMETER_FOR_NET)
+    #os.system(cmd) # turn this on if we are going to use this binding.txt for displaying bar charts of binding strengths
+ ## copy ../Data/history/bind/json2 and ../Data/history/expr/json to the web application folder 'static/edges' [manual]
+
+ if False: # TODO For now I will always use travadb's TPM.txt (138 columns) to display scatterplots. Simpler and faster.
+ write_log_file('Assign tissue, refine tissue and update rnaseq_info_database.json', UPDATE_NETWORK_LOG_FILE)
+ os.environ["PYTHONIOENCODING"] = "UTF-8" # for non-ascii letters in ENA RNA-sample description. If this statement does not work, try 'export PYTHONIOENCODING=UTF-8' in the command line instead. The export command can be put in crontab -e before running this script
+ cmd = 'python3 assign_tissue.py > ../Data/temp/experiment.and.tissue.1.txt'
+ os.system(cmd)
+ cmd = 'python3 refine_tissue.py > ../Data/information/experiment.and.tissue.2.txt'
+ os.system(cmd)
+ cmd = 'python3 update_rnaseq_info_json.py'
+ os.system(cmd)
+
+
+
+ # Compute edges. This could take a lot of time so update FILE_TIMESTAMP first.
+ record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)
+ create_edges0()
+ create_edges0B()
+ wedge()
+ correlation_per_group()
+ correlation_per_group_fixed_number()
+ #correlation_mixtools(2)
+ #correlation_mixtools(3)
+
+
+########## Merge edges #######################
+# update edges.txt, a merged file from two sources, HISTORY_DIR and HISTORY_DIR2. Some new edge files are being generated ...
+time.sleep(5)
+edge_file_lst = [] # collect edge files.
+most_recent_edge_modification_time = 0
+write_log_file('[update_network.py] Look at edge files in %s.' % (HISTORY_DIR), UPDATE_NETWORK_LOG_FILE)
+for fname in glob.glob(os.path.join(HISTORY_DIR, 'edges.txt.*')): # many small edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+ if os.path.getmtime(fname) > most_recent_edge_modification_time:
+ most_recent_edge_modification_time = os.path.getmtime(fname)
+
+write_log_file('[update_network.py] Look at edge files in %s.' % (HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
+for fname in glob.glob(os.path.join(HISTORY_DIR2, 'edges.txt.*')): # edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+ if os.path.getmtime(fname) > most_recent_edge_modification_time:
+ most_recent_edge_modification_time = os.path.getmtime(fname)
+
+
+if edge_file_lst == []:
+ write_log_file('[update_network.py] No edge files to merge in %s and %s.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
+elif os.path.getmtime(MERGED_EDGE_FILE) < most_recent_edge_modification_time: # update edges.txt only if there are newer edges to add.
+ # concatenate edge files into one
+ write_log_file('[update_network.py] Concatenate edge files in %s and %s into one file.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
+ curr_time = datetime.now().strftime('%Y%m%d_%H%M')
+ concatenate_edge_files(edge_file_lst, os.path.join(EDGE_POOL_DIR, 'edges.txt.many.one.targets.' + curr_time))
+ delete_edge_files(edge_file_lst)
+
+if os.path.getmtime(MERGED_EDGE_FILE) < os.path.getmtime(EDGE_POOL_DIR): # edge pool directory has been updated, create new edges.txt
+ write_log_file('[update_network.py] Make a new edges.txt from edge files in %s.' % (EDGE_POOL_DIR), UPDATE_NETWORK_LOG_FILE)
+ write_log_file('[update_network.py] Number of lines in the old edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 merge_edges.py'
+ os.system(cmd)
+ write_log_file('[update_network.py] Number of lines in the new edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
+    manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt; sudo find /home/lanhui/brain/Data/temp/html_edges -name "*.html" -exec mv -t /var/www/brain/brain/static/edges {} +'
+ write_log_file('[update_network.py] %s' % (manual_copy_commands), UPDATE_NETWORK_LOG_FILE)
+
+
+# exclude edges as suggested by Phil Wigge.
+# write_log_file('Exclude edges (now ineffective)', UPDATE_NETWORK_LOG_FILE)
+# cmd = 'python3 exclude_edges.py %s' % (EDGE_FILE)
+#os.system(cmd)
+
+
+# # check if parameter_for_net.txt, or TPM.txt is updated, if yes, create edges.
+# updated_file_list = get_updated_files(FILE_LIST_TO_CHECK, timestamp_dict)
+# if ('parameter_for_net.txt' in updated_file_list or 'TPM.txt' in updated_file_list) and not hold_on(PARAMETER_FOR_NET):
+# write_log_file('Create edges.txt using new TPM.txt (size=%d) ...' % (number_rnaseq_id(TPM_FILE)), UPDATE_NETWORK_LOG_FILE)
+# time.sleep(7200) # wait one hour for the previous create_edges4.py (if any) to finish creating JSON_DIR and target_tf_fname
+# cmd = 'nohup python3 create_edges4.py %s &' % (PARAMETER_FOR_NET) # put process to background
+# os.system(cmd)
+# time.sleep(60)
+
+
+# remove .R files in ../Data/temp. Files older than 3 days will be removed
+cmd = 'find %s -mtime +2 -name \"*.R\" -delete' % (TEMP_DIR)
+os.system(cmd)
+
+# update time stamp file
+record_file_time(FILE_LIST_TO_CHECK, FILE_TIMESTAMP)
+
+write_log_file('[update_network.py] Update done at %s.\n\n' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S')), UPDATE_NETWORK_LOG_FILE)
+
diff --git a/Code/update_network_by_force.py b/Code/update_network_by_force.py
new file mode 100644
index 0000000..7ba8c87
--- /dev/null
+++ b/Code/update_network_by_force.py
@@ -0,0 +1,113 @@
+# Usage: python3 update_network_by_force.py
+# Purpose: update_network.py could take a few days to run. Run this script to harvest new edges every day.
+#
+# Revision history:
+# Last modified: 24 Nov 2019, hui <lanhui@zjnu.edu.cn>
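+#
+# A possible cron entry (the time and path are hypothetical):
+# 30 05 * * * cd /home/hui/network/v03/Code && python3 update_network_by_force.py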
+
+import os, sys
+import glob
+import time
+from datetime import datetime
+from configure import HISTORY_DIR, HISTORY_DIR2, UPDATE_NETWORK_LOG_FILE, MERGED_EDGE_FILE, EDGE_POOL_DIR
+
+########## Helper functions #######################
+def write_log_file(s, fname):
+ f = open(fname, 'a')
+ curr_time = datetime.now().strftime('%Y-%m-%d %H:%M')
+ s = '[' + curr_time + ']: ' + s
+ if not '\n' in s:
+ s += '\n'
+ f.write(s)
+ f.close()
+ print('Log: %s' % (s.strip()))
+
+
+def num_line(fname):
+ ''' Return number of lines in file fname. '''
+ if not os.path.exists(fname):
+ return 0
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ return len(lines)
+
+
+def lines_with_10_fields(s):
+ result = []
+ for line in s.split('\n'):
+ line = line.strip()
+ if len(line.split('\t')) == 10:
+ result.append(line)
+ return result
+
+
+def age_of_file_in_seconds(fname):
+    ''' Return age of fname in seconds. '''
+ st = os.stat(fname)
+ seconds = time.time() - st.st_mtime
+ return seconds
+
+def concatenate_edge_files(fname_lst, fname_out):
+ fout = open(fname_out, 'w')
+ for fname in fname_lst:
+ f = open(fname)
+ s = f.read()
+ f.close()
+ # Make sure each edge has 10 fields before writing.
+ lines = lines_with_10_fields(s)
+ if lines != []:
+ write_log_file('[update_network_by_force.py] In function concatenate_edge_files. File %s has %d rows with 10 columns.' % (fname, len(lines)), UPDATE_NETWORK_LOG_FILE)
+ fout.write('\n'.join(lines) + '\n')
+ else:
+ write_log_file('[update_network_by_force.py] In function concatenate_edge_files. Check file %s. It has no rows with 10 fields.' % (fname), UPDATE_NETWORK_LOG_FILE)
+ fout.close()
+
+
+def delete_edge_files(fname_lst):
+ age_in_hours = 6
+ for fname in fname_lst:
+ # Before we delete, we should make sure it is not being written. Make sure it is old enough. Otherwise, don't delete.
+ if age_of_file_in_seconds(fname) > age_in_hours*60*60: # 6 hours
+ os.remove(fname)
+ else:
+ write_log_file('[update_network_by_force.py] In function delete_edge_files. Check file %s. It is probably still being written (age less than %d hours). So I don\'t delete it.' % (fname, age_in_hours), UPDATE_NETWORK_LOG_FILE)
+
+########## Merge edges #######################
+# update edges.txt, a merged file from two sources, HISTORY_DIR and HISTORY_DIR2. Some new edge files are being generated ...
+time.sleep(3)
+edge_file_lst = [] # collect edge files.
+most_recent_edge_modification_time = 0
+write_log_file('[update_network_by_force.py] Look at edge files in %s.' % (HISTORY_DIR), UPDATE_NETWORK_LOG_FILE)
+for fname in glob.glob(os.path.join(HISTORY_DIR, 'edges.txt.*')): # many small edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+ if os.path.getmtime(fname) > most_recent_edge_modification_time:
+ most_recent_edge_modification_time = os.path.getmtime(fname)
+
+write_log_file('[update_network_by_force.py] Look at edge files in %s.' % (HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
+for fname in glob.glob(os.path.join(HISTORY_DIR2, 'edges.txt.*')): # edges.txt.* are to be merged
+ edge_file_lst.append(fname)
+ if os.path.getmtime(fname) > most_recent_edge_modification_time:
+ most_recent_edge_modification_time = os.path.getmtime(fname)
+
+
+if edge_file_lst == []:
+ write_log_file('[update_network_by_force.py] No edge files to merge in %s and %s.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
+elif os.path.getmtime(MERGED_EDGE_FILE) < most_recent_edge_modification_time: # update edges.txt only if there are newer edges to add.
+ # concatenate edge files into one
+ write_log_file('[update_network_by_force.py] Concatenate edge files in %s and %s into one file.' % (HISTORY_DIR, HISTORY_DIR2), UPDATE_NETWORK_LOG_FILE)
+ curr_time = datetime.now().strftime('%Y%m%d_%H%M')
+ concatenate_edge_files(edge_file_lst, os.path.join(EDGE_POOL_DIR, 'edges.txt.many.one.targets.' + curr_time))
+ delete_edge_files(edge_file_lst)
+
+if os.path.getmtime(MERGED_EDGE_FILE) < os.path.getmtime(EDGE_POOL_DIR): # edge pool directory has been updated, create new edges.txt
+ write_log_file('[update_network_by_force.py] Make a new edges.txt from edge files in %s.' % (EDGE_POOL_DIR), UPDATE_NETWORK_LOG_FILE)
+ write_log_file('[update_network_by_force.py] Number of lines in the old edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
+ cmd = 'python3 merge_edges.py'
+ os.system(cmd)
+ write_log_file('[update_network_by_force.py] Number of lines in the new edges.txt: %d.' % (num_line(MERGED_EDGE_FILE)), UPDATE_NETWORK_LOG_FILE)
+    manual_copy_commands = 'Please copy files to the web application: sudo cp /home/lanhui/brain/Data/temp/edges.txt /var/www/brain/brain/static/edges/edges.txt; sudo find /home/lanhui/brain/Data/temp/html_edges -name "*.html" -exec mv -t /var/www/brain/brain/static/edges {} +'
+ write_log_file('[update_network_by_force.py] %s' % (manual_copy_commands), UPDATE_NETWORK_LOG_FILE)
+
+
+
+write_log_file('[update_network_by_force.py] Update done at %s.\n\n' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S')), UPDATE_NETWORK_LOG_FILE)
diff --git a/Code/update_rnaseq_info_json.py b/Code/update_rnaseq_info_json.py
new file mode 100644
index 0000000..4d6b654
--- /dev/null
+++ b/Code/update_rnaseq_info_json.py
@@ -0,0 +1,89 @@
+# Usage: python update_rnaseq_info_json.py
+# Provide two files old_json and tissue_file
+#
+# Purpose: update the tissue field in rnaseq_info_database.json, and make
+# Data/information/experiment.and.tissue.txt, in which the tissues of RNA-seq
+# samples previously marked 'unknown' are predicted using knn_classify.R (K=1).
+#
+# 2 June 2017, slcu, hui
+# Last modified 19 June 2017, slcu, hui
+
+import json, os, sys
+
+def get_sra_id(x):
+ if 'RR' in x:
+ index1 = x.find('RR')
+ index2 = x.find('X')
+ if index2 == -1:
+ index2 = len(x)
+ return x[index1-1:index2]
+ return x
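+# get_sra_id() sketch (IDs are hypothetical): 'SRR1234567X1' -> 'SRR1234567';
+# 'ERR000111' -> 'ERR000111'; a string without 'RR' is returned unchanged.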
+
+def make_tissue_dict(fname):
+ f = open(fname)
+ lines = f.readlines()
+ d = {}
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split('\t')
+ x = lst[0]
+ y = get_sra_id(x)
+ d[y] = lst[4]
+ f.close()
+ return d
+
+def update_tissue_dict_and_tissue_file(d, fname, fname_pred):
+
+ f = open(fname_pred) # predicted file, columns are sample.name and predicted.tissue
+ lines = f.readlines()
+ f.close()
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split('\t')
+ if line != '' and len(lst) >= 2:
+ y = get_sra_id(lst[0])
+ d[y] = lst[1]
+
+ f = open(fname)
+ lines = f.readlines()
+ head_line = lines[0].strip()
+ f.close()
+ file_lines = [head_line]
+ for line in lines[1:]:
+ line = line.strip()
+ lst = line.split('\t')
+ if line != '' and len(lst) >= 5:
+ k = get_sra_id(lst[0])
+ if lst[4] == 'unknown' and k in d:
+ lst[4] = d[k]
+ file_lines.append('\t'.join(lst))
+    outfile = '../Data/information/experiment.and.tissue.txt' # so that the outfile does not contain 'unknown'
+ f = open(outfile, 'w')
+ f.write('\n'.join(file_lines) + '\n')
+ f.close()
+ return d
+
+
+# main
+RSCRIPT_FILE = 'knn_classify.R'
+old_json = '../Data/information/rnaseq_info_database.json' # generated by parse_ena_xml.py
+tissue_file = '../Data/information/experiment.and.tissue.2.txt' # generated by refine_tissue.py
+tissue_dict = make_tissue_dict(tissue_file)
+if os.path.exists(RSCRIPT_FILE):
+ cmd = 'Rscript %s' % (RSCRIPT_FILE) # generate ../Data/temp/predicted.label.txt
+ os.system(cmd)
+ tissue_dict = update_tissue_dict_and_tissue_file(tissue_dict, tissue_file, '../Data/temp/predicted.label.txt')
+
+with open(old_json) as json_data:
+ json_dict = json.load(json_data)
+ for k in json_dict:
+ if k in tissue_dict:
+ json_dict[k]['tissue'] = tissue_dict[k]
+
+cmd = 'cp %s ../Data/information/rnaseq_info_database.json.old' % (old_json)
+os.system(cmd)
+fname = old_json
+with open(fname, 'w') as f:
+ json.dump(json_dict, f, indent=4)
+
+print('Check updated %s.' % (old_json))
diff --git a/Code/validate_parameter_for_buildCmatrix.py b/Code/validate_parameter_for_buildCmatrix.py
new file mode 100644
index 0000000..ced6062
--- /dev/null
+++ b/Code/validate_parameter_for_buildCmatrix.py
@@ -0,0 +1,85 @@
+# Usage: python validate_parameter_for_buildCmatrix.py
+# Purpose: make sure all files exist.
+# Hui 24 Jan 2018 Jinhua
+
+import os, sys
+import numpy as np
+import glob
+import time
+import subprocess
+from datetime import datetime
+
+def get_value(s, delimit):
+    lst = s.split(delimit, 1) # only split at the first delimiter
+ return lst[1].strip()
+
+def get_key_value(s):
+ lst = s.split('=')
+ k, v = lst[0], lst[1]
+ return (k.strip(), v.strip())
+
+def validate_gene_file(fname):
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ for line in lines: # check all lines
+ line = line.strip()
+ lst = line.split('\t')
+ if len(lst) < 6:
+            print('Not enough fields: %s. Only %d are given. Each line must have gene_id, gene_name, chr, start, end, strand, description (optional). See prepare_gene_file.py in the documentation on how to prepare this file.' % (line, len(lst)))
+ sys.exit()
+
+def validate_parameter_for_buildcmatrix(fname):
+ # first the file must exist
+ if not os.path.exists(fname):
+ print('CANNOT FIND %s.' % (fname))
+ sys.exit()
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+ d = {}
+ location_count = 0
+ for line in lines:
+ line = line.strip()
+ if line.startswith('%%'):
+ k, v = get_key_value(line[2:])
+ d[k] = v
+ if k == 'GENE_FILE' or k == 'CHR_INFO':
+ if not os.path.exists(v):
+                    print('%s does not exist.' % (v))
+ sys.exit()
+ if k == 'GENE_FILE':
+ validate_gene_file(v)
+ if k == 'DESTINATION':
+ if not os.path.isdir(v):
+                    print('%s does not exist.' % (v))
+ sys.exit()
+ if k == 'TARGET_RANGE':
+ if int(v) <= 0:
+                    print('Target range (%s) must be greater than 0.' % (v))
+ sys.exit()
+ if line.startswith('LOCATION:'):
+ v = get_value(line, ':')
+ location_count += 1
+ if not os.path.exists(v):
+                print('Location %s does not exist.' % (v))
+ #sys.exit()
+
+ if not 'GENE_FILE' in d:
+ print('Must specify GENE_FILE.')
+ sys.exit()
+ if not 'DESTINATION' in d:
+ print('Must specify DESTINATION.')
+ sys.exit()
+ if not 'CHR_INFO' in d:
+ print('Must specify CHR_INFO.')
+ sys.exit()
+ if location_count == 0:
+ print('Must contain at least one ChIP-seq.')
+ sys.exit()
+
+## main
+
+PARAMETER_FOR_BUILDCMATRIX = '../Data/parameter/parameter_for_buildCmatrix.txt'
+validate_parameter_for_buildcmatrix(PARAMETER_FOR_BUILDCMATRIX)
+
diff --git a/Code/wedge.R b/Code/wedge.R
new file mode 100644
index 0000000..50039eb
--- /dev/null
+++ b/Code/wedge.R
@@ -0,0 +1,138 @@
+# Last modified on 7 Aug 2019 by Hui Lan @ Jinhua
+#DATA.FILE <- '../Data/history/expr/TPM.txt.3130'
+DATA.FILE <- '../Data/history/expr/TPM.txt'
+TARGET.TF.FILE <- '../Data/information/target_tf.txt'
+AGINAME.FILE <- '../Data/information/AGI-to-gene-names_v2.txt'
+ONE.TARGET.DIR <- '../Data/history/edges/one_target'
+
+# Make sure we have required files and directory
+if (! file.exists(DATA.FILE)) {
+ stop(sprintf('[wedge.R] Unable to find %s', DATA.FILE))
+}
+
+if (! file.exists(TARGET.TF.FILE)) {
+ stop(sprintf('[wedge.R] Unable to find %s', TARGET.TF.FILE))
+}
+
+if (! file.exists(AGINAME.FILE)) {
+ stop(sprintf('[wedge.R] Unable to find %s', AGINAME.FILE))
+}
+
+
+if (! dir.exists(ONE.TARGET.DIR)) {
+ stop(sprintf('[wedge.R] Unable to find directory %s', ONE.TARGET.DIR))
+}
+
+
+r.tau <- 0.60
+
+cat(sprintf('Read %s\n', DATA.FILE))
+X <- read.table(DATA.FILE, header=TRUE, check.names=FALSE)
+all.id <- X$gene_id
+X$gene_id <- NULL # remove column gene_id
+row.names(X) <- all.id # add row names
+all.genes <- rownames(X)
+
+cat(sprintf('Read %s\n', AGINAME.FILE))
+#agi <- read.table(AGINAME.FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+agi <- read.table(AGINAME.FILE, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+
+cat(sprintf('Read %s\n', TARGET.TF.FILE))
+target.tf <- read.table(TARGET.TF.FILE, header=FALSE, check.names=FALSE, sep='\t')
+total.pair <- dim(target.tf)[1]
+
+###########################################################################
+post.translation.4 <- function(x, y) {
+ mx = mean(x)
+ index = (x > mx - 0.5) & (x < mx + 0.5)
+ slope = max(y[index])/mx
+ v = c(-slope, 1)
+ xy = as.matrix(cbind(x,y))
+ z = xy %*% v
+ index0 = which(z <= 0) # points below the wedge
+ index1 = which(z > 0) # points above the wedge
+    index2 = which(x <= 0.1) # where x is low, y is expected to be low too
+ if (length(index2) > 0) {
+ q = quantile(y[index2], 0.9)
+ m = mean(y[index2])
+ } else {
+ q = 0.0
+ m = 0.0
+ }
+ index3 = which(x < 1)
+ if (length(index3) > 0) {
+ m = mean(y[index3])
+ } else {
+ m = 0.0
+ }
+    # For a scatterplot to be considered wedge-shaped, the caller below
+    # requires percent > 0.95, q < 0.25, m < 1.0 and disp.y > 1.2 * disp.x.
+ result <- list(below=index0, upper=index1, percent=length(index0)/length(x), q=q, m=m, slope=slope, disp.x=sd(x)/mean(x), disp.y=sd(y)/mean(y))
+}
+
+
+make.data <- function(slope, n) {
+ x=abs(3.0 + 1*rnorm(n))
+ y=abs(3.0 + 1*rnorm(n))
+ v = c(-slope, 1)
+ xy = as.matrix(cbind(x,y))
+ z = xy %*% v
+ index = which(z <= 0)
+ result <- list(x=x[index], y=y[index])
+}
+###########################################################################
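+# Quick sanity check (a sketch): wedge-shaped data simulated with make.data()
+# should score well under post.translation.4(), e.g.
+# sim <- make.data(slope=1.5, n=2000)
+# res <- post.translation.4(sim$x, sim$y)
+# cat(res$percent, res$q, res$m, '\n') # expect percent near 1, small q and m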
+
+cat(sprintf('Go through pairs looking for wedge shapes ..\n'))
+
+output.file <- paste('../Data/history/edges/one_target/edges.txt', 'wedge', format(Sys.time(), "%b.%d.%Y.%H%M%S"), sep='.')
+f <- file(output.file, 'w')
+
+for (i in 1:total.pair) {
+ id1 <- as.vector(target.tf[i,2]) # tf
+ id2 <- as.vector(target.tf[i,1]) # target
+
+ all.in <- id1 %in% all.genes & id2 %in% all.genes
+ if (!all.in) {
+ next
+ }
+
+ x <- X[id1,]
+ y <- X[id2,]
+ x <- log(x+1)
+ y <- log(y+1)
+ x <- t(x)
+ y <- t(y)
+ na.ratio <- max(sum(is.na(x))/length(x), sum(is.na(y))/length(y))
+ index <- x < 0.01 | y < 0.01 | na.ratio > 0.5 # make sure very small values are not included
+ x <- x[!index, 1, drop=FALSE]
+ y <- y[!index, 1, drop=FALSE]
+
+ if (dim(x)[1] < 50) {
+ next
+ }
+
+ # We will not consider wedge shape if the correlation coefficient is large enough.
+ if (abs(cor(x, y)) < r.tau) {
+ result <- post.translation.4(x, y)
+ if (result$percent > 0.95 & result$q < 0.25 & result$m < 1.0 & result$disp.y > 1.2 * result$disp.x) {
+ #name1 <- agi[id1,1]
+ #name2 <- agi[id2,1]
+ name1 <- agi$V2[which(agi$V1 == id1)]
+ name2 <- agi$V2[which(agi$V1 == id2)]
+ max.r <- max(r.tau, result$percent * exp(-max(result$q, result$m)))
+ curr.date <- gsub('-','',Sys.Date())
+ loglik <- '-1001.0'
+ rna.sample <- row.names(x)[result$below] # below the diagonal line
+ #rna.sample.size <- length(rna.sample)
+ #rna.sample.2 <- sample(rna.sample, ceiling(rna.sample.size^0.7)) # to save space, keep only a fraction of the rnaseq sample IDs
+ sub.cond <- paste(rna.sample, collapse=' ')
+ sub.cond.length <- length(rna.sample)
+ cond <- as.vector(target.tf[i,3])
+ result2 <- sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, max.r, 'mix', sub.cond.length, cond, loglik, curr.date, max.r, 'wedge')
+
+ cat(result2, file=f, sep='')
+ }
+ }
+}
+
+close(f)