summaryrefslogtreecommitdiff
path: root/Code/wedge.R
diff options
context:
space:
mode:
authorHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
committerHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
commit97fdefab064f63642fa3ece05b807d29b459df31 (patch)
treea058530023224f3e35b1783996f3530c80c04bc5 /Code/wedge.R
brain: add python and R code to local repository.
Diffstat (limited to 'Code/wedge.R')
-rw-r--r--Code/wedge.R138
1 files changed, 138 insertions, 0 deletions
diff --git a/Code/wedge.R b/Code/wedge.R
new file mode 100644
index 0000000..50039eb
--- /dev/null
+++ b/Code/wedge.R
@@ -0,0 +1,138 @@
+# Last modified on 7 Agu 2019 by Hui Lan @ Jinhua
+#DATA.FILE <- '../Data/history/expr/TPM.txt.3130'
+DATA.FILE <- '../Data/history/expr/TPM.txt'
+TARGET.TF.FILE <- '../Data/information/target_tf.txt'
+AGINAME.FILE <- '../Data/information/AGI-to-gene-names_v2.txt'
+ONE.TARGET.DIR <- '../Data/history/edges/one_target'
+
+# Make sure we have required files and directory
+if (! file.exists(DATA.FILE)) {
+ stop(sprintf('[wedge.R] Unable to find %s', DATA.FILE))
+}
+
+if (! file.exists(TARGET.TF.FILE)) {
+ stop(sprintf('[wedge.R] Unable to find %s', TARGET.TF.FILE))
+}
+
+if (! file.exists(AGINAME.FILE)) {
+ stop(sprintf('[wedge.R] Unable to find %s', AGINAME.FILE))
+}
+
+
+if (! dir.exists(ONE.TARGET.DIR)) {
+ stop(sprintf('[wedge.R] Unable to find directory %s', ONE.TARGET.DIR))
+}
+
+
+r.tau <- 0.60
+
+cat(sprintf('Read %s\n', DATA.FILE))
+X <- read.table(DATA.FILE, header=TRUE, check.names=FALSE)
+all.id <- X$gene_id
+X$gene_id <- NULL # remove column gene_id
+row.names(X) <- all.id # add row names
+all.genes <- rownames(X)
+
+cat(sprintf('Read %s\n', AGINAME.FILE))
+#agi <- read.table(AGINAME.FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+agi <- read.table(AGINAME.FILE, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+
+cat(sprintf('Read %s\n', TARGET.TF.FILE))
+target.tf <- read.table(TARGET.TF.FILE, header=FALSE, check.names=FALSE, sep='\t')
+total.pair <- dim(target.tf)[1]
+
+###########################################################################
+post.translation.4 <- function(x, y) {
+ mx = mean(x)
+ index = (x > mx - 0.5) & (x < mx + 0.5)
+ slope = max(y[index])/mx
+ v = c(-slope, 1)
+ xy = as.matrix(cbind(x,y))
+ z = xy %*% v
+ index0 = which(z <= 0) # points below the wedge
+ index1 = which(z > 0) # points above the wedge
+ index2 = which(x <= 0.1) # x has low value, then y is expected to have low value too
+ if (length(index2) > 0) {
+ q = quantile(y[index2], 0.9)
+ m = mean(y[index2])
+ } else {
+ q = 0.0
+ m = 0.0
+ }
+ index3 = which(x < 1)
+ if (length(index3) > 0) {
+ m = mean(y[index3])
+ } else {
+ m = 0.0
+ }
+ # for a scatterplot to be considered a wedge shape, percent>0.90,q < 1 and
+ # m < slope, disp.x < disp.y
+ result <- list(below=index0, upper=index1, percent=length(index0)/length(x), q=q, m=m, slope=slope, disp.x=sd(x)/mean(x), disp.y=sd(y)/mean(y))
+}
+
+
+make.data <- function(slope, n) {
+ x=abs(3.0 + 1*rnorm(n))
+ y=abs(3.0 + 1*rnorm(n))
+ v = c(-slope, 1)
+ xy = as.matrix(cbind(x,y))
+ z = xy %*% v
+ index = which(z <= 0)
+ result <- list(x=x[index], y=y[index])
+}
+###########################################################################
+
+cat(sprintf('Go through pairs looking for wedge shapes ..\n'))
+
+output.file <- paste('../Data/history/edges/one_target/edges.txt', 'wedge', format(Sys.time(), "%b.%d.%Y.%H%M%S"), sep='.')
+f <- file(output.file, 'w')
+
+for (i in 1:total.pair) {
+ id1 <- as.vector(target.tf[i,2]) # tf
+ id2 <- as.vector(target.tf[i,1]) # target
+
+ all.in <- id1 %in% all.genes & id2 %in% all.genes
+ if (!all.in) {
+ next
+ }
+
+ x <- X[id1,]
+ y <- X[id2,]
+ x <- log(x+1)
+ y <- log(y+1)
+ x <- t(x)
+ y <- t(y)
+ na.ratio <- max(sum(is.na(x))/length(x), sum(is.na(y))/length(y))
+ index <- x < 0.01 | y < 0.01 | na.ratio > 0.5 # make sure very small values are not included
+ x <- x[!index, 1, drop=FALSE]
+ y <- y[!index, 1, drop=FALSE]
+
+ if (dim(x)[1] < 50) {
+ next
+ }
+
+ # We will not consider wedge shape if the correlation coefficient is large enough.
+ if (abs(cor(x, y)) < r.tau) {
+ result <- post.translation.4(x, y)
+ if (result$percent > 0.95 & result$q < 0.25 & result$m < 1.0 & result$disp.y > 1.2 * result$disp.x) {
+ #name1 <- agi[id1,1]
+ #name2 <- agi[id2,1]
+ name1 <- agi$V2[which(agi$V1 == id1)]
+ name2 <- agi$V2[which(agi$V1 == id2)]
+ max.r <- max(r.tau, result$percent * exp(-max(result$q, result$m)))
+ curr.date <- gsub('-','',Sys.Date())
+ loglik <- '-1001.0'
+ rna.sample <- row.names(x)[result$below] # below the diagonal line
+ #rna.sample.size <- length(rna.sample)
+ #rna.sample.2 <- sample(rna.sample, ceiling(rna.sample.size^0.7)) # to save space, keep only a fraction of the rnaseq sample IDs
+ sub.cond <- paste(rna.sample, collapse=' ')
+ sub.cond.length <- length(rna.sample)
+ cond <- as.vector(target.tf[i,3])
+ result2 <- sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, max.r, 'mix', sub.cond.length, cond, loglik, curr.date, max.r, 'wedge')
+
+ cat(result2, file=f, sep='')
+ }
+ }
+}
+
+close(f)