From 97fdefab064f63642fa3ece05b807d29b459df31 Mon Sep 17 00:00:00 2001 From: Hui Lan Date: Wed, 4 Dec 2019 19:03:19 +0800 Subject: brain: add python and R code to local repository. --- Code/wedge.R | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 Code/wedge.R (limited to 'Code/wedge.R') diff --git a/Code/wedge.R b/Code/wedge.R new file mode 100644 index 0000000..50039eb --- /dev/null +++ b/Code/wedge.R @@ -0,0 +1,138 @@ +# Last modified on 7 Agu 2019 by Hui Lan @ Jinhua +#DATA.FILE <- '../Data/history/expr/TPM.txt.3130' +DATA.FILE <- '../Data/history/expr/TPM.txt' +TARGET.TF.FILE <- '../Data/information/target_tf.txt' +AGINAME.FILE <- '../Data/information/AGI-to-gene-names_v2.txt' +ONE.TARGET.DIR <- '../Data/history/edges/one_target' + +# Make sure we have required files and directory +if (! file.exists(DATA.FILE)) { + stop(sprintf('[wedge.R] Unable to find %s', DATA.FILE)) +} + +if (! file.exists(TARGET.TF.FILE)) { + stop(sprintf('[wedge.R] Unable to find %s', TARGET.TF.FILE)) +} + +if (! file.exists(AGINAME.FILE)) { + stop(sprintf('[wedge.R] Unable to find %s', AGINAME.FILE)) +} + + +if (! dir.exists(ONE.TARGET.DIR)) { + stop(sprintf('[wedge.R] Unable to find directory %s', ONE.TARGET.DIR)) +} + + +r.tau <- 0.60 + +cat(sprintf('Read %s\n', DATA.FILE)) +X <- read.table(DATA.FILE, header=TRUE, check.names=FALSE) +all.id <- X$gene_id +X$gene_id <- NULL # remove column gene_id +row.names(X) <- all.id # add row names +all.genes <- rownames(X) + +cat(sprintf('Read %s\n', AGINAME.FILE)) +#agi <- read.table(AGINAME.FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes +agi <- read.table(AGINAME.FILE, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes + +cat(sprintf('Read %s\n', TARGET.TF.FILE)) +target.tf <- read.table(TARGET.TF.FILE, header=FALSE, check.names=FALSE, sep='\t') +total.pair <- dim(target.tf)[1] + +########################################################################### +post.translation.4 <- function(x, y) { + mx = mean(x) + index = (x > mx - 0.5) & (x < mx + 0.5) + slope = max(y[index])/mx + v = c(-slope, 1) + xy = as.matrix(cbind(x,y)) + z = xy %*% v + index0 = which(z <= 0) # points below the wedge + index1 = which(z > 0) # points above the wedge + index2 = which(x <= 0.1) # x has low value, then y is expected to have low value too + if (length(index2) > 0) { + q = quantile(y[index2], 0.9) + m = mean(y[index2]) + } else { + q = 0.0 + m = 0.0 + } + index3 = which(x < 1) + if (length(index3) > 0) { + m = mean(y[index3]) + } else { + m = 0.0 + } + # for a scatterplot to be considered a wedge shape, percent>0.90,q < 1 and + # m < slope, disp.x < disp.y + result <- list(below=index0, upper=index1, percent=length(index0)/length(x), q=q, m=m, slope=slope, disp.x=sd(x)/mean(x), disp.y=sd(y)/mean(y)) +} + + +make.data <- function(slope, n) { + x=abs(3.0 + 1*rnorm(n)) + y=abs(3.0 + 1*rnorm(n)) + v = c(-slope, 1) + xy = as.matrix(cbind(x,y)) + z = xy %*% v + index = which(z <= 0) + result <- list(x=x[index], y=y[index]) +} +########################################################################### + +cat(sprintf('Go through pairs looking for wedge shapes ..\n')) + +output.file <- paste('../Data/history/edges/one_target/edges.txt', 'wedge', format(Sys.time(), "%b.%d.%Y.%H%M%S"), sep='.') +f <- file(output.file, 'w') + +for (i in 1:total.pair) { + id1 <- as.vector(target.tf[i,2]) # tf + id2 <- as.vector(target.tf[i,1]) # target + + all.in <- id1 %in% all.genes & id2 %in% all.genes + if (!all.in) { + next + } + + x <- X[id1,] + y <- X[id2,] + x <- log(x+1) + y <- log(y+1) + x <- t(x) + y <- t(y) + na.ratio <- max(sum(is.na(x))/length(x), sum(is.na(y))/length(y)) + index <- x < 0.01 | y < 0.01 | na.ratio > 0.5 # make sure very small values are not included + x <- x[!index, 1, drop=FALSE] + y <- y[!index, 1, drop=FALSE] + + if (dim(x)[1] < 50) { + next + } + + # We will not consider wedge shape if the correlation coefficient is large enough. + if (abs(cor(x, y)) < r.tau) { + result <- post.translation.4(x, y) + if (result$percent > 0.95 & result$q < 0.25 & result$m < 1.0 & result$disp.y > 1.2 * result$disp.x) { + #name1 <- agi[id1,1] + #name2 <- agi[id2,1] + name1 <- agi$V2[which(agi$V1 == id1)] + name2 <- agi$V2[which(agi$V1 == id2)] + max.r <- max(r.tau, result$percent * exp(-max(result$q, result$m))) + curr.date <- gsub('-','',Sys.Date()) + loglik <- '-1001.0' + rna.sample <- row.names(x)[result$below] # below the diagonal line + #rna.sample.size <- length(rna.sample) + #rna.sample.2 <- sample(rna.sample, ceiling(rna.sample.size^0.7)) # to save space, keep only a fraction of the rnaseq sample IDs + sub.cond <- paste(rna.sample, collapse=' ') + sub.cond.length <- length(rna.sample) + cond <- as.vector(target.tf[i,3]) + result2 <- sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, max.r, 'mix', sub.cond.length, cond, loglik, curr.date, max.r, 'wedge') + + cat(result2, file=f, sep='') + } + } +} + +close(f) -- cgit v1.2.1