# Last modified on 7 Agu 2019 by Hui Lan @ Jinhua #DATA.FILE <- '../Data/history/expr/TPM.txt.3130' DATA.FILE <- '../Data/history/expr/TPM.txt' TARGET.TF.FILE <- '../Data/information/target_tf.txt' AGINAME.FILE <- '../Data/information/AGI-to-gene-names_v2.txt' ONE.TARGET.DIR <- '../Data/history/edges/one_target' # Make sure we have required files and directory if (! file.exists(DATA.FILE)) { stop(sprintf('[wedge.R] Unable to find %s', DATA.FILE)) } if (! file.exists(TARGET.TF.FILE)) { stop(sprintf('[wedge.R] Unable to find %s', TARGET.TF.FILE)) } if (! file.exists(AGINAME.FILE)) { stop(sprintf('[wedge.R] Unable to find %s', AGINAME.FILE)) } if (! dir.exists(ONE.TARGET.DIR)) { stop(sprintf('[wedge.R] Unable to find directory %s', ONE.TARGET.DIR)) } r.tau <- 0.60 cat(sprintf('Read %s\n', DATA.FILE)) X <- read.table(DATA.FILE, header=TRUE, check.names=FALSE) all.id <- X$gene_id X$gene_id <- NULL # remove column gene_id row.names(X) <- all.id # add row names all.genes <- rownames(X) cat(sprintf('Read %s\n', AGINAME.FILE)) #agi <- read.table(AGINAME.FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes agi <- read.table(AGINAME.FILE, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes cat(sprintf('Read %s\n', TARGET.TF.FILE)) target.tf <- read.table(TARGET.TF.FILE, header=FALSE, check.names=FALSE, sep='\t') total.pair <- dim(target.tf)[1] ########################################################################### post.translation.4 <- function(x, y) { mx = mean(x) index = (x > mx - 0.5) & (x < mx + 0.5) slope = max(y[index])/mx v = c(-slope, 1) xy = as.matrix(cbind(x,y)) z = xy %*% v index0 = which(z <= 0) # points below the wedge index1 = which(z > 0) # points above the wedge index2 = which(x <= 0.1) # x has low value, then y is expected to have low value too if (length(index2) > 0) { q = quantile(y[index2], 0.9) m = mean(y[index2]) } else { q = 0.0 m = 0.0 } index3 = which(x < 1) if (length(index3) > 0) { m = mean(y[index3]) } else { m = 0.0 } # for a scatterplot to be considered a wedge shape, percent>0.90,q < 1 and # m < slope, disp.x < disp.y result <- list(below=index0, upper=index1, percent=length(index0)/length(x), q=q, m=m, slope=slope, disp.x=sd(x)/mean(x), disp.y=sd(y)/mean(y)) } make.data <- function(slope, n) { x=abs(3.0 + 1*rnorm(n)) y=abs(3.0 + 1*rnorm(n)) v = c(-slope, 1) xy = as.matrix(cbind(x,y)) z = xy %*% v index = which(z <= 0) result <- list(x=x[index], y=y[index]) } ########################################################################### cat(sprintf('Go through pairs looking for wedge shapes ..\n')) output.file <- paste('../Data/history/edges/one_target/edges.txt', 'wedge', format(Sys.time(), "%b.%d.%Y.%H%M%S"), sep='.') f <- file(output.file, 'w') for (i in 1:total.pair) { id1 <- as.vector(target.tf[i,2]) # tf id2 <- as.vector(target.tf[i,1]) # target all.in <- id1 %in% all.genes & id2 %in% all.genes if (!all.in) { next } x <- X[id1,] y <- X[id2,] x <- log(x+1) y <- log(y+1) x <- t(x) y <- t(y) na.ratio <- max(sum(is.na(x))/length(x), sum(is.na(y))/length(y)) index <- x < 0.01 | y < 0.01 | na.ratio > 0.5 # make sure very small values are not included x <- x[!index, 1, drop=FALSE] y <- y[!index, 1, drop=FALSE] if (dim(x)[1] < 50) { next } # We will not consider wedge shape if the correlation coefficient is large enough. if (abs(cor(x, y)) < r.tau) { result <- post.translation.4(x, y) if (result$percent > 0.95 & result$q < 0.25 & result$m < 1.0 & result$disp.y > 1.2 * result$disp.x) { #name1 <- agi[id1,1] #name2 <- agi[id2,1] name1 <- agi$V2[which(agi$V1 == id1)] name2 <- agi$V2[which(agi$V1 == id2)] max.r <- max(r.tau, result$percent * exp(-max(result$q, result$m))) curr.date <- gsub('-','',Sys.Date()) loglik <- '-1001.0' rna.sample <- row.names(x)[result$below] # below the diagonal line #rna.sample.size <- length(rna.sample) #rna.sample.2 <- sample(rna.sample, ceiling(rna.sample.size^0.7)) # to save space, keep only a fraction of the rnaseq sample IDs sub.cond <- paste(rna.sample, collapse=' ') sub.cond.length <- length(rna.sample) cond <- as.vector(target.tf[i,3]) result2 <- sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, max.r, 'mix', sub.cond.length, cond, loglik, curr.date, max.r, 'wedge') cat(result2, file=f, sep='') } } } close(f)