1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
|
TARGET_TF_FILE <- "../Data/information/target_tf.txt"
DATA_FILE <- "../Data/history/expr/TPM.txt" # A TPM table
AGINAME_FILE <- "../Data/information/AGI-to-gene-names_v2.txt"
CORR_THRESHOLD <- 0.6
MIN_SIZE <- 100
####### Read data #########################################
X <- read.table(DATA_FILE, header=TRUE, check.names=FALSE)
gene_id <- X$gene_id
X$gene_id <- NULL
row.names(X) <- gene_id
X <- as.matrix(X)
rna.sample.id <- colnames(X)
target_tf <- read.table(TARGET_TF_FILE, sep='\t', header=FALSE)
target_tf <- as.matrix(target_tf)
targets <- target_tf[,1]
tfs <- target_tf[,2]
conditions <- target_tf[,3]
agi <- read.table(AGINAME_FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
#######################################################
library(mixtools)
result.file <- '../Data/history/edges/many_targets/edges.txt.20170306_1015'
for (i in 1:length(targets)) {
out <- file(result.file, 'a')
curr.date <- gsub('-','',Sys.Date())
id1 <- tfs[i]
id2 <- targets[i]
if (id1 %in% gene_id == F || id2 %in% gene_id == F) {
next
}
name1 <- agi[id1,1]
name2 <- agi[id2,1]
cond <- conditions[i]
x <- X[id1,]
y <- X[id2,]
x <- log(x+1)
y <- log(y+1)
index <- x < 0.01 | y < 0.01
x <- x[!index]
y <- y[!index]
r <- cor(x, y)
if (abs(r) >= CORR_THRESHOLD) {
s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s', id2, name2,id1,name1, r, 'all', '.', cond, '.', curr.date)
#cat(s, file=result.file, sep='\n', append=T)
writeLines(s, con=out)
next
}
k <- 3
N <- length(x)
em.out <- regmixEM(y,x,maxit=100, epsilon=1e-04, k=k)
pos_r_max <- -2
pos_r_N <- 0
pos_r_index <- c()
pos_r_loglik <- -100000000
neg_r_max <- 2
neg_r_N <- 0
neg_r_index <- c()
neg_r_loglik <- -100000000
for (j in seq(1,k,1)) {
index <- which(max.col(em.out$posterior) == j)
size <- length(index)
r <- cor(em.out$x[index,2], em.out$y[index])
if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
pos_r_max <- r
pos_r_N <- size
pos_r_index <- index
pos_r_loglik <- em.out$loglik
}
if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
neg_r_max <- r
neg_r_N <- size
neg_r_index <- index
neg_r_loglik <- em.out$loglik
}
}
if (pos_r_max > 0) {
sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\tloglik=%4.2f\t%s', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
#cat(s, file=result.file, sep='\n', append=T)
writeLines(s, con=out)
}
if (neg_r_max < 0) {
sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\tloglik=%4.2f\t%s', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
#cat(s, file=result.file, sep='\n', append=T)
writeLines(s, con=out)
}
close(out)
}
|