Code/create_edges.r


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99

TARGET_TF_FILE <- "../Data/information/target_tf.txt"
DATA_FILE      <- "../Data/history/expr/TPM.txt" # A TPM table
AGINAME_FILE   <- "../Data/information/AGI-to-gene-names_v2.txt"
CORR_THRESHOLD     <- 0.6
MIN_SIZE           <- 100

####### Read data #########################################
X             <- read.table(DATA_FILE, header=TRUE, check.names=FALSE)
gene_id       <- X$gene_id
X$gene_id     <- NULL
row.names(X)  <- gene_id
X             <- as.matrix(X)
rna.sample.id <- colnames(X)

target_tf  <- read.table(TARGET_TF_FILE, sep='\t', header=FALSE)
target_tf  <- as.matrix(target_tf)
targets    <- target_tf[,1]
tfs        <- target_tf[,2]
conditions <- target_tf[,3]

agi        <- read.table(AGINAME_FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
#######################################################

library(mixtools)
result.file <- '../Data/history/edges/many_targets/edges.txt.20170306_1015'
for (i in 1:length(targets)) {
    out <- file(result.file, 'a')
    curr.date <- gsub('-','',Sys.Date())
    id1 <- tfs[i]
    id2 <- targets[i]
    if (id1 %in% gene_id == F || id2 %in% gene_id == F) {
        next
    }    
    name1 <- agi[id1,1]
    name2 <- agi[id2,1]
    cond <- conditions[i]
    x <- X[id1,]
    y <- X[id2,]
    x <- log(x+1)
    y <- log(y+1)
    index <- x < 0.01 | y < 0.01
    x <- x[!index]
    y <- y[!index]
    r <- cor(x, y)
    if (abs(r) >= CORR_THRESHOLD) {
        s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s', id2, name2,id1,name1, r, 'all', '.', cond, '.', curr.date)
        #cat(s, file=result.file, sep='\n', append=T)
	writeLines(s, con=out)
        next
    }

    k <- 3
    N <- length(x)
    em.out <- regmixEM(y,x,maxit=100, epsilon=1e-04, k=k)

    pos_r_max   <- -2
    pos_r_N     <- 0
    pos_r_index <- c()
    pos_r_loglik <- -100000000

    neg_r_max   <- 2
    neg_r_N     <- 0
    neg_r_index <- c()
    neg_r_loglik <- -100000000

    for (j in seq(1,k,1)) {

        index <- which(max.col(em.out$posterior) == j)
        size <- length(index)
        r <- cor(em.out$x[index,2], em.out$y[index])

        if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
            pos_r_max <- r
            pos_r_N   <- size
            pos_r_index <- index
            pos_r_loglik <- em.out$loglik
        }
        if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
            neg_r_max <- r
            neg_r_N   <- size
            neg_r_index <- index
            neg_r_loglik <- em.out$loglik
        }
    }

    if (pos_r_max > 0) {
        sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
        s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\tloglik=%4.2f\t%s', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
        #cat(s, file=result.file, sep='\n', append=T)
	writeLines(s, con=out)	
    }
    if (neg_r_max < 0) {
        sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
        s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\tloglik=%4.2f\t%s', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
        #cat(s, file=result.file, sep='\n', append=T)
	writeLines(s, con=out)	
    }
    close(out)    
}