summaryrefslogtreecommitdiff
path: root/Code/knn_classify.R
diff options
context:
space:
mode:
Diffstat (limited to 'Code/knn_classify.R')
-rw-r--r--Code/knn_classify.R79
1 files changed, 79 insertions, 0 deletions
diff --git a/Code/knn_classify.R b/Code/knn_classify.R
new file mode 100644
index 0000000..46df992
--- /dev/null
+++ b/Code/knn_classify.R
@@ -0,0 +1,79 @@
## Usage: change file names in the section # Parameters
## Purpose: Classify tissues using KNN. Use tSNE to reduce the dimensionality of each tissue to 2 first.
##
## 7 June 2017, slcu, hui
## Last modified 20 June 2017, slcu, hui

# Parameters
TRAIN_DATA_FILE <- '../Data/history/expr/TPM.txt'   # expression matrix: gene_id column + one TPM column per sample
TRAIN_CLASS_FILE <- '../Data/information/experiment.and.tissue.2.txt'  # sample annotation; column suggested.tissue holds labels

K <- 1           # number of neighbours for KNN
PERPLEXITY <- 50 # for tSNE
+
# Load data
#cat('Load TPM.txt ...\n')
# Expression matrix: rows = genes, columns = samples; first column is gene_id.
X <- read.table(TRAIN_DATA_FILE, header=TRUE, check.names=FALSE)
all.id <- X$gene_id
X$gene_id <- NULL # remove column gene_id
row.names(X) <- all.id # add row names

# Sample annotation; suggested.tissue gives the class label of each sample.
Z <- read.table(TRAIN_CLASS_FILE, header=TRUE, check.names=FALSE, sep='\t')
labels <- as.vector(Z$suggested.tissue)

unknown.index <- which(labels == "unknown") # samples with no tissue label

# Split off the unlabelled samples as test data.  Guard against the classic
# R negative-indexing pitfall: when unknown.index is integer(0),
# x[-unknown.index] drops EVERYTHING, so only use negative indices when
# there is actually something to remove.
if (length(unknown.index) > 0) {
  X.2 <- X[, unknown.index] # test data (label unknown)
  labels <- labels[-unknown.index]
  X <- X[, -unknown.index]
} else {
  X.2 <- X[, integer(0)]    # empty test set; downstream n.test becomes 0
}

# Remove subcategories: keep only the text before the first '.'.
# Vectorized equivalent of looping regexpr()/substr() over each label.
labels <- sub("\\..*$", "", labels)
sul <- sort(unique(labels)) # sorted unique labels
colors <- rainbow(length(sul)) # one colour per tissue (used by the plotting section)
names(colors) <- sul
+
# Filter rows
#cat('Filter ...\n')
# Keep genes whose total TPM exceeds the sample count (i.e. mean TPM > 1)
# and whose expression varies enough across samples.
rowsum.tau <- ncol(X)      # the gene's TPM value is at least 1 on average
sd.val <- apply(X, 1, sd)  # per-gene standard deviation across samples
# Median sd of all genes.  NOTE(review): sd.tau is computed but never used --
# the filter below hard-codes a threshold of 10 instead; confirm intent.
sd.tau <- summary(sd.val)[3] # genes whose gene expression varies least are to be filtered
index <- rowSums(X) > rowsum.tau & sd.val > 10

n.train <- ncol(X)   # number of labelled (training) samples
# Merge training and test columns for the surviving genes, then log-transform.
X.3 <- log(cbind(X[index, ], X.2[index, ]) + 1)
n.test <- ncol(X.2)  # number of unlabelled (test) samples
n <- ncol(X.3)       # total number of samples in the merged matrix
+
# Learn
library(Rtsne)
library(class)
set.seed(100) # make the tSNE embedding reproducible
#cat('Dimensionality reduction using tSNE ...\n')
# Embed every sample (training + test together) into 2 dimensions.
tsne <- Rtsne(t(X.3), check_duplicates = FALSE, dims = 2,
              perplexity = PERPLEXITY, theta = 0.5, verbose = FALSE,
              max_iter = 600) # dimensionality reduction
# First n.train rows of the embedding correspond to the labelled samples.
train.data <- cbind(tsne$Y[seq_len(n.train), 1], tsne$Y[seq_len(n.train), 2])

# Train and test on the same data
cl <- factor(labels) # class labels
result <- knn(train.data, train.data, cl, k = K, prob = TRUE)
#cat(sprintf('Training accuracy: %4.3f.\n', sum(as.vector(cl) == as.vector(result))/length(cl)))
+
# Cross-validation on training data (leave-one-out).
result <- knn.cv(train.data, cl, k = K, prob = TRUE)
#cat(sprintf('Test accuracy (leave-one-out cross-validation): %4.3f.\n', sum(as.vector(cl) == as.vector(result))/length(cl)))

# If test data is available, make prediction.
# The guard is required: with n.test == 0, (n.train+1):n evaluates to the
# REVERSED sequence c(n.train+1, n.train) and would select wrong rows.
if (n.test > 0) {
  test.rows <- n.train + seq_len(n.test) # embedding rows of the unlabelled samples
  test.data <- cbind(tsne$Y[test.rows, 1], tsne$Y[test.rows, 2])
  result <- knn(train.data, test.data, cl, k = K, prob = TRUE)
  df <- data.frame(sample.name = colnames(X.2), predicted.tissue = result)
  write.table(df, '../Data/temp/predicted.label.txt', quote = FALSE, sep = '\t', row.names = FALSE)
}
+
+# Plot
+#pdf('../Data/temp/fig.pdf')
+#par(mar=c(5.1, 4.1, 4.1, 8.8), xpd=TRUE)
+#plot(tsne$Y, xlab='tsne X', ylab='tsne Y', cex=.4, col='grey')
+#points(tsne$Y[1:n.train,1], tsne$Y[1:n.train,2], col=colors[labels], cex=.4)
+##text(tsne$Y[,1], tsne$Y[,2], labels, cex=0.1)
+#legend("topright", inset=c(-0.4,0), sul, col=colors, pch=16)
+#dev.off()
+