diff options
Diffstat (limited to 'Code/knn_classify.R')
-rw-r--r-- | Code/knn_classify.R | 79 |
1 file changed, 79 insertions, 0 deletions
## Usage: change file names in the section # Parameters
## Purpose: Classify tissues using KNN. Use tSNE to reduce the dimensionality
##          of each tissue (sample) to 2 first.
##
## 7 June 2017, slcu, hui
## Last modified 20 June 2017, slcu, hui

# Parameters ----
TRAIN_DATA_FILE  <- '../Data/history/expr/TPM.txt'
TRAIN_CLASS_FILE <- '../Data/information/experiment.and.tissue.2.txt'

K <- 1            # number of neighbours for KNN
PERPLEXITY <- 50  # for tSNE

# Load data ----
# Expression matrix: rows = genes, columns = samples.
X <- read.table(TRAIN_DATA_FILE, header = TRUE, check.names = FALSE)
all.id <- X$gene_id
X$gene_id <- NULL        # remove column gene_id ...
row.names(X) <- all.id   # ... and keep the ids as row names

# Sample annotation; suggested.tissue holds the class label or "unknown".
Z <- read.table(TRAIN_CLASS_FILE, header = TRUE, check.names = FALSE, sep = '\t')
labels <- as.vector(Z$suggested.tissue)

# Split samples into labelled (training) and unlabelled (test) sets.
unknown.index <- which(labels == "unknown")
# drop = FALSE keeps X.2 a data frame even when only one sample is unlabelled,
# so colnames(X.2) below still works.
X.2 <- X[, unknown.index, drop = FALSE]  # test data (label unknown)
if (length(unknown.index) > 0) {
  # Guard: x[-integer(0)] returns an EMPTY object, which would silently wipe
  # the whole training set when no sample is labelled "unknown".
  labels <- labels[-unknown.index]
  X <- X[, -unknown.index, drop = FALSE]
}

# Remove subcategories: keep only the part of the label before the first dot.
labels <- unlist(lapply(labels, function(x) {
  e <- regexpr("\\.", x)[1]
  if (e > 0) substr(x, 1, e - 1) else x
}))
sul <- sort(unique(labels))     # sorted unique labels
colors <- rainbow(length(sul))  # one colour per tissue class (for plotting)
names(colors) <- sul

# Filter rows ----
rowsum.tau <- dim(X)[2]  # the gene's TPM value is at least 1 on average
sd.val <- apply(X, 1, sd)
sd.tau <- summary(sd.val)[3]  # median sd; least-varying genes are to be filtered
# NOTE(review): sd.tau is computed but the filter below uses the hard-coded
# threshold 10 instead; kept as-is to preserve results -- confirm which was
# intended.
index <- rowSums(X) > rowsum.tau & sd.val > 10

n.train <- dim(X)[2]
n.test <- dim(X.2)[2]
# Concatenate training and test samples so tSNE embeds them jointly;
# log(TPM + 1) stabilises the variance.
X.3 <- log(cbind(X[index, ], X.2[index, ]) + 1)
n <- dim(X.3)[2]

# Learn ----
library(Rtsne)
library(class)
set.seed(100)  # make the (stochastic) tSNE embedding reproducible
tsne <- Rtsne(t(X.3), check_duplicates = FALSE, dims = 2,
              perplexity = PERPLEXITY, theta = 0.5, verbose = FALSE,
              max_iter = 600)  # dimensionality reduction
# First n.train rows of the embedding are the training samples.
train.data <- tsne$Y[seq_len(n.train), , drop = FALSE]

# Train and test on the same data (optimistic training accuracy).
cl <- factor(labels)  # class labels
result <- knn(train.data, train.data, cl, k = K, prob = TRUE)
#cat(sprintf('Training accuracy: %4.3f.\n', sum(as.vector(cl) == as.vector(result)) / length(cl)))

# Leave-one-out cross-validation on the training data.
result <- knn.cv(train.data, cl, k = K, prob = TRUE)
#cat(sprintf('Test accuracy (leave-one-out cross-validation): %4.3f.\n', sum(as.vector(cl) == as.vector(result)) / length(cl)))

# If test data is available, make a prediction.
# Guard: with n.test == 0, (n.train + 1):n would be a REVERSED sequence
# (n.train + 1, n.train) and silently index the wrong rows.
if (n.test > 0) {
  test.data <- tsne$Y[(n.train + 1):n, , drop = FALSE]
  result <- knn(train.data, test.data, cl, k = K, prob = TRUE)
  df <- data.frame(sample.name = colnames(X.2), predicted.tissue = result)
  write.table(df, '../Data/temp/predicted.label.txt', quote = FALSE,
              sep = '\t', row.names = FALSE)
}

# Plot ----
#pdf('../Data/temp/fig.pdf')
#par(mar=c(5.1, 4.1, 4.1, 8.8), xpd=TRUE)
#plot(tsne$Y, xlab='tsne X', ylab='tsne Y', cex=.4, col='grey')
#points(tsne$Y[1:n.train,1], tsne$Y[1:n.train,2], col=colors[labels], cex=.4)
##text(tsne$Y[,1], tsne$Y[,2], labels, cex=0.1)
#legend("topright", inset=c(-0.4,0), sul, col=colors, pch=16)
#dev.off()