## Purpose: Classify tissues using KNN. Use tSNE to reduce the
##          dimensionality of each tissue (sample) to 2 first.
## Usage:   change file names in the "Parameters" section below.
##
## 7 June 2017, slcu, hui
## Last modified 20 June 2017, slcu, hui

# Parameters ----
TRAIN_DATA_FILE  <- '../Data/history/expr/TPM.txt'
TRAIN_CLASS_FILE <- '../Data/information/experiment.and.tissue.2.txt'
PREDICTION_FILE  <- '../Data/temp/predicted.label.txt'
K          <- 1    # number of neighbours for KNN
PERPLEXITY <- 50   # for tSNE
SD_TAU     <- 10   # minimum per-gene standard deviation to keep a gene

library(Rtsne)
library(class)

# Load data ----
#cat('Load TPM.txt ...\n')
X <- read.table(TRAIN_DATA_FILE, header = TRUE, check.names = FALSE)
all.id <- X$gene_id
X$gene_id <- NULL        # remove column gene_id
row.names(X) <- all.id   # add row names

Z <- read.table(TRAIN_CLASS_FILE, header = TRUE, check.names = FALSE,
                sep = '\t')
labels <- as.vector(Z$suggested.tissue)

# Split samples whose tissue is "unknown" into the test set.  Guard against
# the empty-index case: labels[-integer(0)] would wrongly select NOTHING.
unknown.index <- which(labels == "unknown")
if (length(unknown.index) > 0) {
  X.2 <- X[, unknown.index, drop = FALSE]   # test data (label unknown)
  labels <- labels[-unknown.index]
  X <- X[, -unknown.index, drop = FALSE]
} else {
  X.2 <- X[, integer(0), drop = FALSE]      # no unknown samples to predict
}

# Remove subcategories, e.g. "leaf.young" -> "leaf".
labels <- unlist(lapply(labels, function(x) {
  e <- regexpr("\\.", x)[1]
  if (e > 0) substr(x, 1, e - 1) else x
}))
sul <- sort(unique(labels))   # sorted unique labels
colors <- rainbow(length(sul))
names(colors) <- sul

# Filter rows ----
#cat('Filter ...\n')
rowsum.tau <- ncol(X)   # the gene's TPM value is at least 1 on average
sd.val <- apply(X, 1, sd)
# Keep genes expressed above the average threshold AND with enough variation
# (genes whose expression varies least carry no class signal).
index <- rowSums(X) > rowsum.tau & sd.val > SD_TAU
index[is.na(index)] <- FALSE
n.train <- ncol(X)
X.3 <- log(cbind(X[index, ], X.2[index, ]) + 1)   # log-transform TPM
n.test <- ncol(X.2)
n <- ncol(X.3)

# Learn ----
set.seed(100)
#cat('Dimensionality reduction using tSNE ...\n')
# Embed train + test samples together so both live in the same 2-D space.
tsne <- Rtsne(t(X.3), check_duplicates = FALSE, dims = 2,
              perplexity = PERPLEXITY, theta = 0.5, verbose = FALSE,
              max_iter = 600)
train.data <- tsne$Y[seq_len(n.train), 1:2, drop = FALSE]

# Train and test on the same data
cl <- factor(labels)   # class labels
result <- knn(train.data, train.data, cl, k = K, prob = TRUE)
#cat(sprintf('Training accuracy: %4.3f.\n',
#            sum(as.vector(cl) == as.vector(result)) / length(cl)))

# Leave-one-out cross-validation on training data
result <- knn.cv(train.data, cl, k = K, prob = TRUE)
#cat(sprintf('Test accuracy (leave-one-out cross-validation): %4.3f.\n',
#            sum(as.vector(cl) == as.vector(result)) / length(cl)))

# If test data is available, make prediction.  The guard also avoids the
# decreasing sequence (n.train+1):n that would arise when n.test == 0.
if (n.test > 0) {
  test.data <- tsne$Y[(n.train + 1):n, 1:2, drop = FALSE]
  result <- knn(train.data, test.data, cl, k = K, prob = TRUE)
  df <- data.frame(sample.name = colnames(X.2), predicted.tissue = result)
  write.table(df, PREDICTION_FILE, quote = FALSE, sep = '\t',
              row.names = FALSE)
}

# Plot ----
#pdf('../Data/temp/fig.pdf')
#par(mar=c(5.1, 4.1, 4.1, 8.8), xpd=TRUE)
#plot(tsne$Y, xlab='tsne X', ylab='tsne Y', cex=.4, col='grey')
#points(tsne$Y[1:n.train,1], tsne$Y[1:n.train,2], col=colors[labels], cex=.4)
##text(tsne$Y[,1], tsne$Y[,2], labels, cex=0.1)
#legend("topright", inset=c(-0.4,0), sul, col=colors, pch=16)
#dev.off()