library(devtools)
build_vignettes()
build_vignettes()
library(FEAST)
data(Yan)
library(BiocParallel)
#' Standard way to preprocess the count matrix. It is the QC step for the genes.
#'
#' @param Y A gene expression data (Raw count matrix)
#' @param thre The threshold of minimum number of cells expressing a certain gene (default =2)
#' @return A processed gene expression matrix. It is \emph{not log transformed}
#' @examples
#' data(Yan)
#' YY = process_Y(Y, thre=2)
#' @export
process_Y = function(Y, thre = 2){
Y = as.matrix(Y)
row_exprs_rate = rowMeans(Y != 0)
row_sds = rowVars(Y)
ncell = ncol(Y)
rem_id1 = which(row_exprs_rate <= thre/ncell)
rem_id2 = which(row_sds == 0)
rem_id = union(rem_id1, rem_id2)
if (length(rem_id) > 0) {Y = Y[-rem_id, ]}
return(Y)
}
#' @param cluster The clustering outcomes. Specifically, they are cluster labels.
#' @param return_mses True or False indicating whether returning the MSE.
#' @return The MSE of the clustering centers with the predicted Y.
#' @examples
#' data(Yan)
#' Ynorm = Norm_Y(Y)
#' cluster = trueclass
#' MSE_res = cal_MSE(Ynorm, cluster)
#' @importFrom stats model.matrix
#' @export
cal_MSE = function(Ynorm, cluster, return_mses = FALSE){
Xregressor = model.matrix(~as.factor(cluster)-1)
beta = solve(t(Xregressor)%*%Xregressor) %*% (t(Xregressor)%*%t(Ynorm))
Yfit = Xregressor %*% beta
res = t(Ynorm) - Yfit
mse = mean(res^2)
if (return_mses){
mses = res^2
res = list(mse = mse, mses = mses)
}else{
res = mse
}
return(res)
}
#' Normalize the count expression matrix by the size factor and take the log transformation.
#'
#' @param Y a count expression matrix
#' @return a normalized matrix
#' @examples
#' data(Yan)
#' Ynorm = Norm_Y(Y)
#' @export
Norm_Y = function(Y){
L = colSums(Y)/median(colSums(Y))
Ynorm = log(sweep(Y, 2, L, FUN = "/") + 1)
return(Ynorm)
}
#' set up for the parallel computing for biocParallel.
#'
#' This function sets up the environment for parallel computing.
#' @param nProc number of processors
#' @param BPPARAM bpparameter from bpparam
#' @keywords internal
#' @return BAPPARAM settings
#' @examples
#' set_BPPARAM(nProc=1)
#' @export
setUp_BPPARAM = function (nProc = 0, BPPARAM = NULL)
{
if (is.null(BPPARAM)) {
if (nProc != 0) {
if (.Platform$OS.type == "windows") {
result <- SnowParam(workers = nProc)
}
else {
result <- MulticoreParam(workers = nProc)
}
}
else {
result <- bpparam()
}
return(result)
}
else {
return(BPPARAM)
}
}
Y = matrix(rnorm(10000), ncol = 100)
@export
#' data(Yan)
#' set.seed(123)
#' rixs = sample(nrow(Y), 500)
#' cixs = sample(ncol(Y), 40)
#' Y = Y[rixs, cixs]
#' con = Consensus(Y, k=5)
#' @importFrom stats median
#' @importFrom matrixStats rowVars
#' @importFrom matrixStats rowSds
#' @export
Consensus = function(Y, num_pcs = 10, top_pctg = 0.33, k =2, thred = 0.9, nPro = 1){
if (all(Y %%1 == 0)){
L = colSums(Y) / median(colSums(Y))
Y = log(sweep(Y, 2, L, FUN="/") + 1)
}
# select some genes (by top 50% cv) and do pca
message("start dimention reduction ...")
rm_ix = which(rowVars(Y) == 0)
if (length(rm_ix) > 0) Y = Y[-rm_ix, ]
row_ms = rowMeans(Y, na.rm = TRUE)
row_sds = rowSds(Y, na.rm = TRUE)
cv_scores = row_sds / row_ms
gene_ranks = order(cv_scores, decreasing = TRUE, na.last = TRUE)
top = round(nrow(Y) * top_pctg)
ixs = gene_ranks[seq_len(top)]
Y = Y[ixs, ]
pc_res = prcomp(t(Y))
# consensus clustering
message("start consensus clustering ...")
BPPARAM = setUp_BPPARAM(nPro=nPro)
BPPARAM$progressbar = TRUE
bp_fun = function(i, pc_res, k){
tmp_pca_mat = pc_res[,seq_len(i)]
if (i == 1) {
res = suppressWarnings(Mclust(tmp_pca_mat, G = k, modelNames = "V", verbose = FALSE))
}
else {
res = suppressWarnings(Mclust(tmp_pca_mat, G = k, modelNames = "VVV", verbose = FALSE))
}
if (is.null(res)){
res = suppressWarnings(Mclust(tmp_pca_mat, G = k, verbose = FALSE))
}
clusterid = apply(res$z, 1, which.max)
return(clusterid)
}
pc_cluster = bplapply(seq_len(num_pcs), bp_fun, pc_res = pc_res, k=k, BPPARAM = BPPARAM)
pc_mat = lapply(pc_cluster, vector2matrix)
con_mat = Reduce("+", pc_mat)
# final step of clustering
message("start final clustering ...")
res = suppressWarnings(Mclust(con_mat, G = k, modelNames = "VII", verbose = FALSE))
if (is.null(res)){
res = suppressWarnings(Mclust(con_mat, G = k, verbose = FALSE))
}
cluster = apply(res$z, 1, function(x){
id = which(x > thred)
if (length(id) == 0){
return(NA)
}else{
return(id)
}
})
return(list(mat_res = con_mat, cluster = cluster))
}
#' function for convert a vector to a binary matrix
#' @param vec a vector.
#' @return a n by n binary matrix indicating the adjacency.
#' @importFrom utils combn
vector2matrix = function(vec){
mat = matrix(0, nrow = length(vec), ncol = length(vec))
diag(mat) = diag(mat) + 1
classes = unique(vec)
for (class in classes){
tmp_ix = which(vec == class)
# find all pair index of a class
pair_ix = t(combn(tmp_ix, 2))
pair_ix = rbind(pair_ix, pair_ix[,c(2,1)])
mat[pair_ix] = 1
}
return(mat)
}
Y = process_Y(Y, thre = 2) # preprocess the data if needed
library(matrixStats)
Y = process_Y(Y, thre = 2) # preprocess the data if needed
con_res = Consensus(Y, k=k)
debugonce(Consensus)
con_res = Consensus(Y, k=k)
