\name{get_bubbletree_kmeans}



\alias{get_bubbletree_kmeans}




\title{
k-means clustering and hierarchical grouping of \eqn{k} clusters (bubbles)
}



\description{
\code{get_bubbletree_kmeans} takes two main inputs:

1. numeric matrix \eqn{A^{n \times f}}, which represents a low-dimensional
projection (obtained e.g. by PCA) of the original high-dimensional scRNA-seq
data, with \eqn{n} rows as cells and \eqn{f} columns as low-dimension features.

2. number \eqn{k} of clusters

The function \code{get_bubbletree_kmeans} performs two main operations. First,
it performs k-means clustering to identify groups (bubbles) of
transcriptionally similar cells. Second, it organizes the bubbles in a
hierarchical dendrogram (bubbletree) which adequately represents inter-cluster
relationships.
}

\usage{
get_bubbletree_kmeans(x,
                      k,
                      B = 200,
                      N_eff = 100,
                      n_start = 1000,
                      iter_max = 300,
                      kmeans_algorithm = "MacQueen",
                      hclust_distance = "euclidean",
                      hclust_method = "average",
                      cores = 1,
                      round_digits = 2,
                      show_simple_count = FALSE,
                      verbose = TRUE)
}


\arguments{
\item{x}{numeric matrix (\eqn{A^{n\times f}} with \eqn{n} cells, and \eqn{f}
low-dimensional projections of the original single cell RNA-seq dataset)}
\item{k}{integer, number of clusters}
\item{B}{integer, number of bootstrap iterations to perform in order to
generate bubbletree}
\item{N_eff}{integer, number of cells to draw randomly from each cluster when
computing inter-cluster distances}
\item{n_start, iter_max, kmeans_algorithm}{parameters for k-means clustering,
see documentation of function \code{kmeans}, R-package \code{stats}}
\item{hclust_distance}{distance measure to be used: euclidean (default) or
manhattan, see documentation of \code{stats::dist}}
\item{hclust_method}{the agglomeration method to be used, default = average.
See documentation of \code{stats::hclust}}
\item{cores}{integer, number of CPU cores for parallel execution}
\item{round_digits}{integer, number of decimal places to keep when showing the
relative frequency of cells in each bubble}
\item{show_simple_count}{logical, if \code{show_simple_count = TRUE}, cell
counts in each bubble will be divided by 1,000 to improve readability. This is
only useful for samples that are composed of millions of cells.}
\item{verbose}{logical, progress messages}
}


\details{
For k-means clustering \code{get_bubbletree_kmeans} uses the function
\code{kmeans} implemented in R-package \code{stats} (version 4.2.0). For
additional information on the clustering procedure see the documentation of
\code{kmeans}.
To organize the resulting clusters in a hierarchical dendrogram these steps
are performed:

1. In bootstrap iteration \eqn{b} from \eqn{1:B}

2. draw up to \eqn{N_{eff}} number of cells at random from each cluster without
   replacement

3. compute distances (in space \eqn{A^{n\times f}}) between pairs
   of cells in cluster \eqn{i} and cluster \eqn{j}

4. compute mean distance between cluster \eqn{i} and \eqn{j} and
   populate inter-cluster distance matrix \eqn{D_{b}^{k \times k}}

5. perform hierarchical clustering with user-specified agglomeration method
   based on \eqn{D_{b}^{k \times k}} to generate dendrogram \eqn{H_b}

6. quantify branch robustness in \eqn{H} by counting how many times each branch
   is found among the bootstrap dendrograms \eqn{H_b}
}

\value{
\item{A}{input matrix x}
\item{k}{number of clusters}
\item{km}{k-means clustering results identical to those generated by function
          \code{kmeans} from R-package \code{stats}}
\item{ph}{boot_ph: bootstrap dendrograms \eqn{H_b}; main_ph: bubbletree \eqn{H}}
\item{ph_data}{two phylogenies: ph_c = phylogeny constructed from bubble
centroids (computed from \eqn{A^{n\times f}}); ph_p = main_ph = phylogeny
constructed from inter-cell distances}
\item{pair_dist}{inter-cluster distances used to generate the dendrograms}
\item{cluster}{cluster assignments of each cell}
\item{input_par}{list of all input parameters}
\item{tree}{ggtree bubbletree object}
\item{tree_simple}{simplified ggtree bubbletree object}
\item{tree_meta}{meta-data associated with the bubbletree}
}


\author{
  Simo Kitanovski \email{simo.kitanovski@uni-due.de}
}


\examples{
# input data
data("d_500", package = "scBubbletree")
A <- d_500$A

b <- get_bubbletree_kmeans(x = A,
                           k = 8,
                           B = 200,
                           N_eff = 100,
                           cores = 1,
                           round_digits = 1,
                           show_simple_count = FALSE,
                           kmeans_algorithm = "MacQueen",
                           hclust_distance = "euclidean",
                           hclust_method = "average")

b$tree
}

\seealso{get_k, get_bubbletree_dummy, get_bubbletree_graph, get_gini,
         get_gini_k, d_500, get_num_tiles, get_num_violins, get_cat_tiles,
         get_bubbletree_comparison}
