% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/quality_control.R
\name{tof_assess_clusters_distance}
\alias{tof_assess_clusters_distance}
\title{Assess a clustering result by calculating the z-score of each cell's
mahalanobis distance to its cluster centroid and flagging outliers.}
\usage{
tof_assess_clusters_distance(
  tof_tibble,
  cluster_col,
  marker_cols = where(tof_is_numeric),
  z_threshold = 3,
  augment = FALSE
)
}
\arguments{
\item{tof_tibble}{A `tof_tbl` or `tibble`.}

\item{cluster_col}{An unquoted column name indicating which column in `tof_tibble`
stores the cluster ids for the cluster to which each cell belongs.
Cluster labels can be produced via any method the user chooses - including manual gating,
any of the functions in the `tof_cluster_*` function family, or any other method.}

\item{marker_cols}{Unquoted column names indicating which columns in `tof_tibble`
should be interpreted as markers to be used in the mahalanobis distance calculation.
Defaults to all numeric columns. Supports tidyselection.}

\item{z_threshold}{A scalar indicating the distance z-score threshold above
which a cell should be considered anomalous. Defaults to 3.}

\item{augment}{A boolean value indicating if the output should column-bind the
computed flags for each cell (see below) as new columns in `tof_tibble` (TRUE) or if
a tibble including only the computed flags should be returned (FALSE, the default).}
}
\value{
If augment = FALSE (the default), a tibble with 3 columns:
".mahalanobis_distance" (the mahalanobis distance from each cell to the centroid of
its assigned cluster), "z_score" (the modified z-score of each cell's mahalanobis distance
relative to all other cells in the dataset), and "flagged_cell" (a boolean
indicating whether or not each cell was flagged as having a z-score above
z_threshold). If augment = TRUE, the same 3 columns will be column-bound to
tof_tibble, and the resulting tibble will be returned.
}
\description{
This function evaluates the result of a clustering procedure by computing
the mahalanobis distance between each cell and the centroid of the cluster
to which it was assigned and comparing these distances among all cells in
a given cluster. All cells with
a mahalanobis-distance z-score above a user-specified threshold are flagged
as potentially anomalous. Note that the z-score is calculated using a modified
formula to minimize the effect of outliers (Z = (x - median(x)) / mad(x)).
}
\examples{

# Simulate the "inner" dataset: three clusters ("a", "b", "c") containing
# 100, 500, and 500 cells, each measured on four markers.
sim_data_inner <- dplyr::tibble(
    cd45 = c(rnorm(n = 600), rnorm(n = 500, mean = -4)),
    cd38 = c(
        rnorm(n = 100, sd = 0.5),
        rnorm(n = 500, mean = -3),
        rnorm(n = 500, mean = 8)
    ),
    cd34 = c(
        rnorm(n = 100, sd = 0.2, mean = -10),
        rnorm(n = 500, mean = 4),
        rnorm(n = 500, mean = 60)
    ),
    cd19 = c(rnorm(n = 100, sd = 0.3, mean = 10), rnorm(n = 1000)),
    cluster_id = rep(c("a", "b", "c"), times = c(100, 500, 500)),
    dataset = "inner"
)

# Simulate the "outer" dataset: the same cluster labels, but with far fewer
# cells (10, 50, and 50) drawn from shifted marker distributions.
sim_data_outer <- dplyr::tibble(
    cd45 = c(
        rnorm(n = 10),
        rnorm(n = 50, mean = 3),
        rnorm(n = 50, mean = -12)
    ),
    cd38 = c(
        rnorm(n = 10, sd = 0.5),
        rnorm(n = 50, mean = -10),
        rnorm(n = 50, mean = 10)
    ),
    cd34 = c(
        rnorm(n = 10, sd = 0.2, mean = -15),
        rnorm(n = 50, mean = 15),
        rnorm(n = 50, mean = 70)
    ),
    cd19 = c(rnorm(n = 10, sd = 0.3, mean = 19), rnorm(n = 100)),
    cluster_id = rep(c("a", "b", "c"), times = c(10, 50, 50)),
    dataset = "outer"
)

# Stack both simulated datasets into a single tibble
sim_data <- rbind(sim_data_inner, sim_data_outer)

# Flag anomalous cells. Here, the small "outer" clusters share their labels
# with the much larger "inner" clusters, so they are lumped in with them.
z_result <- tof_assess_clusters_distance(
    sim_data,
    cluster_col = cluster_id,
    z_threshold = 2.5
)

}
