% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/patient-level_modeling.R
\name{tof_split_data}
\alias{tof_split_data}
\title{Split high-dimensional cytometry data into a training and test set}
\usage{
tof_split_data(
  feature_tibble,
  split_method = c("k-fold", "bootstrap", "simple"),
  split_col,
  simple_prop = 3/4,
  num_cv_folds = 10,
  num_cv_repeats = 1L,
  num_bootstraps = 10,
  strata = NULL,
  ...
)
}
\arguments{
\item{feature_tibble}{A tibble in which each row represents a sample- or
patient-level observation, such as those produced by \code{tof_extract_features}.}

\item{split_method}{Either a string or a logical vector specifying how to perform
the split. If a string, valid options include k-fold cross validation
("k-fold"; the default), bootstrapping ("bootstrap"), or
a single binary split ("simple"). If a logical vector, it should contain one entry
for each row in \code{feature_tibble} indicating if that row should be included in the
training set (TRUE) or excluded for the validation/test set (FALSE).
Ignored entirely if \code{split_col} is specified.}

\item{split_col}{The unquoted column name of the logical column in \code{feature_tibble}
indicating if each row should be included in the
training set (TRUE) or excluded for the validation/test set (FALSE).}

\item{simple_prop}{A numeric value between 0 and 1 indicating what proportion of the data
should be used for training. Defaults to 3/4. Ignored if split_method is not "simple".}

\item{num_cv_folds}{An integer indicating how many cross-validation folds should be used.
Defaults to 10. Ignored if split_method is not "k-fold".}

\item{num_cv_repeats}{An integer indicating how many independent
cross-validation replicates should be used (i.e. how many \code{num_cv_folds} splits
should be performed). Defaults to 1. Ignored if split_method is not "k-fold".}

\item{num_bootstraps}{An integer indicating how many independent bootstrap
replicates should be used. Defaults to 10. Ignored if split_method is not
"bootstrap".}

\item{strata}{An unquoted column name representing the column in \code{feature_tibble}
that should be used to stratify the data splitting. Defaults to NULL (no stratification).}

\item{...}{Optional additional arguments to pass to \code{\link[rsample]{vfold_cv}}
for k-fold cross validation, \code{\link[rsample]{bootstraps}} for bootstrapping,
or \code{\link[rsample]{initial_split}} for simple splitting.}
}
\value{
For k-fold cross validation and bootstrapping, an "rset" object;
for simple splitting, an "rsplit" object. For details, see
\code{\link[rsample]{rsample}}.
}
\description{
Split high-dimensional cytometry data into a training and test set
}
\examples{
feature_tibble <-
    dplyr::tibble(
        sample = as.character(1:100),
        cd45 = runif(n = 100),
        pstat5 = runif(n = 100),
        cd34 = runif(n = 100),
        outcome = (3 * cd45) + (4 * pstat5) + rnorm(100),
        class =
            as.factor(
                dplyr::if_else(outcome > median(outcome), "class1", "class2")
            ),
        multiclass =
            as.factor(
                c(rep("class1", 30), rep("class2", 30), rep("class3", 40))
            ),
        event = c(rep(0, times = 50), rep(1, times = 50)),
        time_to_event = rnorm(n = 100, mean = 10, sd = 2)
    )

# split the dataset into 10 CV folds
tof_split_data(
    feature_tibble = feature_tibble,
    split_method = "k-fold"
)

# split the dataset into 10 bootstrap resamplings
tof_split_data(
    feature_tibble = feature_tibble,
    split_method = "bootstrap"
)

# split the dataset into a single training/test set
# stratified by the "class" column
tof_split_data(
    feature_tibble = feature_tibble,
    split_method = "simple",
    strata = class
)

}
\seealso{
Other modeling functions: 
\code{\link{tof_assess_model}()},
\code{\link{tof_create_grid}()},
\code{\link{tof_predict}()},
\code{\link{tof_train_model}()}
}
\concept{modeling functions}
