% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/phenodisco.R
\name{phenoDisco}
\alias{phenoDisco}
\title{Runs the \code{phenoDisco} algorithm.}
\usage{
phenoDisco(
  object,
  fcol = "markers",
  times = 100,
  GS = 10,
  allIter = FALSE,
  p = 0.05,
  ndims = 2,
  modelNames = mclust.options("emModelNames"),
  G = 1:9,
  BPPARAM,
  tmpfile,
  seed,
  verbose = TRUE,
  dimred = c("PCA", "t-SNE"),
  ...
)
}
\arguments{
\item{object}{An instance of class \code{MSnSet}.}

\item{fcol}{A \code{character} indicating the organellar markers
column name in feature meta-data. Default is \code{markers}.}

\item{times}{Number of runs of tracking. Default is 100.}

\item{GS}{Group size, i.e. how many proteins make a group. Default
is 10 (the minimum group size is 4).}

\item{allIter}{\code{logical}, defining if predictions for all
iterations should be saved. Default is \code{FALSE}.}

\item{p}{Significance level for outlier detection. Default is
0.05.}

\item{ndims}{Number of principal components to use as input for
the discovery analysis. Default is 2. Added in version 1.3.9.}

\item{modelNames}{A vector of characters indicating the models to
be fitted in the EM phase of clustering using
\code{Mclust}. The help file for \code{mclust::mclustModelNames}
describes the available models. Default model names are
\code{c("EII", "VII", "EEI", "VEI", "EVI", "VVI", "EEE",
"EEV", "VEV", "VVV")}, as returned by
\code{mclust.options("emModelNames")}. Note that using all
these possible models substantially increases the running
time. Legacy models are \code{c("EEE","EEV","VEV","VVV")},
i.e. only ellipsoidal models.}

\item{G}{An integer vector specifying the numbers of mixture
components (clusters) for which the BIC is to be
calculated. The default is \code{G=1:9} (as in \code{Mclust}).}

\item{BPPARAM}{Support for parallel processing using the
\code{BiocParallel} infrastructure. When missing (default),
the default registered \code{BiocParallelParam} parameters are
used. Alternatively, one can pass a valid
\code{BiocParallelParam} parameter instance: \code{SnowParam},
\code{MulticoreParam}, \code{DoparParam}, \ldots see the
\code{BiocParallel} package for details. To revert to the
original serial implementation, use \code{NULL}.}

\item{tmpfile}{An optional \code{character} to save a temporary
\code{MSnSet} after each iteration. Ignored if missing. This
is useful for long runs to track phenotypes and possibly kill
the run when convergence is observed. If the run completes,
the temporary file is deleted before returning the final
result.}

\item{seed}{An optional \code{numeric} of length 1 specifying the
random number generator seed to be used. Only relevant when
executed in serialised mode with \code{BPPARAM = NULL}. See
\code{BPPARAM} for details.}

\item{verbose}{Logical, indicating if messages are to be printed
out during execution of the algorithm.}

\item{dimred}{A \code{character} defining which of Principal
Component Analysis (\code{"PCA"}) or t-Distributed Stochastic
Neighbour Embedding (\code{"t-SNE"}) should be used to reduce
dimensions prior to running phenoDisco novelty detection.}

\item{...}{Additional arguments passed to the dimensionality
reduction method. For both PCA and t-SNE, the data is scaled
and centred by default, and these parameters (\code{scale} and
\code{centre} for PCA, and \code{pca_scale} and
\code{pca_center} for t-SNE) can't be set. When using t-SNE
however, it is important to tune the perplexity and max
iterations parameters. See the \emph{Dimensionality reduction}
section in the pRoloc vignette for details.}
}
\value{
An instance of class \code{MSnSet} containing the
    \code{phenoDisco} predictions.
}
\description{
\code{phenoDisco} is a semi-supervised iterative approach to
detect new protein clusters.
}
\details{
The algorithm performs a phenotype discovery analysis as described
in Breckels et al. Using this approach one can identify putative
subcellular groupings in organelle proteomics experiments for more
comprehensive validation in an unbiased fashion. The method is
based on the work of Yin et al. and uses iterated rounds of
Gaussian Mixture Modelling using the Expectation Maximisation
algorithm combined with a non-parametric outlier detection test to
identify new phenotype clusters.

One requires 2 or more classes to be labelled in the data and, at a
very minimum, 6 markers per class to run the algorithm.  The
function will check and remove features with missing values using
the \code{\link{filterNA}} method.

A parallel implementation, relying on the \code{BiocParallel}
package, has been added in version 1.3.9. See the \code{BPPARAM}
argument for details.

Important: Prior to version 1.1.2 the row order in the output was
different from the row order in the input. This has now been fixed
and row ordering is now the same in both input and output objects.
}
\examples{
\dontrun{
library(pRolocdata)
data(tan2009r1)
pdres <- phenoDisco(tan2009r1, fcol = "PLSDA")
getPredictions(pdres, fcol = "pd", scol = NULL)
plot2D(pdres, fcol = "pd")

## to pre-process the data with t-SNE instead of PCA
pdres <- phenoDisco(tan2009r1, fcol = "PLSDA", dimred = "t-SNE")
}
}
\references{
Yin Z, Zhou X, Bakal C, Li F, Sun Y, Perrimon N, Wong
    ST. Using iterative cluster merging with improved gap
    statistics to perform online phenotype discovery in the
    context of high-throughput RNAi screens. BMC
    Bioinformatics. 2008 Jun 5;9:264.  PubMed PMID: 18534020.

Breckels LM, Gatto L, Christoforou A, Groen AJ, Lilley KS and
Trotter MWB.  The Effect of Organelle Discovery upon Sub-Cellular
Protein Localisation.  J Proteomics. 2013 Aug 2;88:129-40. doi:
10.1016/j.jprot.2013.02.019. Epub 2013 Mar 21.  PubMed PMID:
23523639.
}
\author{
Lisa M. Breckels <lms79@cam.ac.uk>
}
