% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/generate_celltype_data.r
\name{generate_celltype_data}
\alias{generate_celltype_data}
\title{Generate CellTypeData (CTD) file}
\usage{
generate_celltype_data(
  exp,
  annotLevels,
  groupName,
  no_cores = 1,
  savePath = tempdir(),
  file_prefix = "ctd",
  as_sparse = TRUE,
  as_DelayedArray = FALSE,
  normSpec = FALSE,
  convert_orths = FALSE,
  input_species = "mouse",
  output_species = "human",
  non121_strategy = "drop_both_species",
  method = "homologene",
  force_new_file = TRUE,
  specificity_quantiles = TRUE,
  numberOfBins = 40,
  dendrograms = TRUE,
  return_ctd = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{exp}{Numerical matrix with a row for each gene and a column for each cell.
Row names are gene symbols. Column names are cell IDs which can be
cross referenced against the annot data frame.}

\item{annotLevels}{List with arrays of strings containing the cell type
names associated with each column in \code{exp}.}

\item{groupName}{A human readable name for referring to the dataset
being used.}

\item{no_cores}{Number of cores that should be used to speed up the
computation.
\emph{NOTE}: Use \code{no_cores=1} when using this package on a Windows system.}

\item{savePath}{Directory where the CTD file should be saved.}

\item{file_prefix}{Prefix to add to saved CTD file name.}

\item{as_sparse}{Convert \code{exp} to a sparse \code{Matrix}.}

\item{as_DelayedArray}{Convert \code{exp} to \code{DelayedArray}.}

\item{normSpec}{Boolean indicating whether specificity data should be
transformed to a normal distribution by cell type, giving equivalent scores
across all cell types.}

\item{convert_orths}{If \code{input_species!=output_species} and
\code{convert_orths=TRUE}, will drop genes without
1:1 \code{output_species} orthologs and then convert \code{exp} gene names
to those of \code{output_species}.}

\item{input_species}{The species that the \code{exp} dataset comes from.
See \link[EWCE]{list_species} for all available species.}

\item{output_species}{Species to convert \code{exp} to
(Default: "human").
See \link[EWCE]{list_species} for all available species.}

\item{non121_strategy}{How to handle genes that don't have
1:1 mappings between \code{input_species}:\code{output_species}.
Options include:\cr
\describe{
\item{\code{"drop_both_species" or "dbs" or 1}}{
Drop genes that have duplicate
mappings in either the \code{input_species} or \code{output_species}
(\emph{DEFAULT}).}
\item{\code{"drop_input_species" or "dis" or 2}}{
Only drop genes that have duplicate
mappings in the \code{input_species}.}
\item{\code{"drop_output_species" or "dos" or 3}}{
Only drop genes that have duplicate
mappings in the \code{output_species}.}
\item{\code{"keep_both_species" or "kbs" or 4}}{
Keep all genes regardless of whether
they have duplicate mappings in either species.}
\item{\code{"keep_popular" or "kp" or 5}}{
Return only the most "popular" interspecies ortholog mappings.
 This procedure tends to yield a greater number of returned genes
 but at the cost of many of them not being true biological 1:1 orthologs.}
 \item{\code{"sum","mean","median","min" or "max"}}{
 When \code{gene_df} is a matrix and \code{gene_output="rownames"},
  these options will aggregate many-to-one gene mappings
  (\code{input_species}-to-\code{output_species})
  after dropping any duplicate genes in the \code{output_species}.
 }
}}

\item{method}{R package to use for gene mapping:
\describe{
 \item{\code{"gprofiler"}}{Slower but more species and genes.}
 \item{\code{"homologene"}}{Faster but fewer species and genes.}
 \item{\code{"babelgene"}}{Faster but fewer species and genes.
 Also gives consensus scores for each gene mapping based on
 several different data sources.}
}}

\item{force_new_file}{If a file of the same name as the one
being created already exists, overwrite it.}

\item{specificity_quantiles}{Compute specificity quantiles.
Recommended to set to \code{TRUE}.}

\item{numberOfBins}{Number of quantile 'bins' to use (40 is recommended).}

\item{dendrograms}{Add dendrogram plots.}

\item{return_ctd}{Return the CTD object in a list along with the file name,
instead of just the file name.}

\item{verbose}{Print messages.}

\item{...}{
  Arguments passed on to \code{\link[orthogene:convert_orthologs]{orthogene::convert_orthologs}}
  \describe{
    \item{\code{gene_df}}{Data object containing the genes
(see \code{gene_input} for options on how
the genes can be stored within the object).\cr
Can be one of the following formats:\cr
\describe{
\item{\code{matrix}}{A sparse or dense matrix.}
\item{\code{data.frame}}{A \code{data.frame},
 \code{data.table}, or \code{tibble}.}
\item{\code{list}}{A \code{list} or character \code{vector}.}
}
Genes, transcripts, proteins, SNPs, or genomic ranges
 can be provided in any format
(HGNC, Ensembl, RefSeq, UniProt, etc.) and will be
automatically converted to gene symbols unless
specified otherwise with the \code{...} arguments.\cr
\emph{Note}: If you set \code{method="homologene"}, you
must either supply genes in gene symbol format (e.g. "Sox2")
 OR set \code{standardise_genes=TRUE}.}
    \item{\code{gene_input}}{Which aspect of \code{gene_df} to
get gene names from:\cr
\describe{
\item{\code{"rownames"}}{From row names of data.frame/matrix.}
\item{\code{"colnames"}}{From column names of data.frame/matrix.}
\item{\code{<column name>}}{From a column in \code{gene_df},
 e.g. \code{"gene_names"}.}
}}
    \item{\code{gene_output}}{How to return genes.
Options include:\cr
\describe{
\item{\code{"rownames"}}{As row names of \code{gene_df}.}
\item{\code{"colnames"}}{As column names of \code{gene_df}.}
\item{\code{"columns"}}{As new columns "input_gene", "ortholog_gene"
(and "input_gene_standard" if \code{standardise_genes=TRUE})
in \code{gene_df}.}
\item{\code{"dict"}}{As a dictionary (named list) where the names
are input_gene and the values are ortholog_gene.}
\item{\code{"dict_rev"}}{As a reversed dictionary (named list)
where the names are ortholog_gene and the values are input_gene.}
}}
    \item{\code{standardise_genes}}{If \code{TRUE} AND
\code{gene_output="columns"}, a new column "input_gene_standard"
will be added to \code{gene_df} containing standardised HGNC symbols
identified by \link[gprofiler2]{gorth}.}
    \item{\code{drop_nonorths}}{Drop genes that don't have an ortholog
in the \code{output_species}.}
    \item{\code{agg_fun}}{Aggregation function passed to 
 \link[orthogene]{aggregate_mapped_genes}. 
Set to \code{NULL} to skip aggregation step (default).}
    \item{\code{mthreshold}}{Maximum number of ortholog names per gene to show.
Passed to \link[gprofiler2]{gorth}.
Only used when \code{method="gprofiler"} (\emph{DEFAULT : }\code{Inf}).}
    \item{\code{sort_rows}}{Sort \code{gene_df} rows alphanumerically.}
    \item{\code{gene_map}}{A \link[base]{data.frame} that maps the current gene names
to new gene names. 
This function's behaviour will adapt to different situations as follows: 
\describe{
\item{\code{gene_map=<data.frame>}}{When a data.frame containing the
gene key:value columns 
(specified by \code{input_col} and \code{output_col}, respectively)
is provided, this will be used to perform aggregation/expansion.}
\item{\code{gene_map=NULL} and \code{input_species!=output_species}}{
A \code{gene_map} is automatically generated by
 \link[orthogene]{map_orthologs} to perform inter-species 
 gene aggregation/expansion.}
\item{\code{gene_map=NULL} and \code{input_species==output_species}}{
A \code{gene_map} is automatically generated by
 \link[orthogene]{map_genes} to perform within-species 
 gene symbol standardization and aggregation/expansion.}
}}
    \item{\code{input_col}}{Column name within \code{gene_map} with gene names matching 
the row names of \code{X}.}
    \item{\code{output_col}}{Column name within \code{gene_map} with gene names
that you wish to map the row names of \code{X} onto.}
  }}
}
\value{
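File names for the saved CellTypeData (CTD) files.
If \code{return_ctd=TRUE}, a list containing the CTD object along with
the file name is returned instead.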
}
\description{
\code{generate_celltype_data} takes gene expression data and
cell type annotations and creates CellTypeData (CTD) files which
contain matrices of mean expression and specificity per cell type.
}
\examples{
# Load the single cell data
cortex_mrna <- ewceData::cortex_mrna()
# Use only a subset to keep the example quick
expData <- cortex_mrna$exp[1:100, ]
l1 <- cortex_mrna$annot$level1class
l2 <- cortex_mrna$annot$level2class
annotLevels <- list(l1 = l1, l2 = l2)
fNames_ALLCELLS <- EWCE::generate_celltype_data(
    exp = expData,
    annotLevels = annotLevels,
    groupName = "allKImouse"
)
}
