% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/group_labelling_functions.r
\name{make_ref_similarity_names}
\alias{make_ref_similarity_names}
\alias{make_ref_similarity_names_using_marked}
\title{make_ref_similarity_names}
\usage{
make_ref_similarity_names(de_table.test, de_table.ref, pval = 0.01,
  num_steps = 5, rankmetric = "TOP100_LOWER_CI_GTE1", n = 100)

make_ref_similarity_names_using_marked(de_table.ref.marked,
  de_table.recip.marked = NA, the_test_dataset = NA,
  the_ref_dataset = NA, pval = 0.01, num_steps = 5)
}
\arguments{
\item{de_table.test}{A differential expression table of the query 
experiment, as generated from 
\code{\link{contrast_each_group_to_the_rest}}}

\item{de_table.ref}{A differential expression table of the reference 
dataset, as generated from 
\code{\link{contrast_each_group_to_the_rest}}}

\item{pval}{Differences between the rescaled ranking distribution of 'top'
genes on different reference groups are tested with a Mann-Whitney U test. 
If they are \emph{significantly different}, 
only the top group(s) are reported. 
It isn't a simple cutoff threshold as it can change the number of similar 
groups reported.  i.e. A more stringent \bold{pval} is more likely to decide 
that groups are similar -
which would result in multiple group reporting, or no similarity at all.
Unlikely that this parameter will ever need to change. Default = 0.01.}

\item{num_steps}{After ranking reference groups according to median 'top' 
gene ranking, how many adjacent pairs to test for differences. 
Set to 1 to only compare each group to the next, or NA to perform an 
all-vs-all comparison. 
Setting too low may mean it is possible to miss groups with some similarity 
to the reported matches (\emph{similar_non_match} column).
Too high (or NA) with a large number of reference groups could be slow. 
Default = 5.}

\item{rankmetric}{Specify ranking method used to pick the
'top' genes. The default 'TOP100_LOWER_CI_GTE1' picks genes from the top 100
overrepresented genes (ranked by inner 95\% confidence interval) - appears to 
work best for distinct cell types (e.g. tissue samples). 'TOP100_SIG' again 
picks from the top 100 ranked genes, but requires only statistical 
significance, 95\% CI threshold - may perform better on more similar cell 
clusters (e.g. PBMCs).}

\item{n}{For tweaking maximum returned genes from different ranking methods.}

\item{de_table.ref.marked}{The output of 
\code{\link{get_the_up_genes_for_all_possible_groups}} for the contrast 
of interest.}

\item{de_table.recip.marked}{Optional. The (reciprocal) output of 
\code{\link{get_the_up_genes_for_all_possible_groups}} with the test and 
reference datasets swapped. 
If omitted a reciprocal test will not be done. Default = NA.}

\item{the_test_dataset}{Optional. A short meaningful name for the 
experiment. 
(Should match \emph{test_dataset} column in \bold{de_table.marked}). 
Only needed in a table of more than one dataset. Default = NA.}

\item{the_ref_dataset}{Optional. A short meaningful name for the 
experiment. 
(Should match \emph{dataset} column in \bold{de_table.marked}). 
Only needed in a table of more than one dataset. Default = NA.}
}
\value{
A table of automagically-generated labels for each query group, 
given their similarity to reference groups. 

The columns in this table:
\itemize{
  \item \bold{test_group} : Query group e.g. "c1"
  \item \bold{shortlab} : The cluster label described above e.g. 
  "c1:macrophage"
  \item \bold{pval} : If there is a similarity flagged, this is the P-value 
  from a Mann-Whitney U test from the last 'matched' group to the adjacent  
  'non-matched' group. I.e. if only one label in shortlab, this will be the 
  first of the stepped_pvals, if there are 2, it will be the second. 
  If there is 'no_similarity' this will be NA 
  (because there is no confidence in which 
  is the most appropriate of all the non-significant stepped p-values).
  \item \bold{stepped_pvals} : P-values from Mann-Whitney U tests across 
  adjacent pairs of reference groups ordered from most to least similar 
  (ascending median rank).
  i.e. 1st-2nd most similar first, 2nd-3rd, 3rd-4th etc. The last value 
  will always be NA (no more reference groups).
  e.g. 
  refA:8.44e-10,refB:2.37e-06,refC:0.000818,refD:0.435,refE:0.245,refF:NA
  \item \bold{pval_to_random} : P-value of test of median rank (of last 
  matched reference group) < random, from binomial test on top gene 
  ranks (being < 0.5). 
  \item \bold{matches} : List of all reference groups that 'match', 
  as described, except it also includes (rare) examples where 
  pval_to_random is not significant. "|" delimited.
  \item \bold{reciprocal_matches} : List of all reference groups that  
  flagged test group as a match when direction of comparison is reversed
  (significant pval and pval_to_random). "|" delimited.
  \item \bold{similar_non_match}: This column lists any reference groups 
  outside of shortlab that are not significantly different to a reported 
  match group. Limited by \emph{num_steps}, and will never find anything 
  if num_steps==1. "|" delimited. Usually NA.
  \item \bold{similar_non_match_detail} : P-values for any details about 
  similar_non_match results. These p-values will always be non-significant.
  E.g. "A > C (p=0.0214,n.s)". "|" delimited. Usually NA.
  \item \bold{differences_within} :  This field lists any pairs of 
  reference groups in shortlab that are significantly different. 
  "|" delimited. Usually NA.
}
}
\description{
Construct some sensible labels for the groups/clusters in the query dataset, 
based on similarity to the reference dataset.

This is a more low level/customisable version of 
\code{\link{make_ref_similarity_names}}, (would usually use that instead).  
Suitable for rare cases to reuse an existing \bold{de_table.ref.marked} 
object. Or use a \bold{de_table.ref.marked} table with more than one dataset
present (discouraged). Or to skip the reciprocal comparison step.
}
\details{
This function aims to report a) the top most similar reference group, if 
there's a clear frontrunner, b) A list of multiple similar groups if they 
have similar similarity, or c) 'No similarity', if there is none.

Each group is named according to the following rules. 
Testing for significant 
(smaller) differences with a one-directional Mann-Whitney U test on their 
rescaled ranks:
\enumerate{
  \item The first (as ranked by median rescaled rank) reference group is 
  significantly more similar than the next: Report \emph{first only}.
  \item When comparing differences between groups stepwise ranked by 
  median rescaled rank - no group is significantly different to its 
  neighbour: Report \emph{no similarity}
  \item There's no significant differences in the stepwise comparisons 
  of the first N reference groups - but there is a significant 
  difference later on : Report \emph{multiple group similarity}
}

There are some further heuristic caveats:
\enumerate{
  \item The distribution of top genes in the last (or only) match group is 
  tested versus a theoretical random distribution around 0.5 (as reported 
  in \emph{pval_to_random} column). If the distribution is not 
  significantly above random  
  (It is possible in edge cases where there is a skewed dataset
  and no/few matches),
  \emph{no similarity} is reported. The significant \emph{pval} column is 
  left intact.
  \item The comparison is repeated reciprocally - reference groups vs the 
  query groups. This helps sensitivity of heterogenous query groups - 
  and investigating the reciprocal matches can be informative in these 
  cases.
  If a query group doesn't 'match' a reference group, but the reference 
  group does match that query group - it is reported in the group label in 
  brackets.
  e.g. \emph{c1:th_lymphocytes(tc_lymphocytes)}. 
  It's even possible if there was no match (and pval = NA) 
  e.g. \emph{c2:(tc_lymphocytes)}
}



The similarity is formatted into a group label. Where there are 
multiple similar groups, they're listed from most to least similar by their 
median ranks.
 
For instance, a query dataset of clusters c1, c2, c3 and c4 against a 
cell-type labelled reference dataset might get names like:
E.g.
\itemize{
  \item c1:macrophage
  \item c2:endothelial|mesodermal
  \item c3:no_similarity
  \item c4:mesodermal(endothelial)
}

Function \code{make_ref_similarity_names} is a convenience wrapper function 
for \code{make_ref_similarity_names_using_marked}. It accepts two 'de_table' 
outputs of function \code{contrast_each_group_to_the_rest} to compare
and handles running
\code{\link{get_the_up_genes_for_all_possible_groups}}. 
Sister function \code{make_ref_similarity_names_using_marked} may (rarely) be 
of use if the \bold{de_table.marked} object has already been created, 
or if reciprocal tests are not wanted.
}
\section{Functions}{
\itemize{
\item \code{make_ref_similarity_names_using_marked}: Construct some sensible cluster 
labels, but using a premade marked table.
}}

\examples{

# Make input
# de_table.demo_query <- contrast_each_group_to_the_rest(demo_query_se, "demo_query")
# de_table.demo_ref   <- contrast_each_group_to_the_rest(demo_ref_se,   "demo_ref")

make_ref_similarity_names(de_table.demo_query, de_table.demo_ref)
make_ref_similarity_names(de_table.demo_query, de_table.demo_ref, num_steps=3)
make_ref_similarity_names(de_table.demo_query, de_table.demo_ref, num_steps=NA)


# Make input
# de_table.demo_query <- contrast_each_group_to_the_rest(demo_query_se, "demo_query")
# de_table.demo_ref   <- contrast_each_group_to_the_rest(demo_ref_se,   "demo_ref")

de_table.marked.query_vs_ref <- get_the_up_genes_for_all_possible_groups(
     de_table.demo_query, de_table.demo_ref) 
de_table.marked.reiprocal <- get_the_up_genes_for_all_possible_groups(
     de_table.demo_ref, de_table.demo_query)
     

make_ref_similarity_names_using_marked(de_table.marked.query_vs_ref, 
                                       de_table.marked.reiprocal)
                                       
make_ref_similarity_names_using_marked(de_table.marked.query_vs_ref)


}
\seealso{
\code{\link{contrast_each_group_to_the_rest}} For 
preparing de_table input

\code{\link{get_the_up_genes_for_all_possible_groups}} 
To prepare the \bold{de_table.ref.marked} input.
}
