% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/methods.R
\name{group_sim}
\alias{group_sim}
\title{Semantic similarity between two groups of terms}
\usage{
group_sim(
  dag,
  group1,
  group2,
  method,
  control = list(),
  verbose = simona_opt$verbose
)
}
\arguments{
\item{dag}{An \code{ontology_DAG} object.}

\item{group1}{A vector of term names or a list of term vectors.}

\item{group2}{A vector of term names or a list of term vectors.}

\item{method}{A group similarity method. All available methods are in \code{\link[=all_group_sim_methods]{all_group_sim_methods()}}.}

\item{control}{A list of parameters passing to individual methods. The term similarity method is controlled by \code{term_sim_method}
and the IC method is controlled by \code{IC_method}. Other term similarity related parameters can also be specified in \code{control}. See the subsections.}

\item{verbose}{Whether to print messages.}
}
\value{
A numeric scalar, a numeric vector or a matrix depending on the data type of \code{group1} and \code{group2}.
}
\description{
Semantic similarity between two groups of terms
}
\details{
If \code{annotation} is set in \code{create_ontology_DAG()} and you want to directly calculate semantic similarity between two
annotated items, you can first get the associated terms of the two items by \code{\link[=annotated_terms]{annotated_terms()}}:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group1 = annotated_terms(dag, item1)[[1]]
group2 = annotated_terms(dag, item2)[[1]]
group_sim(dag, group1, group2, ...)
}\if{html}{\out{</div>}}
}
\section{Methods}{



\subsection{GroupSim_pairwise_avg}{

Denote \code{S(a, b)} as the semantic similarity between terms \code{a} and \code{b} where \code{a} is from \code{group1} and \code{b} is from \code{group2}.
The similarity between \code{group1} and \code{group2} is the average similarity of every pair of individual terms in the two groups:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = mean_\{a in group1, b in group2\}(S(a, b))
}\if{html}{\out{</div>}}

The term semantic similarity method and the IC method can be set via \code{control} argument:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim(dag, group1, group2, method = "GroupSim_pairwise_avg",
    control = list(term_sim_method = "Sim_Lin_1998", IC_method = "IC_annotation"))
}\if{html}{\out{</div>}}

Other parameters for the \code{term_sim_method} can also be set in the \code{control} list.

Paper link: \doi{10.1093/bioinformatics/btg153}.
}


\subsection{GroupSim_pairwise_max}{

This is the maximal \code{S(a, b)} among all pairs of terms in \code{group1} and \code{group2}:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = max_\{a in group1, b in group2\}(S(a, b))
}\if{html}{\out{</div>}}

The term semantic similarity method and the IC method can be set via \code{control} argument:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim(dag, group1, group2, method = "GroupSim_pairwise_max",
    control = list(term_sim_method = "Sim_Lin_1998", IC_method = "IC_annotation"))
}\if{html}{\out{</div>}}

Other parameters for the \code{term_sim_method} can also be set in the \code{control} list.

Paper link: \doi{10.1109/TCBB.2005.50}.
}


\subsection{GroupSim_pairwise_BMA}{

BMA stands for "best-match average". First define similarity of a term to a group of terms as

\if{html}{\out{<div class="sourceCode">}}\preformatted{S(x, group) = max_\{y in group\}(S(x, y))
}\if{html}{\out{</div>}}

which is the similarity between \code{x} and its most similar term in \code{group}.

Then the BMA similarity is calculated as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = 0.5*(mean_\{a in group1\}(S(a, group2)) + mean_\{b in group2\}(S(b, group1)))
}\if{html}{\out{</div>}}

So it is the average of the similarity of every term in \code{group1} to the whole \code{group2} and every term in \code{group2} to the whole \code{group1}.

The term semantic similarity method and the IC method can be set via \code{control} argument:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim(dag, group1, group2, method = "GroupSim_pairwise_BMA",
    control = list(term_sim_method = "Sim_Lin_1998", IC_method = "IC_annotation"))
}\if{html}{\out{</div>}}

Other parameters for the \code{term_sim_method} can also be set in the \code{control} list.

Paper link: \doi{10.1155/2012/975783}.
}


\subsection{GroupSim_pairwise_BMM}{

BMM stands for "best-match max". It is defined as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = max(mean_\{a in group1\}(S(a, group2)), mean_\{b in group2\}(S(b, group1)))
}\if{html}{\out{</div>}}

The term semantic similarity method and the IC method can be set via \code{control} argument:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim(dag, group1, group2, method = "GroupSim_pairwise_BMM",
    control = list(term_sim_method = "Sim_Lin_1998", IC_method = "IC_annotation"))
}\if{html}{\out{</div>}}

Other parameters for the \code{term_sim_method} can also be set in the \code{control} list.

Paper link: \doi{10.1186/1471-2105-7-302}.
}


\subsection{GroupSim_pairwise_ABM}{

ABM stands for "average best-match". It is defined as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = (sum_\{a in group1\}(S(a, group2)) + sum_\{b in group2\}(S(b, group1)))/(n1 + n2)
}\if{html}{\out{</div>}}

where \code{n1} and \code{n2} are the number of terms in \code{group1} and \code{group2}.

The term semantic similarity method and the IC method can be set via \code{control} argument:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim(dag, group1, group2, method = "GroupSim_pairwise_ABM",
    control = list(term_sim_method = "Sim_Lin_1998", IC_method = "IC_annotation"))
}\if{html}{\out{</div>}}

Other parameters for the \code{term_sim_method} can also be set in the \code{control} list.

Paper link: \doi{10.1186/1471-2105-14-284}.
}


\subsection{GroupSim_pairwise_HDF}{

First define the distance of a term to a group of terms:

\if{html}{\out{<div class="sourceCode">}}\preformatted{D(x, group) = 1 - S(x, group)
}\if{html}{\out{</div>}}

Then the Hausdorff distance between two groups is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{HDF = max(max_\{a in group1\}(D(a, group2)), max_\{b in group2\}(D(b, group1)))
}\if{html}{\out{</div>}}

The final similarity is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = 1 - HDF
}\if{html}{\out{</div>}}

The term semantic similarity method and the IC method can be set via \code{control} argument:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim(dag, group1, group2, method = "GroupSim_pairwise_HDF",
    control = list(term_sim_method = "Sim_Lin_1998", IC_method = "IC_annotation"))
}\if{html}{\out{</div>}}

Other parameters for the \code{term_sim_method} can also be set in the \code{control} list.
}


\subsection{GroupSim_pairwise_MHDF}{

Instead of using the maximal distance from a group to the other group, MHDF uses mean distance:

\if{html}{\out{<div class="sourceCode">}}\preformatted{MHDF = max(mean_\{a in group1\}(D(a, group2)), mean_\{b in group2\}(D(b, group1)))
}\if{html}{\out{</div>}}

The final similarity is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = 1 - MHDF
}\if{html}{\out{</div>}}

The term semantic similarity method and the IC method can be set via \code{control} argument:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim(dag, group1, group2, method = "GroupSim_pairwise_MHDF",
    control = list(term_sim_method = "Sim_Lin_1998", IC_method = "IC_annotation"))
}\if{html}{\out{</div>}}

Other parameters for the \code{term_sim_method} can also be set in the \code{control} list.

Paper link: \doi{10.1109/ICPR.1994.576361}.
}


\subsection{GroupSim_pairwise_VHDF}{

It is defined as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{VHDF = 0.5*(sqrt(mean_\{a in group1\}(D(a, group2)^2)) + sqrt(mean_\{b in group2\}(D(b, group1)^2)))
group_sim = 1 - VHDF
}\if{html}{\out{</div>}}

The term semantic similarity method and the IC method can be set via \code{control} argument:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim(dag, group1, group2, method = "GroupSim_pairwise_VHDF",
    control = list(term_sim_method = "Sim_Lin_1998", IC_method = "IC_annotation"))
}\if{html}{\out{</div>}}

Other parameters for the \code{term_sim_method} can also be set in the \code{control} list.

Paper link: \doi{10.1073/pnas.0702965104}.
}


\subsection{GroupSim_pairwise_Froehlich_2007}{

The similarity is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = exp(-HDF(group1, group2))
}\if{html}{\out{</div>}}

The term semantic similarity method and the IC method can be set via \code{control} argument:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim(dag, group1, group2, method = "GroupSim_pairwise_Froehlich_2007",
    control = list(term_sim_method = "Sim_Lin_1998", IC_method = "IC_annotation"))
}\if{html}{\out{</div>}}

Other parameters for the \code{term_sim_method} can also be set in the \code{control} list.

Paper link: \doi{10.1186/1471-2105-8-166}.
}


\subsection{GroupSim_pairwise_Joeng_2014}{

Similar to \emph{VHDF}, but it directly uses the similarity:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = 0.5*(sqrt(mean_\{a in group1\}(S(a, group2)^2)) + sqrt(mean_\{b in group2\}(S(b, group1)^2)))
}\if{html}{\out{</div>}}

The term semantic similarity method and the IC method can be set via \code{control} argument:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim(dag, group1, group2, method = "GroupSim_pairwise_Joeng_2014",
    control = list(term_sim_method = "Sim_Lin_1998", IC_method = "IC_annotation"))
}\if{html}{\out{</div>}}

Other parameters for the \code{term_sim_method} can also be set in the \code{control} list.

Paper link: \doi{10.1109/TCBB.2014.2343963}.
}


\subsection{GroupSim_SimALN}{

It is based on the average distances between every pair of terms in the two groups:

\if{html}{\out{<div class="sourceCode">}}\preformatted{exp(-mean_\{a in group1, b in group2\}(d(a, b)))
}\if{html}{\out{</div>}}

\code{d(a, b)} is the distance between \code{a} and \code{b}, which can be the shortest distance between the two terms or
the longest distance via LCA.

There is a parameter \code{distance} which takes value of "longest_distances_via_LCA" (the default) or "shortest_distances_via_NCA":

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim(dag, group1, group2, method = "GroupSim_SimALN",
    control = list(distance = "shortest_distances_via_NCA"))
}\if{html}{\out{</div>}}

Paper link: \doi{10.1109/ISCC.2008.4625763}.
}


\subsection{GroupSim_SimGIC}{

Denote \code{A} and \code{B} as the two sets of ancestor terms of terms in \code{group1} and \code{group2} respectively,
the SimGIC is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = sum_\{x in intersect(A, B)\}(IC(x))/sum_\{x in union(A, B)\}(IC(x))
}\if{html}{\out{</div>}}

IC method can be set via \code{control = list(IC_method = ...)}.
}


\subsection{GroupSim_SimDIC}{

Similar to \emph{GroupSim_SimGIC}, it calculates the Dice coefficient:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = 2*sum_\{x in intersect(A, B)\}(IC(x))/(sum_\{x in A\}(IC(x)) + sum_\{x in B\}(IC(x)))
}\if{html}{\out{</div>}}

IC method can be set via \code{control = list(IC_method = ...)}.
}


\subsection{GroupSim_SimUIC}{

Similar to \emph{GroupSim_SimGIC}, it is calculated as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = sum_\{x in intersect(A, B)\}(IC(x))/max(sum_\{x in A\}(IC(x)), sum_\{x in B\}(IC(x)))
}\if{html}{\out{</div>}}

IC method can be set via \code{control = list(IC_method = ...)}.
}


\subsection{GroupSim_SimUI}{

It is only based on the number of terms. \code{A} is the set of all ancestors of group1 terms and \code{B}
is the set of all ancestors of group2 terms.

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = length(intersect(A, B))/length(union(A, B))
}\if{html}{\out{</div>}}
}


\subsection{GroupSim_SimDB}{

It is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = 2*length(intersect(A, B))/(length(A) + length(B))
}\if{html}{\out{</div>}}
}


\subsection{GroupSim_SimUB}{

It is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = length(intersect(A, B))/max(length(A), length(B))
}\if{html}{\out{</div>}}
}


\subsection{GroupSim_SimNTO}{

It is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = length(intersect(A, B))/min(length(A), length(B))
}\if{html}{\out{</div>}}
}


\subsection{GroupSim_SimCOU}{

It is based on the dot product of two vectors \code{p} and \code{q} which correspond to terms in \code{group1} and \code{group2}.
\code{p} and \code{q} have the same length as the total number of terms. The value at position i in \code{p} or \code{q} corresponds to term \code{t}. The value
is \code{IC(t)} if \code{t} is an ancestor of any term in the corresponding group (\code{group1} for \code{p}, \code{group2} for \code{q}), and zero otherwise. The
similarity between \code{group1} terms and \code{group2} terms is calculated as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{<p,q>/||p||/||q||
}\if{html}{\out{</div>}}

where \verb{<p,q>} is the dot product between the two, and \verb{||p||} or \verb{||q||} is the norm of the vector.
The equation can be written as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = sum_\{x in intersect(A, B)\}(IC(x)^2) / 
              sqrt(sum_\{x in A\}(IC(x)^2)) / 
              sqrt(sum_\{x in B\}(IC(x)^2))
}\if{html}{\out{</div>}}

IC method can be set via \code{control = list(IC_method = ...)}.
}


\subsection{GroupSim_SimCOT}{

Similar to \emph{GroupSim_SimCOU}, the similarity is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{<p,q>/(||p||^2 + ||q||^2 - <p,q>)
}\if{html}{\out{</div>}}

And it can be rewritten as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = sum_\{x in intersect(A, B)\}(IC(x)^2) /
    (sum_\{x in A\}(IC(x)^2) + sum_\{x in B\}(IC(x)^2) - sum_\{x in intersect(A, B)\}(IC(x)^2))
}\if{html}{\out{</div>}}

IC method can be set via \code{control = list(IC_method = ...)}.
}


\subsection{GroupSim_SimLP}{

It is the longest depth for the terms in \code{intersect(A, B)}.

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = max(depth(intersect(A, B)))
}\if{html}{\out{</div>}}
}


\subsection{GroupSim_Ye_2005}{

It is a normalized version of \emph{GroupSim_SimLP}:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = max(depth(intersect(A, B)))/max_depth
}\if{html}{\out{</div>}}

The depth is divided by \code{max_depth} only, since the minimal depth is zero for the root.
}


\subsection{GroupSim_SimCHO}{

It is based on the annotated items. Denote \code{sigma(t)} as the total annotated items of \code{t}. The similarity is calculated as

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = log(C/sigma_max)/log(sigma_min/sigma_max)
}\if{html}{\out{</div>}}

where \code{C} is \verb{min(sigma_\{x in intersect(A, B)\}(x))}, i.e., the minimal sigma in the intersection of group1 and group2. Note that
here \code{A} and \code{B} are just the two sets of terms in \code{group1} and \code{group2}.
\code{sigma_max} is the total number of items annotated to the DAG, \code{sigma_min} is the minimal number of items annotated to a term, which
is mostly 1.
}


\subsection{GroupSim_SimALD}{

\code{A} and \code{B} are just two sets of terms in \code{group1} and \code{group2}. The similarity is calculated as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{group_sim = max_\{t in intersect(A, B)\}(1 - sigma(t)/N)
}\if{html}{\out{</div>}}
}


\subsection{GroupSim_Jaccard}{

Say \code{A} is the set of items annotated to terms in \code{group1} and \code{B} is the set of items annotated to \code{group2}.
This is the Jaccard coefficient between two sets.

The universe/background can be set via \code{control = list(universe = ...)}.
}


\subsection{GroupSim_Dice}{

It is the Dice coefficient between \code{A} and \code{B}.

The universe/background can be set via \code{control = list(universe = ...)}.
}


\subsection{GroupSim_Overlap}{

It is the Overlap coefficient between \code{A} and \code{B}.

The universe/background can be set via \code{control = list(universe = ...)}.
}


\subsection{GroupSim_Kappa}{

The universe/background can be set via \code{control = list(universe = ...)}.
}
}

\examples{
parents  = c("a", "a", "b", "b", "c", "d")
children = c("b", "c", "c", "d", "e", "f")
annotation = list(
    "a" = c("t1", "t2", "t3"),
    "b" = c("t3", "t4"),
    "c" = "t5",
    "d" = "t7",
    "e" = c("t4", "t5", "t6", "t7"),
    "f" = "t8"
)
dag = create_ontology_DAG(parents, children, annotation = annotation)
group_sim(dag, c("c", "e"), c("d", "f"), 
    method = "GroupSim_pairwise_avg", 
    control = list(term_sim_method = "Sim_Lin_1998")
)
}
