% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{splitDataByGene}
\alias{splitDataByGene}
\title{Split methylation data into regions based on gene annotations}
\usage{
splitDataByGene(
  dat,
  chr,
  organism = "human",
  build = "hg38",
  types = "promoter",
  gap = -1,
  min.cpgs = 50,
  max.cpgs = 2000,
  verbose = TRUE
)
}
\arguments{
\item{dat}{a data frame with rows as individual CpGs appearing
in all the samples. The first 4 columns should contain the information of
\code{Meth_Counts} (methylated counts), \code{Total_Counts} (read depths),
\code{Position} (Genomic position for the CpG site) and \code{ID} (sample ID).
The covariate information, such as disease status or cell type composition,
is listed in column 5 and onwards.}

\item{chr}{character vector containing the chromosome information. Its length
should be equal to the number of rows in \code{dat}.}

\item{organism}{character defining the organism of interest.
By default, only Homo sapiens (\code{"human"}) is available.
Additional packages are required for Mus musculus (\code{"mouse"}),
Rattus norvegicus (\code{"rat"}) and Drosophila melanogaster (\code{"fly"}).
The matching is case-insensitive. The default value is \code{"human"}.}

\item{build}{character defining the version of the genome build on which the
methylation data have been mapped. By default, the build is set to
\code{"hg38"}; however, the build \code{"hg19"} is also available for
Homo sapiens.
Once the additional packages are installed, the following organisms and
builds are available:
\itemize{
\item \code{"mm9"} and \code{"mm10"} for Mus musculus;
\item \code{"rn4"}, \code{"rn5"} and \code{"rn6"} for Rattus norvegicus;
\item \code{"dm3"} and \code{"dm6"} for Drosophila melanogaster.
}}

\item{types}{character vector defining the type of genic annotations
to use among the following options:
\itemize{
\item \code{"upstream"} for the annotations included 1-5Kb upstream of the TSS;
\item \code{"promoter"} for the annotations included < 1Kb upstream of the TSS;
\item \code{"threeprime"} for the annotations included in 3' UTR;
\item \code{"fiveprime"} for the annotations included in the 5' UTR;
\item \code{"exon"} for the annotations included in the exons;
\item \code{"intron"} for the annotations included in the introns;
\item \code{"all"} for all the annotations aforementioned.
}
The default value is \code{"promoter"}.}

\item{gap}{this integer defines the maximum gap allowed between two regions
to be considered as overlapping.
According to the \code{GenomicRanges::findOverlaps} function,
the gap between 2 ranges is the number of positions that separate them.
The gap between 2 adjacent ranges is 0. By convention when one range has
its start or end strictly inside the other (i.e. non-disjoint ranges),
the gap is considered to be -1.
Decimal values will be rounded to the nearest integer.
The default value is \code{-1}.}

\item{min.cpgs}{positive integer defining the minimum number of
CpGs within a region for the algorithm to perform optimally.
The default value is 50.}

\item{max.cpgs}{positive integer defining the maximum number of
CpGs within a region for the algorithm to perform optimally.
The default value is 2000.}

\item{verbose}{logical indicating whether the algorithm should provide
progress report information.
The default value is TRUE.}
}
\value{
A named \code{list} of \code{data.frame} containing the data of each
independent region.
}
\description{
This function splits the methylation data into regions
based on the genes. The annotations come from the Bioconductor
package \code{annotatr}.
}
\examples{
#------------------------------------------------------------#
# Load the RAdat example data set
data(RAdat)
# Add a column containing the chromosome information
RAdat$Chr <- "chr4"
# Drop CpGs with a read depth of zero, then rows with missing values
RAdat.f <- na.omit(RAdat[RAdat$Total_Counts != 0, ])
# Split the data by gene annotation (default: promoters), passing the
# chromosome labels assigned above so that `chr` matches the data
results <- splitDataByGene(dat = RAdat.f,
chr = RAdat.f$Chr, verbose = FALSE)

}
\author{
Audrey Lemaçon
}
