% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/All_Methods.R
\name{SWeePlite}
\alias{SWeePlite}
\alias{SWeePlite,AAStringSet-method}
\alias{SWeePlite,DNAStringSet-method}
\alias{SWeePlite,RNAStringSet-method}
\alias{SWeePlite,BStringSet-method}
\alias{SWeePlite,BString-method}
\alias{SWeePlite,character-method}
\alias{SWeePlite,array-method}
\alias{SWeePlite,integer-method}
\alias{SWeePlite,matrix-method}
\alias{SWeePlite,dgCMatrix-method}
\title{Spaced Words Projection lite}
\usage{
SWeePlite(input, psz, bin = FALSE, ncores = NULL, ...)

\S4method{SWeePlite}{AAStringSet}(
  input,
  psz = 1369,
  bin = FALSE,
  ncores = NULL,
  norm = "none",
  concatenate = FALSE,
  mask = NULL,
  seqtype = NULL,
  nk = 15000,
  verbose = TRUE
)

\S4method{SWeePlite}{DNAStringSet}(
  input,
  psz = 1369,
  bin = FALSE,
  ncores = NULL,
  norm = "none",
  concatenate = FALSE,
  mask = NULL,
  seqtype = NULL,
  nk = 15000,
  verbose = TRUE
)

\S4method{SWeePlite}{RNAStringSet}(
  input,
  psz = 1369,
  bin = FALSE,
  ncores = NULL,
  norm = "none",
  concatenate = FALSE,
  mask = NULL,
  seqtype = NULL,
  nk = 15000,
  verbose = TRUE
)

\S4method{SWeePlite}{BStringSet}(
  input,
  psz = 1369,
  bin = FALSE,
  ncores = NULL,
  norm = "none",
  concatenate = FALSE,
  mask = NULL,
  seqtype = NULL,
  nk = 15000,
  verbose = TRUE
)

\S4method{SWeePlite}{BString}(
  input,
  psz = 1369,
  bin = FALSE,
  ncores = NULL,
  norm = "none",
  concatenate = FALSE,
  mask = NULL,
  seqtype = NULL,
  nk = 15000,
  verbose = TRUE
)

\S4method{SWeePlite}{character}(
  input,
  psz = 1369,
  bin = FALSE,
  ncores = NULL,
  norm = "none",
  mask = NULL,
  extension = "",
  seqtype = "AA",
  lowRAMmode = TRUE,
  nk = 15000,
  verbose = TRUE
)

\S4method{SWeePlite}{array}(
  input,
  psz = 1369,
  bin = FALSE,
  ncores = NULL,
  transpose = FALSE,
  RNAseqdata = FALSE,
  norm = "none",
  nk = 15000,
  verbose = TRUE
)

\S4method{SWeePlite}{integer}(
  input,
  psz = 1369,
  bin = FALSE,
  ncores = NULL,
  transpose = FALSE,
  RNAseqdata = FALSE,
  norm = "none",
  nk = 15000,
  verbose = TRUE
)

\S4method{SWeePlite}{matrix}(
  input,
  psz = 1369,
  bin = FALSE,
  ncores = NULL,
  transpose = FALSE,
  RNAseqdata = FALSE,
  norm = "none",
  nk = 15000,
  verbose = TRUE
)

\S4method{SWeePlite}{dgCMatrix}(
  input,
  psz = 1369,
  bin = FALSE,
  ncores = NULL,
  transpose = FALSE,
  RNAseqdata = FALSE,
  norm = "none",
  nk = 15000,
  verbose = TRUE
)
}
\arguments{
\item{input}{There are four input formats available:
(a) `BStringSet' (variants: `AAStringSet', `RNAStringSet', `DNAStringSet'). Biological sequence format loaded in memory;
(b) `character' String containing a path to a folder with FASTA files;
(c) `dgCMatrix' Expression matrix loaded with Seurat package (mtx pattern).
(d) `matrix' (variants: `array',`integer'). Generic matrix.}

\item{psz}{projection size. Default 1369}

\item{bin}{binary mode (TRUE), or counting mode (FALSE) for HDV construction. Default is FALSE.}

\item{ncores}{Number of CPU cores used for parallel processing. Default is 2.}

\item{...}{other arguments of the function itself}

\item{norm}{normalization of HDV. This must be one of 'none', 'log' or 'logNeg'. 'none' is no normalization, 
'log' is simple logarithm, and 'logNeg' is simple logarithm with zeros converted to -1. The 'logNeg' option is indicated for analyzing genes and short sequences.
Default is 'none'.}

\item{concatenate}{defines whether to treat each sequence individually or to concatenate them into a single sequence.
Available only for inputs in biological sequence format. The default is FALSE.}

\item{mask}{reading mask. Available only for inputs in biological sequence format or path for FASTA files. Default c(2,1,2)}

\item{seqtype}{type of data: 'AA' for amino acid, 'NT' for nucleotide. Available only for inputs in biological sequence format or path for FASTA files. The default is AA.}

\item{nk}{Step size of HDV for parallel loop. Default is 15000.}

\item{verbose}{verbose mode. The default is TRUE}

\item{extension}{extension of files desired to concatenate (Optional).  Available only for input type path to folder with FASTA files.}

\item{lowRAMmode}{lowRAMmode reads one FASTA file at a time, which is suitable for reading large files individually when the machine's memory is limited. 
Recommended for large files such as complete eukaryotic genomes or proteomes. The default is TRUE.}

\item{transpose}{If the rows correspond to the samples and the columns correspond to the genes (mtx pattern), 
use transpose=FALSE. If the columns correspond to the samples, use transpose=TRUE. 
Available only for inputs of the expression matrix or generic matrix type. 
The default setting is FALSE}

\item{RNAseqdata}{For RNAseq data use 'TRUE' or apply the parameter `transpose=TRUE'. Default is FALSE.}
}
\value{
`SWeePlite' returns a `list' containing the following components:
\itemize{
  \item proj: a `numeric` matrix with `m` columns and one row per sequence, each row corresponding to a compact vector
  \item info: additional information of the process. This object is subdivided in: 
  \itemize{
  \item ProjectionSize: an `integer` corresponding to `psz`
  \item bin: a `boolean` indicating whether the mode is binary (TRUE) or counting (FALSE)
  \item mask: a `vector` containing the mask used
  \item SequenceType: a `character` containing the type of the sequence (amino acid: AA, or nucleotide: NT)
  \item concatenate : a `boolean` corresponding to the concatenation of sequences
  \item version : a `character` corresponding to the version of the package
  \item norm : a `character` containing the normalization used
  \item extension: a `character` containing the list of extensions considered
  \item timeElapsed: a `double` containing the elapsed time in seconds
  \item headers : list of headers for each analyzed sequence
} 
}
}
\description{
Spaced Words Projection version lite (SWeePlite) is an alignment-free method for the vector representation 
of the biological sequences (amino acid and nucleotide). Analogous to the 'SWeeP' function (De Pierri, 2020), 
'SWeePlite' has optimizations in its implementation that allow the use of larger read masks with low RAM 
consumption. It also eliminates the need to supply the orthonormal matrix (it is generated internally).
Each sequence provided is represented by a compact numerical vector which is easy to analyze. 
The method is based on k-mers counting and random projection. Details of the methodology can be found 
in the reference (De Pierri, 2020). The function allows general dimensionality reduction of RNAseq 
data and generic matrices.
}
\details{
The normalization option 'logNeg' applies a simple logarithm to the HDV matrix. 
Its difference from 'log' is the conversion of zeros to -1 in HDV.
}
\examples{

# get the path to the folder containing the FASTA files
path = paste (system.file("examples/aaMitochondrial/",package = "rSWeeP"),'/', sep = '')

# define the parameters
mask = c(2,1,2)
psz = 1369

# get the vectors that represent the sequences
LDV = SWeePlite(input=path,extension=c('.faa','.fas','.fasta'),
                psz = psz,mask=mask,bin=FALSE,seqtype='AA',ncores=2)

}
\references{
De Pierri, C. R., et al. (2020). SWeeP: representing large biological sequences datasets  
in compact vectors. Scientific reports, 10(1):1–10.
}
\author{
Camila P. Perico
}
