#' @title Create the Profile GDS file(s) for one or multiple specific profiles
#' using the information from a RDS Sample description file and the 1KG
#' GDS file
#'
#' @description The function uses the information for the Reference GDS file
#' and the RDS Sample Description file to create the Profile GDS file. One
#' Profile GDS file is created per profile. One Profile GDS file will be
#' created for each entry present in the \code{listProfiles} parameter.
#'
#' @param pathGeno a \code{character} string representing the path to the
#' directory containing the VCF output of SNP-pileup for each sample. The
#' SNP-pileup files must be compressed (gz files) and have the name identifiers
#' of the samples. A sample with "Name.ID" identifier would have an
#' associated file called
#' if genoSource is "VCF", then "Name.ID.vcf.gz",
#' if genoSource is "generic", then "Name.ID.generic.txt.gz"
#' if genoSource is "snp-pileup", then "Name.ID.txt.gz".
#'
#' @param filePedRDS a \code{character} string representing the path to the
#' RDS file that contains the information about the sample to analyse.
#' The RDS file must
#' include a \code{data.frame} with those mandatory columns: "Name.ID",
#' "Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in
#' \code{character} strings. The \code{data.frame}
#' must contain the information for all the samples passed in the
#' \code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy}
#' can be defined.
#'
#' @param pedStudy a \code{data.frame} with those mandatory columns: "Name.ID",
#' "Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in
#' \code{character} strings (no factor). The \code{data.frame}
#' must contain the information for all the samples passed in the
#' \code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy}
#' can be defined.
#'
#' @param fileNameGDS a \code{character} string representing the file name of
#' the Reference GDS file. The file must exist.
#'
#' @param batch a single positive \code{integer} representing the current
#' identifier for the batch. Beware, this field is not stored anymore.
#' Default: \code{1}.
#'
#' @param studyDF a \code{data.frame} containing the information about the
#' study associated to the analysed sample(s). The \code{data.frame} must have
#' those 3 columns: "study.id", "study.desc", "study.platform". All columns
#' must be in \code{character} strings (no factor).
#'
#' @param listProfiles a \code{vector} of \code{character} string corresponding
#' to the profile identifiers that will have a Profile GDS file created. The
#' profile identifiers must be present in the "Name.ID" column of the Profile
#' RDS file passed to the \code{filePedRDS} parameter.
#' If \code{NULL}, all profiles present in the \code{filePedRDS} are selected.
#' Default: \code{NULL}.
#'
#' @param pathProfileGDS a \code{character} string representing the path to
#' the directory where the Profile GDS files will be created.
#' Default: \code{NULL}.
#'
#' @param genoSource a \code{character} string with two possible values:
#' 'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files
#' are generated by snp-pileup (Facets) or are a generic format CSV file
#' with at least those columns:
#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'.
#' The 'Count' is the depth at the specified position;
#' 'FileR' is the depth of the reference allele and
#' 'File1A' is the depth of the specific alternative allele.
#' Finally the file can be a VCF file with at least those genotype
#' fields: GT, AD, DP.
#'
#' @param verbose a \code{logical} indicating if message information should be
#' printed. Default: \code{FALSE}.
#'
#' @return The function returns \code{0L} when successful.
#'
#' @examples
#'
#' ## Path to the demo 1KG GDS file is located in this package
#' dataDir <- system.file("extdata/tests", package="RAIDS")
#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds")
#'
#' ## The data.frame containing the information about the study
#' ## The 3 mandatory columns: "study.id", "study.desc", "study.platform"
#' ## The entries should be strings, not factors (stringsAsFactors=FALSE)
#' studyDF <- data.frame(study.id = "MYDATA",
#'                         study.desc = "Description",
#'                         study.platform = "PLATFORM",
#'                         stringsAsFactors = FALSE)
#'
#' ## The data.frame containing the information about the samples
#' ## The entries should be strings, not factors (stringsAsFactors=FALSE)
#' samplePED <- data.frame(Name.ID=c("ex1", "ex2"),
#'                     Case.ID=c("Patient_h11", "Patient_h12"),
#'                     Diagnosis=rep("Cancer", 2),
#'                     Sample.Type=rep("Primary Tumor", 2),
#'                     Source=rep("Databank B", 2), stringsAsFactors=FALSE)
#' rownames(samplePED) <- samplePED$Name.ID
#'
#' ## Create the Profile GDS File for samples in 'listSamples' vector
#' ## (in this case, samples "ex1")
#' ## The Profile GDS file is created in the pathProfileGDS directory
#' result <- createStudy2GDS1KG(pathGeno=dataDir,
#'             pedStudy=samplePED, fileNameGDS=fileGDS,
#'             studyDF=studyDF, listProfiles=c("ex1"),
#'             pathProfileGDS=tempdir(),
#'             genoSource="snp-pileup",
#'             verbose=FALSE)
#'
#' ## The function returns OL when successful
#' result
#'
#' ## The Profile GDS file 'ex1.gds' has been created in the
#' ## specified directory
#' list.files(tempdir())
#'
#' ## Remove Profile GDS file (created for demo purpose)
#' unlink(file.path(tempdir(), "ex1.gds"), force=TRUE)
#'
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt createfn.gds put.attr.gdsn closefn.gds read.gdsn
#' @importFrom S4Vectors isSingleNumber
#' @importFrom rlang arg_match
#' @encoding UTF-8
#' @export
createStudy2GDS1KG <- function(pathGeno=file.path("data", "sampleGeno"),
                                filePedRDS=NULL, pedStudy=NULL, fileNameGDS,
                                batch=1, studyDF, listProfiles=NULL,
                                pathProfileGDS=NULL,
                                genoSource=c("snp-pileup", "generic", "VCF"),
                                verbose=FALSE) {

    ## When filePedRDS is defined and pedStudy is null
    if (!(is.null(filePedRDS)) && is.null(pedStudy)) {
        ## The filePedRDS must be a character string and the file must exists
        if (!(is.character(filePedRDS) && (file.exists(filePedRDS)))) {
            stop("The \'filePedRDS\' must be a character string representing",
                    " the RDS Sample information file. The file must exist.")
        }
        ## Open the RDS Sample information file
        pedStudy <- readRDS(file=filePedRDS)
    } else if (!(is.null(filePedRDS) || is.null(pedStudy))) {
        stop("Both \'filePedRDS\' and \'pedStudy\' parameters cannot be ",
                "defined at the same time.")
    } else if (is.null(filePedRDS) && is.null(pedStudy)) {
        stop("One of the parameter \'fineNamePED\' of \'pedStudy\' must ",
                "be defined.")
    }

    ## Validate input parameters
    validateCreateStudy2GDS1KG(pathGeno=pathGeno, pedStudy=pedStudy,
        fileNameGDS=fileNameGDS, batch=batch, studyDF=studyDF,
        listProfiles=listProfiles, pathProfileGDS=pathProfileGDS,
        genoSource=genoSource, verbose=verbose)

    genoSource <- arg_match(genoSource)

    ## Read the Reference GDS file
    gdsReference <- snpgdsOpen(filename=fileNameGDS)

    ## Extract the chromosome and position information for all SNPs in 1KG GDS
    snpCHR <- index.gdsn(node=gdsReference, "snp.chromosome")
    snpPOS <- index.gdsn(node=gdsReference, "snp.position")

    ## Create a data.frame containing the information
    listPos <- data.frame(snp.chromosome=read.gdsn(snpCHR),
                            snp.position=read.gdsn(snpPOS))

    if(verbose) {
        message("Start ", Sys.time())
        message("Sample info DONE ", Sys.time())
    }

    generateGDS1KGgenotypeFromSNPPileup(pathGeno=pathGeno,
        listSamples=listProfiles, listPos=listPos, offset=-1, minCov=10,
        minProb=0.999, seqError=0.001, dfPedProfile=pedStudy, batch=batch,
        studyDF=studyDF, pathProfileGDS=pathProfileGDS,
        genoSource=genoSource, verbose=verbose)

    if(verbose) {
        message("Genotype DONE ", Sys.time())
    }

    ## Close 1KG GDS file
    closefn.gds(gdsReference)

    ## Return successful code
    return(0L)
}


#' @title Compute the list of pruned SNVs for a specific profile using the
#' information from the Reference GDS file and a linkage disequilibrium
#' analysis
#'
#' @description This function computes the list of pruned SNVs for a
#' specific profile. When
#' a group of SNVs are in linkage disequilibrium, only one SNV from that group
#' is retained. The linkage disequilibrium is calculated with the
#' \code{\link[SNPRelate]{snpgdsLDpruning}}() function. The initial list of
#' SNVs that are passed to the \code{\link[SNPRelate]{snpgdsLDpruning}}()
#' function can be specified by the user.
#'
#' @param gdsReference an object of class \link[gdsfmt]{gds.class} (a GDS
#' file), the 1 KG GDS file (reference data set).
#'
#' @param method a \code{character} string that represents the method that will
#' be used to calculate the linkage disequilibrium in the
#' \code{\link[SNPRelate]{snpgdsLDpruning}}() function. The 4 possible values
#' are: "corr", "r", "dprime" and "composite". Default: \code{"corr"}.
#'
#' @param currentProfile  a \code{character} string
#' corresponding to the profile identifier used in LD pruning done by the
#' \code{\link[SNPRelate]{snpgdsLDpruning}}() function. A Profile GDS file
#' corresponding to the profile identifier must exist and be located in the
#' \code{pathProfileGDS} directory.
#'
#' @param studyID a \code{character} string corresponding to the study
#' identifier used in the \code{\link[SNPRelate]{snpgdsLDpruning}} function.
#' The study identifier must be present in the Profile GDS file.
#'
#' @param listSNP a \code{vector} of SNVs identifiers specifying selected to
#' be passed the the pruning function;
#' if \code{NULL}, all SNVs are used in the
#' \code{\link[SNPRelate]{snpgdsLDpruning}} function. Default: \code{NULL}.
#'
#' @param slideWindowMaxBP a single positive \code{integer} that represents
#' the maximum basepairs (bp) in the sliding window. This parameter is used
#' for the LD pruning done in the \code{\link[SNPRelate]{snpgdsLDpruning}}
#' function.
#' Default: \code{500000L}.
#'
#' @param thresholdLD a single \code{numeric} value that represents the LD
#' threshold used in the \code{\link[SNPRelate]{snpgdsLDpruning}} function.
#' Default: \code{sqrt(0.1)}.
#'
#' @param np a single positive \code{integer} specifying the number of
#' threads to be used. Default: \code{1L}.
#'
#' @param verbose a \code{logicial} indicating if information is shown
#' during the process in the \code{\link[SNPRelate]{snpgdsLDpruning}}
#' function.  Default: \code{FALSE}.
#'
#' @param chr a \code{character} string representing the chromosome where the
#' selected SNVs should belong. Only one chromosome can be handled. If
#' \code{NULL}, the chromosome is not used as a filtering criterion.
#' Default: \code{NULL}.
#'
#' @param superPopMinAF a single positive \code{numeric} representing the
#' minimum allelic frequency used to select the SNVs. If \code{NULL}, the
#' allelic frequency is not used as a filtering criterion. Default: \code{NULL}.
#'
#' @param keepPrunedGDS a \code{logicial} indicating if the information about
#' the pruned SNVs should be added to the GDS Sample file.
#' Default: \code{TRUE}.
#'
#' @param pathProfileGDS a \code{character} string representing the directory
#' where the Profile GDS files will be created. The directory must exist.
#'
#' @param keepFile a \code{logical} indicating if RDS files containing the
#' information about the pruned SNVs must be
#' created. Default: \code{FALSE}.
#'
#' @param pathPrunedGDS a \code{character} string representing an existing
#' directory. The directory must exist. Default: \code{"."}.
#'
#' @param outPrefix a \code{character} string that represents the prefix of the
#' RDS files that will be generated. The RDS files are only generated when
#' the parameter \code{keepFile}=\code{TRUE}. Default: \code{"pruned"}.
#'
#' @return The function returns \code{0L} when successful.
#'
#' @examples
#'
#' ## Required library for GDS
#' library(gdsfmt)
#'
#' ## Path to the demo Reference GDS file is located in this package
#' dataDir <- system.file("extdata/tests", package="RAIDS")
#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds")
#'
#' ## The data.frame containing the information about the study
#' ## The 3 mandatory columns: "study.id", "study.desc", "study.platform"
#' ## The entries should be strings, not factors (stringsAsFactors=FALSE)
#' studyDF <- data.frame(study.id = "MYDATA",
#'                         study.desc = "Description",
#'                         study.platform = "PLATFORM",
#'                         stringsAsFactors = FALSE)
#'
#' ## The data.frame containing the information about the samples
#' ## The entries should be strings, not factors (stringsAsFactors=FALSE)
#' samplePED <- data.frame(Name.ID = c("ex1", "ex2"),
#'                     Case.ID = c("Patient_h11", "Patient_h12"),
#'                     Diagnosis = rep("Cancer", 2),
#'                     Sample.Type = rep("Primary Tumor", 2),
#'                     Source = rep("Databank B", 2), stringsAsFactors = FALSE)
#' rownames(samplePED) <- samplePED$Name.ID
#'
#' ## Temporary Profile GDS file
#' profileFile <- file.path(tempdir(), "ex1.gds")
#'
#' ## Copy the Profile GDS file demo that has not been pruned yet
#' file.copy(file.path(dataDir, "ex1_demo.gds"), profileFile)
#'
#' ## Open 1KG file
#' gds1KG <- snpgdsOpen(fileGDS)
#'
#' ## Compute the list of pruned SNVs for a specific profile 'ex1'
#' ## and save it in the Profile GDS file 'ex1.gds'
#' pruningSample(gdsReference=gds1KG, currentProfile=c("ex1"),
#'               studyID = studyDF$study.id, pathProfileGDS=tempdir())
#'
#' ## Close the Reference GDS file (important)
#' closefn.gds(gds1KG)
#'
#' ## Check content of Profile GDS file
#' ## The 'pruned.study' entry should be present
#' content <- openfn.gds(profileFile)
#' content
#'
#' ## Close the Profile GDS file (important)
#' closefn.gds(content)
#'
#' ## Remove Profile GDS file (created for demo purpose)
#' unlink(profileFile, force=TRUE)
#'
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt index.gdsn read.gdsn
#' @importFrom rlang arg_match
#' @encoding UTF-8
#' @export
pruningSample <- function(gdsReference,
                            method=c("corr", "r", "dprime", "composite"),
                            currentProfile,
                            studyID,
                            listSNP=NULL,
                            slideWindowMaxBP=500000L,
                            thresholdLD=sqrt(0.1),
                            np=1L,
                            verbose=FALSE,
                            chr=NULL,
                            superPopMinAF=NULL,
                            keepPrunedGDS=TRUE,
                            pathProfileGDS=NULL,
                            keepFile=FALSE,
                            pathPrunedGDS=".", outPrefix="pruned") {

    ## Validate input parameters
    validatePruningSample(gdsReference=gdsReference, method=method,
        currentProfile=currentProfile, studyID=studyID, listSNP=listSNP,
        slideWindowMaxBP=slideWindowMaxBP, thresholdLD=thresholdLD, np=np,
        verbose=verbose, chr=chr, superPopMinAF=superPopMinAF,
        keepPrunedGDS=keepPrunedGDS, pathProfileGDS=pathProfileGDS,
        keepFile=keepFile, pathPrunedGDS=pathPrunedGDS, outPrefix=outPrefix)

    ## Matches a character method against a table of candidate values
    method <- arg_match(method)

    ## Profile GDS file name
    fileGDSSample <-  validateProfileGDSExist(pathProfile=pathProfileGDS,
                                                    profile=currentProfile)

    filePruned <- file.path(pathPrunedGDS, paste0(outPrefix, ".rds"))
    fileObj <- file.path(pathPrunedGDS, paste0(outPrefix, ".Obj.rds"))

    snp.id <- read.gdsn(node=index.gdsn(gdsReference, "snp.id"))
    sample.id <- read.gdsn(node=index.gdsn(gdsReference, "sample.id"))

    ## Open the GDS Sample file
    gdsSample <- openfn.gds(filename=fileGDSSample)

    ## Extract all study information from the GDS Sample file
    study.annot <- read.gdsn(node=index.gdsn(gdsSample, "study.annot"))

    ## Select study information associated to the current profile
    posSample <- which(study.annot$data.id == currentProfile &
                            study.annot$study.id == studyID)

    ## Check that the information is found for the specified profile and study
    if(length(posSample) != 1) {
        closefn.gds(gdsSample)
        stop("In pruningSample the profile \'", currentProfile,
                "\' doesn't exists for the study \'", studyID, "\'\n")
    }

    ## Get the SNV genotype information for the current profile
    g <- read.gdsn(index.gdsn(gdsSample, "geno.ref"),
                    start=c(1, posSample), count=c(-1,1))

    ## Close the Profile GDS file
    closefn.gds(gdsSample)

    listGeno <- which(g != 3)
    rm(g)

    listKeepPos <- listGeno

    ## Select SNVs based on the chromosome
    if (!is.null(chr)) {
        snpCHR <- read.gdsn(index.gdsn(gdsReference, "snp.chromosome"))
        listKeepPos <- intersect(which(snpCHR == chr), listKeepPos)
    }

    ## Select SNVs based on the minimum allele frequency in the populations
    if (!is.null(superPopMinAF)) {
        listTMP <- NULL
        for(sp in c("EAS", "EUR", "AFR", "AMR", "SAS")) {
            snpAF <- read.gdsn(index.gdsn(gdsReference,
                                            paste0("snp.", sp, "_AF")))
            listTMP <- union(listTMP,
                which(snpAF >= superPopMinAF & snpAF <= 1 - superPopMinAF))
        }
        listKeepPos <- intersect(listTMP, listKeepPos)
    }

    if (length(listKeepPos) == 0) {
        stop("In pruningSample, the sample ", currentProfile,
                " doesn't have SNPs after filters\n")
    }
    listKeep <- snp.id[listKeepPos]

    sample.ref <- read.gdsn(index.gdsn(gdsReference, "sample.ref"))
    listSamples <- sample.id[which(sample.ref == 1)]

    ## Use a LD analysis to generate a subset of SNPs
    snpset <- runLDPruning(gds=gdsReference, method=method,
        listSamples=listSamples, listKeep=listKeep,
        slideWindowMaxBP=slideWindowMaxBP, thresholdLD=thresholdLD,
        np=np, verbose=verbose)
    pruned <- unlist(snpset, use.names=FALSE)

    ## When TRUE, generate 2 RDS file with the pruned SNVs information
    if (keepFile) {
        saveRDS(pruned, filePruned)
        saveRDS(snpset, fileObj)
    }

    ## When TRUE, add the pruned SNvs information to the Profile GDS file
    if (keepPrunedGDS) {
        gdsSample <- openfn.gds(filename=fileGDSSample, readonly=FALSE)
        addGDSStudyPruning(gdsProfile=gdsSample, pruned=pruned)
        closefn.gds(gdsfile=gdsSample)
    }

    return(0L)
}


#' @title Add the genotype information for the list of pruned SNVs
#' into the Profile GDS file
#'
#' @description The function extracts the information about the pruned SNVs
#' from the 1KG GDS file and adds entries related to the pruned SNVs in
#' the Profile GDS file. The nodes are added to the Profile GDS file:
#' 'sample.id', 'snp.id', 'snp.chromosome', 'snp.position', 'snp.index',
#' 'genotype' and 'lap'.
#'
#' @param gdsReference an object of class
#' \link[gdsfmt]{gds.class} (a GDS file), the opened 1KG GDS file.
#'
#' @param fileProfileGDS a \code{character} string representing the path and
#' file name of the Profile GDS file. The Profile GDS file must exist.
#'
#' @param currentProfile a \code{character} string corresponding to the sample
#' identifier associated to the current list of pruned SNVs.
#'
#' @param studyID a \code{character} string corresponding to the study
#' identifier associated to the current list of pruned SNVs.
#'
#' @return The function returns \code{0L} when successful.
#'
#' @examples
#'
#' ## Required library for GDS
#' library(SNPRelate)
#'
#' ## Path to the demo 1KG GDS file is located in this package
#' dataDir <- system.file("extdata/tests", package="RAIDS")
#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds")
#'
#' ## The data.frame containing the information about the study
#' ## The 3 mandatory columns: "studyID", "study.desc", "study.platform"
#' ## The entries should be strings, not factors (stringsAsFactors=FALSE)
#' studyDF <- data.frame(study.id="MYDATA",
#'                         study.desc="Description",
#'                         study.platform="PLATFORM",
#'                         stringsAsFactors=FALSE)
#'
#' ## Temporary Profile file
#' fileProfile <- file.path(tempdir(), "ex2.gds")
#'
#' ## Copy required file
#' file.copy(file.path(dataDir, "ex1_demo_with_pruning.gds"),
#'         fileProfile)
#'
#' ## Open 1KG file
#' gds1KG <- snpgdsOpen(fileGDS)
#'
#' ## Compute the list of pruned SNVs for a specific profile 'ex1'
#' ## and save it in the Profile GDS file 'ex2.gds'
#' add1KG2SampleGDS(gdsReference=gds1KG,
#'         fileProfileGDS=fileProfile,
#'         currentProfile=c("ex1"),
#'         studyID=studyDF$study.id)
#'
#' ## Close the 1KG GDS file (important)
#' closefn.gds(gds1KG)
#'
#' ## Check content of Profile GDS file
#' ## The 'pruned.study' entry should be present
#' content <- openfn.gds(fileProfile)
#' content
#'
#' ## Close the Profile GDS file (important)
#' closefn.gds(content)
#'
#' ## Remove Profile GDS file (created for demo purpose)
#' unlink(fileProfile, force=TRUE)
#'
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt index.gdsn read.gdsn objdesp.gdsn
#' @encoding UTF-8
#' @export
add1KG2SampleGDS <- function(gdsReference, fileProfileGDS, currentProfile,
                                studyID) {

    ## Validate inputs
    validateAdd1KG2SampleGDS(gdsReference=gdsReference,
        gdsProfileFile=fileProfileGDS, currentProfile=currentProfile,
        studyID=studyID)

    ## Open Profile GDS file
    gdsSample <- openfn.gds(fileProfileGDS, readonly=FALSE)

    ## Extract needed information from 1KG GDS file
    snp.id <- read.gdsn(index.gdsn(gdsReference, "snp.id"))

    ## Extract list of pruned SNVs from the GDS Sample file
    pruned <- read.gdsn(index.gdsn(gdsSample, "pruned.study"))

    listSNP <- which(snp.id %in% pruned)
    listRef <- which(read.gdsn(index.gdsn(gdsReference, "sample.ref")) == 1)
    sample.id <- read.gdsn(index.gdsn(gdsReference, "sample.id"))

    snp.chromosome <- read.gdsn(index.gdsn(gdsReference,
                                            "snp.chromosome"))[listSNP]
    snp.position <-  read.gdsn(index.gdsn(gdsReference,
                                            "snp.position"))[listSNP]

    add.gdsn(gdsSample, "sample.id", c(sample.id[listRef], currentProfile))

    add.gdsn(gdsSample, "snp.id", snp.id[listSNP])
    add.gdsn(gdsSample, "snp.chromosome", snp.chromosome)
    add.gdsn(gdsSample, "snp.position", snp.position)
    # snp.index is the index of the snp pruned in snp.id from 1KG gds
    add.gdsn(gdsSample, "snp.index", listSNP)

    var.geno <- NULL

    j <- apply(matrix(c(seq_len(length(listRef)), listRef), ncol=2), 1,
            FUN=function(x, gdsReference,
                gdsSample, listSNP){
                i <- x[2]
                j <- x[1]

                g <- read.gdsn(index.gdsn(gdsReference, "genotype"),
                                start=c(1,i), count = c(-1,1))[listSNP]

                if(! ("genotype" %in% ls.gdsn(gdsSample))){
                    var.geno <- add.gdsn(gdsSample, "genotype",
                        valdim=c(length(listSNP), 1), g, storage="bit2")

                }else {
                    if(is.null(var.geno)) {
                        var.geno <- index.gdsn(gdsSample, "genotype")
                    }
                    append.gdsn(var.geno, g)
                }
                if(j %% 5 == 0) {
                    sync.gds(gdsSample)
                }
                return(NULL)
            },
            gdsReference=gdsReference,
            gdsSample=gdsSample,
            listSNP=listSNP)

    # add.gdsn(gdsSample, "SamplePos", objdesp.gdsn(index.gdsn(gdsSample,
    #  "genotype"))$dim[2] + 1,
    #          storage="int32")
    study.annot <- read.gdsn(index.gdsn(gdsSample, "study.annot"))

    posCur <- which(study.annot$data.id == currentProfile &
                            study.annot$study.id == studyID)
    if(is.null(var.geno)) {
        var.geno <- index.gdsn(gdsSample, "genotype")
    }
    g <- read.gdsn(index.gdsn(gdsSample, "geno.ref"), start=c(1, posCur),
                        count=c(-1, 1))[listSNP]
    append.gdsn(var.geno, g)

    add.gdsn(gdsSample, "lap",
        rep(0.5, objdesp.gdsn(index.gdsn(gdsSample, "genotype"))$dim[1]),
        storage="packedreal8")

    ## Close the GDS Sample file
    closefn.gds(gdsSample)

    return(0L)
}


#' @title Append information about the 1KG samples into
#' the Profile GDS file
#'
#' @description The information about the samples present in the 1KG GDS file
#' is added into the GDS Sample file. Only the information about the
#' unrelated samples
#' from the 1OOO Genome Study are copied into the GDS Sample file. The
#' information is only added to the GDS Sample file when the 1KG Study is not
#' already present in the GDS Sample file. The sample information for all
#' selected samples is appended to the GDS Sample file "study.annot" node.
#' The study information is appended to the GDS Sample file "study.list" node.
#'
#' @param gdsReference an object of class
#' \link[gdsfmt]{gds.class} (a GDS file), the opened 1KG GDS file.
#'
#' @param fileProfileGDS a \code{character} string representing the path and
#' file name of the GDS Sample file. The GDS Sample file must exist.
#'
#' @param verbose a \code{logical} indicating if messages should be printed
#' to show how the different steps in the function. Default: \code{FALSE}.
#'
#' @return The integer \code{0L} when successful.
#'
#' @examples
#'
#' ## Required library for GDS
#' library(gdsfmt)
#'
#' ## Get the temp folder
#' tempDir <- tempdir()
#'
#' ## Create a temporary 1KG GDS file and add needed information
#' fileName1KG <- file.path(tempDir, "GDS_TEMP_addStudy1Kg_1KG.gds")
#' gds1KG  <- createfn.gds(filename=fileName1KG)
#' add.gdsn(gds1KG, "sample.id", c("HTT101", "HTT102", "HTT103"))
#'
#' samples <- data.frame(sex=c(1, 1, 2), pop.group=c("GBR", "GIH", "GBR"),
#'     superPop=c("EUR", "SAS", "EUR"), batch=rep(0, 3),
#'     stringsAsFactors = FALSE)
#'
#' add.gdsn(gds1KG, "sample.annot", samples)
#' add.gdsn(gds1KG, "sample.ref", c(1,0, 1))
#' sync.gds(gds1KG)
#'
#' ## Create a temporary Profile GDS file
#' fileNameProfile <- file.path(tempDir, "GDS_TEMP_addStudy1Kg_Sample.gds")
#' gdsProfile <- createfn.gds(fileNameProfile)
#'
#' study.list <- data.frame(study.id=c("HTT Study"),
#'     study.desc=c("Important Study"),
#'     study.platform=c("Panel"), stringsAsFactors=FALSE)
#'
#' add.gdsn(gdsProfile, "study.list", study.list)
#'
#' study.annot <- data.frame(data.id=c("TOTO1"), case.id=c("TOTO1"),
#'                 sample.type=c("Study"), diagnosis=c("Study"),
#'                 source=rep("IGSR"), study.id=c("Study"),
#'                 stringsAsFactors=FALSE)
#'
#' add.gdsn(gdsProfile, "study.annot", study.annot)
#' sync.gds(gdsProfile)
#' closefn.gds(gdsProfile)
#'
#' ## Append information about the 1KG samples into the Profile GDS file
#' ## The Profile GDS file will contain 'study.list' and 'study.annot' entries
#' addStudy1Kg(gdsReference=gds1KG, fileProfileGDS=fileNameProfile,
#'     verbose=TRUE)
#'
#' closefn.gds(gds1KG)
#' unlink(fileNameProfile, recursive=TRUE, force=TRUE)
#' unlink(fileName1KG, recursive=TRUE, force=TRUE)
#' unlink(tempDir)
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn
#' @encoding UTF-8
#' @export
addStudy1Kg <- function(gdsReference, fileProfileGDS, verbose=FALSE) {

    ## Validate parameters
    validateAddStudy1Kg(gdsReference=gdsReference,
        fileProfileGDS=fileProfileGDS, verbose=verbose)

    ## Open GDS Sample file
    gdsSample <- openfn.gds(filename=fileProfileGDS, readonly=FALSE)

    ## Extract study information from GDS Sample file
    snp.study <- read.gdsn(index.gdsn(node=gdsSample, "study.list"))

    ## When the 1KG Study is not already present in the GDS Sample file
    if(length(which(snp.study$study.id == "Ref.1KG")) == 0) {

        ## Extract information about all samples from 1KG that are unrelated
        ## and can be used in the ancestry analysis
        sample.ref <- read.gdsn(index.gdsn(node=gdsReference, "sample.ref"))
        sample.id <- read.gdsn(index.gdsn(node=gdsReference,
                                        "sample.id"))[which(sample.ref == 1)]

        ## Create study information for the 1KG Study
        study.list <- data.frame(study.id="Ref.1KG",
            study.desc="Unrelated samples from 1000 Genomes",
            study.platform="GRCh38 1000 genotypes", stringsAsFactors=FALSE)

        ## Create the pedigree information  for the 1KG samples
        ped1KG <- data.frame(Name.ID=sample.id, Case.ID=sample.id,
            Sample.Type=rep("Reference", length(sample.id)),
            Diagnosis=rep("Reference", length(sample.id)),
            Source=rep("IGSR", length(sample.id)), stringsAsFactors=FALSE)

        ## Row names must be the sample identifiers
        rownames(ped1KG) <- ped1KG$Name.ID

        ## Add the information about the 1KG samples into the Profile GDS
        addStudyGDSSample(gdsProfile=gdsSample, pedProfile=ped1KG, batch=1,
                    listSamples=NULL, studyDF=study.list, verbose=verbose)

        sync.gds(gdsSample)
    }

    ## Close GDS Sample file (important)
    closefn.gds(gdsSample)

    ## Return success
    return(0L)
}


#' @title Project synthetic profiles onto existing principal component axes
#' generated using the reference 1KG profiles
#'
#' @description The function projects  the synthetic profiles onto existing
#' principal component axes generated using the reference 1KG profiles. The
#' reference profiles used to generate the synthetic profiles have previously
#' been removed from the set of reference profiles.
#'
#' @param gdsProfile an object of class \link[gdsfmt]{gds.class} (a GDS file),
#' an opened Profile GDS file.
#'
#' @param listPCA a \code{list} containing the PCA \code{object} generated
#' with the 1KG reference profiles (excluding the ones used to generate the
#' synthetic data set) in an entry called \code{"pca.unrel"}.
#'
#' @param sampleRef a \code{vector} of \code{character} strings representing
#' the identifiers of the 1KG reference profiles that have been used to
#' generate the synthetic profiles
#' that are going to be analysed here. The sub-continental
#' identifiers are used as names for the \code{vector}.
#'
#' @param studyIDSyn a \code{character} string corresponding to the study
#' identifier.
#' The study identifier must be present in the Profile GDS file.
#'
#' @param verbose a \code{logical} indicating if messages should be printed
#' to show how the different steps in the function. Default: \code{FALSE}.
#'
#' @return a \code{list} containing 3 entries:
#' \describe{
#' \item{sample.id}{ a \code{vector} of \code{character} strings representing
#' the identifiers of the synthetic profiles that have been projected onto
#' the 1KG PCA. }
#' \item{eigenvector.ref}{ a \code{matrix} of \code{numeric} with the
#' eigenvectors of the 1KG reference profiles used to generate the PCA.}
#' \item{eigenvector}{ a \code{matrix} of \code{numeric} with the
#' eigenvectors of the synthetic profiles projected onto the 1KG PCA. }
#' }
#'
#' @examples
#'
#' ## Required library
#' library(gdsfmt)
#'
#' ## Loading demo PCA on subset of 1KG reference dataset
#' data(demoPCA1KG)
#'
#' ## Path to the demo Profile GDS file is located in this package
#' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS")
#'
#' # The name of the synthetic study
#' studyID <- "MYDATA.Synthetic"
#'
#' samplesRM <- c("HG00246", "HG00325", "HG00611", "HG01173", "HG02165",
#'     "HG01112", "HG01615", "HG01968", "HG02658", "HG01850", "HG02013",
#'     "HG02465", "HG02974", "HG03814", "HG03445", "HG03689", "HG03789",
#'     "NA12751", "NA19107", "NA18548", "NA19075", "NA19475", "NA19712",
#'     "NA19731", "NA20528", "NA20908")
#' names(samplesRM) <- c("GBR", "FIN", "CHS","PUR", "CDX", "CLM", "IBS",
#'     "PEL", "PJL", "KHV", "ACB", "GWD", "ESN", "BEB", "MSL", "STU", "ITU",
#'     "CEU", "YRI", "CHB", "JPT", "LWK", "ASW", "MXL", "TSI", "GIH")
#'
#' ## Open the Profile GDS file
#' gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds"))
#'
#' ## Projects synthetic profiles on 1KG PCA
#' results <- computePCAMultiSynthetic(gdsProfile=gdsProfile,
#'     listPCA=demoPCA1KG,
#'     sampleRef=samplesRM, studyIDSyn=studyID, verbose=FALSE)
#'
#' ## The eigenvectors for the synthetic profiles
#' head(results$eigenvector)
#'
#' ## Close Profile GDS file (important)
#' closefn.gds(gdsProfile)
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt read.gdsn index.gdsn
#' @importFrom SNPRelate snpgdsPCASNPLoading snpgdsPCASampLoading
#' @encoding UTF-8
#' @export
computePCAMultiSynthetic <- function(gdsProfile, listPCA,
                                sampleRef, studyIDSyn, verbose=FALSE) {

    ## Validate the input parameters
    validateComputePCAMultiSynthetic(gdsProfile=gdsProfile,
        listPCA=listPCA, sampleRef=sampleRef, studyIDSyn=studyIDSyn,
        verbose=verbose)

    ## Identify profiles from synthetic data set
    study.annot <- read.gdsn(index.gdsn(gdsProfile, "study.annot"))
    study.annot <- study.annot[which(study.annot$study.id == studyIDSyn &
                                        study.annot$case.id %in% sampleRef),]

    ## SNP loading in principal component analysis
    listPCA[["snp.load"]] <- snpgdsPCASNPLoading(listPCA[["pca.unrel"]],
                                gdsobj=gdsProfile, num.thread=1,
                                verbose=verbose)

    ## Project synthetic profiles onto existing principal component axes
    listPCA[["samp.load"]] <- snpgdsPCASampLoading(listPCA[["snp.load"]],
                                gdsobj=gdsProfile,
                                sample.id=study.annot$data.id,
                                num.thread=1L, verbose=verbose)

    rownames(listPCA[["pca.unrel"]]$eigenvect) <-
                                            listPCA[["pca.unrel"]]$sample.id

    rownames(listPCA[["samp.load"]]$eigenvect) <-
                                            listPCA[["samp.load"]]$sample.id

    ## Return the eigenvectors for the 1KG reference profiles
    ## and the eigenvectors for the synthetic data set projected on the 1KG PCA
    listRes <- list(sample.id=listPCA[["samp.load"]]$sample.id,
                        eigenvector.ref=listPCA[["pca.unrel"]]$eigenvect,
                        eigenvector=listPCA[["samp.load"]]$eigenvect)

    return(listRes)
}


#' @title Project specified profile onto PCA axes generated using known
#' reference profiles
#'
#' @description This function generates a PCA using the know reference
#' profiles. Them, it projects the specified profile onto the PCA axes.
#'
#' @param gdsProfile an object of class \link[gdsfmt]{gds.class},
#' an opened Profile GDS file.
#'
#' @param currentProfile a single \code{character} string representing
#' the profile identifier.
#'
#' @param studyIDRef a single \code{character} string representing the
#' study identifier.
#'
#' @param np a single positive \code{integer} representing the number of CPU
#' that will be used. Default: \code{1L}.
#'
#' @param algorithm a \code{character} string representing the algorithm used
#' to calculate the PCA. The 2 choices are "exact" (traditional exact
#' calculation) and "randomized" (fast PCA with randomized algorithm
#' introduced in Galinsky et al. 2016). Default: \code{"exact"}.
#'
#' @param eigenCount a single \code{integer} indicating the number of
#' eigenvectors that will be in the output of the \link[SNPRelate]{snpgdsPCA}
#' function; if 'eigen.cnt' <= 0, then all eigenvectors are returned.
#' Default: \code{32L}.
#'
#' @param missingRate a \code{numeric} value representing the threshold
#' missing rate at with the SNVs are discarded; the SNVs are retained in the
#' \link[SNPRelate]{snpgdsPCA}
#' with "<= missingRate" only; if \code{NaN}, no missing threshold.
#' Default: \code{NaN}.
#'
#' @param verbose a \code{logical} indicating if messages should be printed
#' to show how the different steps in the function. Default: \code{FALSE}.
#'
#' @references
#'
#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ,
#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution
#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72.
#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
#'
#' @return a \code{list} containing 3 entries:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the analyzed profile.}
#' \item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric}
#' representing the eigenvectors of the reference profiles. }
#' \item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} representing
#' the eigenvectors of the analyzed profile. }
#' }
#'
#' @examples
#'
#' ## Required library
#' library(gdsfmt)
#'
#' ## Path to the demo Profile GDS file is located in this package
#' dataDir <- system.file("extdata/demoAncestryCall", package="RAIDS")
#'
#' ## Open the Profile GDS file
#' gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds"))
#'
#' ## Project a profile onto a PCA generated using reference profiles
#' ## The reference profiles come from 1KG
#' resPCA <- computePCARefSample(gdsProfile=gdsProfile,
#'     currentProfile=c("ex1"), studyIDRef="Ref.1KG", np=1L, verbose=FALSE)
#' resPCA$sample.id
#' resPCA$eigenvector
#'
#' ## Close the GDS files (important)
#' closefn.gds(gdsProfile)
#'
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt read.gdsn index.gdsn
#' @importFrom SNPRelate snpgdsPCA snpgdsPCASNPLoading snpgdsPCASampLoading
#' @encoding UTF-8
#' @export
computePCARefSample <- function(gdsProfile, currentProfile,
                            studyIDRef="Ref.1KG",
                            np=1L, algorithm=c("exact","randomized"),
                            eigenCount=32L, missingRate=NaN, verbose=FALSE) {

    ## Validate parameters
    validateComputePCARefSample(gdsProfile=gdsProfile,
        currentProfile=currentProfile, studyIDRef=studyIDRef, np=np,
        algorithm=algorithm, eigenCount=eigenCount, missingRate=missingRate,
        verbose=verbose)

    ## Set algorithm
    algorithm <- match.arg(algorithm)

    sampleID <- read.gdsn(index.gdsn(gdsProfile, "sample.id"))

    samplePos <- which(sampleID == currentProfile)

    studyAnnotAll <- read.gdsn(index.gdsn(gdsProfile, "study.annot"))

    sampleUnrel <- studyAnnotAll[which(studyAnnotAll$study.id ==
                                                studyIDRef), "data.id"]

    listPCA <- list()

    listPCA[["pruned"]] <- read.gdsn(index.gdsn(gdsProfile, "pruned.study"))

    ## Calculate the eigenvectors and eigenvalues for PCA done with the
    ## reference profiles
    listPCA[["pca.unrel"]] <- snpgdsPCA(gdsobj=gdsProfile,
            sample.id=sampleUnrel, snp.id=listPCA[["pruned"]], num.thread=np,
            algorithm=algorithm, eigen.cnt=eigenCount,
            missing.rate=missingRate, verbose=verbose)

    ## calculate the SNP loadings in PCA
    listPCA[["snp.load"]] <- snpgdsPCASNPLoading(listPCA[["pca.unrel"]],
            gdsobj=gdsProfile, num.thread=np, verbose=verbose)

    ## Project specified profile onto the PCA axes
    listPCA[["samp.load"]] <- snpgdsPCASampLoading(listPCA[["snp.load"]],
            gdsobj=gdsProfile, sample.id=sampleID[samplePos],
                                num.thread=np, verbose=verbose)

    rownames(listPCA[["pca.unrel"]]$eigenvect) <-
                                        listPCA[["pca.unrel"]]$sample.id
    rownames(listPCA[["samp.load"]]$eigenvect) <-
                                        listPCA[["samp.load"]]$sample.id

    listRes <- list(sample.id=sampleID[samplePos],
                        eigenvector.ref=listPCA[["pca.unrel"]]$eigenvect,
                        eigenvector=listPCA[["samp.load"]]$eigenvect)

    return(listRes)
}


#' @title Run a k-nearest neighbors analysis on a subset of the
#' synthetic dataset
#'
#' @description The function runs k-nearest neighbors analysis on a
#' subset of the synthetic data set. The function uses the 'knn' package.
#'
#' @param gdsProfile an object of class
#' \code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, the
#' opened Profile GDS file.
#'
#' @param listEigenvector a \code{list} with 3 entries:
#' 'sample.id', 'eigenvector.ref' and 'eigenvector'. The \code{list} represents
#' the PCA done on the 1KG reference profiles and the synthetic profiles
#' projected onto it.
#'
#' @param listCatPop a \code{vector} of \code{character} string
#' representing the list of possible ancestry assignations. Default:
#' \code{c("EAS", "EUR", "AFR", "AMR", "SAS")}.
#'
#' @param studyIDSyn a \code{character} string corresponding to the study
#' identifier.
#' The study identifier must be present in the Profile GDS file.
#'
#' @param spRef \code{vector} of \code{character} strings representing the
#' known super population ancestry for the 1KG profiles. The 1KG profile
#' identifiers are used as names for the \code{vector}.
#'
#' @param fieldPopInfAnc a \code{character} string representing the name of
#' the column that will contain the inferred ancestry for the specified
#' data set. Default: \code{"SuperPop"}.
#'
#' @param kList  a \code{vector} of \code{integer} representing  the list of
#' values tested for the  K parameter. The K parameter represents the
#' number of neighbors used in the K-nearest neighbors analysis. If
#' \code{NULL}, the value \code{seq(2, 15, 1)} is assigned.
#' Default: \code{seq(2, 15, 1)}.
#'
#' @param pcaList a \code{vector} of \code{integer} representing  the list of
#' values tested for the  D parameter. The D parameter represents the
#' number of dimensions used in the PCA analysis.  If \code{NULL},
#' the value \code{seq(2, 15, 1)} is assigned.
#' Default: \code{seq(2, 15, 1)}.
#'
#' @return a \code{list} containing 4 entries:
#' \describe{
#' \item{\code{sample.id}}{ a \code{vector} of \code{character} strings
#' representing the identifiers of the synthetic profiles analysed.}
#' \item{\code{sample1Kg}}{ a \code{vector} of \code{character} strings
#' representing the identifiers of the 1KG reference profiles used to
#' generate the synthetic profiles.}
#' \item{\code{sp}}{ a \code{vector} of \code{character} strings representing
#' the known super population ancestry of the 1KG reference profiles used
#' to generate the synthetic profiles.}
#' \item{\code{matKNN}}{ a \code{data.frame} containing the super population
#' inference for each synthetic profiles for different values of PCA
#' dimensions \code{D} and k-neighbors values \code{K}. The fourth column title
#' corresponds to the \code{fieldPopInfAnc} parameter.
#' The \code{data.frame} contains 4 columns:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing
#' the identifier of the synthetic profile analysed.}
#' \item{\code{D}}{ a \code{numeric} strings representing
#' the value of the PCA dimension used to infer the super population.}
#' \item{\code{K}}{ a \code{numeric} strings representing
#' the value of the k-neighbors used to infer the super population.}
#' \item{\code{fieldPopInfAnc} value}{ a \code{character} string representing
#' the inferred ancestry.}
#' }
#' }
#' }
#'
#' @examples
#'
#' ## Required library
#' library(gdsfmt)
#'
#' ## Load the demo PCA on the synthetic profiles projected on the
#' ## demo 1KG reference PCA
#' data(demoPCASyntheticProfiles)
#'
#' ## Load the known ancestry for the demo 1KG reference profiles
#' data(demoKnownSuperPop1KG)
#'
#' ## Path to the demo Profile GDS file is located in this package
#' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS")
#'
#' ## Open the Profile GDS file
#' gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds"))
#'
#' # The name of the synthetic study
#' studyID <- "MYDATA.Synthetic"
#'
#' ## Projects synthetic profiles on 1KG PCA
#' results <- computeKNNRefSynthetic(gdsProfile=gdsProfile,
#'     listEigenvector=demoPCASyntheticProfiles,
#'     listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), studyIDSyn=studyID,
#'     spRef=demoKnownSuperPop1KG)
#'
#' ## The inferred ancestry for the synthetic profiles for different values
#' ## of D and K
#' head(results$matKNN)
#'
#' ## Close Profile GDS file (important)
#' closefn.gds(gdsProfile)
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt read.gdsn index.gdsn
#' @importFrom class knn
#' @encoding UTF-8
#' @export
computeKNNRefSynthetic <- function(gdsProfile, listEigenvector,
                listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"),
                studyIDSyn, spRef, fieldPopInfAnc="SuperPop",
                kList=seq(2, 15, 1), pcaList=seq(2, 15, 1)) {

    ## Assign default value if kList is NULL
    if(is.null(kList)) {
        kList <- seq(2, 15, 1)
    }

    ## Assign default value if pcaList is NULL
    if(is.null(pcaList)) {
        pcaList <- seq(2, 15, 1)
    }

    ## Validate the input parameters
    validateComputeKNNRefSynthetic(gdsProfile=gdsProfile,
        listEigenvector=listEigenvector,
        listCatPop=listCatPop, studyIDSyn=studyIDSyn, spRef=spRef,
        fieldPopInfAnc=fieldPopInfAnc, kList=kList, pcaList=pcaList)

    ## Get study information from the GDS Sample file
    studyAnnotAll <- read.gdsn(index.gdsn(gdsProfile, "study.annot"))

    studyAnnot <- studyAnnotAll[which(studyAnnotAll$study.id ==
                        studyIDSyn & studyAnnotAll$data.id %in%
                                                listEigenvector$sample.id), ]

    listMat <- list()
    for(i in seq_len(length(listEigenvector$sample.id))){
        resMat <- data.frame(sample.id=rep(listEigenvector$sample.id[i],
                                            length(pcaList) * length(kList)),
                                D=rep(0,length(pcaList) * length(kList)),
                                K=rep(0,length(pcaList) * length(kList)),
                                stringsAsFactors=FALSE)
        resMat[[fieldPopInfAnc]] <- character(length(pcaList) * length(kList))

        eigenvect <- rbind(listEigenvector$eigenvector.ref,
                                listEigenvector$eigenvector[i,,drop=FALSE])

        totR <- 1
        for (pcaD in pcaList) {
            for(kV in  seq_len(length(kList))) {
                dCur <- paste0("d", pcaD)
                kCur <- paste0("k", kList[kV])
                resMat[totR,c("D", "K")] <- c(pcaD, kList[kV])

                pcaND <- eigenvect[ ,seq_len(pcaD)]
                yPred <-
                    knn(train=pcaND[rownames(eigenvect)[-1*nrow(eigenvect)],],
                    test=pcaND[rownames(eigenvect)[nrow(eigenvect)],,
                                                                drop=FALSE],
                    cl=factor(spRef[rownames(eigenvect)[-1*nrow(eigenvect)]],
                                        levels=listCatPop, labels=listCatPop),
                    k=kList[kV],
                    prob=FALSE)

                resMat[totR, fieldPopInfAnc] <- listCatPop[as.integer(yPred)]

                totR <- totR + 1
            } # end k
        } # end pca Dim
        listMat[[i]] <- resMat
    }
    resMat <- do.call(rbind, listMat)

    listKNN <- list(sample.id=listEigenvector$sample.id,
                    sample1Kg=studyAnnot$case.id,
                    sp=spRef[studyAnnot$case.id], matKNN=resMat)

    return(listKNN)
}


#' @title Run a k-nearest neighbors analysis on one specific profile
#'
#' @description The function runs k-nearest neighbors analysis on a
#' one specific profile. The function uses the 'knn' package.
#'
#' @param listEigenvector a \code{list} with 3 entries:
#' 'sample.id', 'eigenvector.ref' and 'eigenvector'. The \code{list} represents
#' the PCA done on the 1KG reference profiles and one specific profile
#' projected onto it. The 'sample.id' entry must contain only one identifier
#' (one profile).
#'
#' @param listCatPop a \code{vector} of \code{character} string
#' representing the list of possible ancestry assignations. Default:
#' \code{c("EAS", "EUR", "AFR", "AMR", "SAS")}.
#'
#' @param spRef \code{vector} of \code{character} strings representing the
#' known super population ancestry for the 1KG profiles. The 1KG profile
#' identifiers are used as names for the \code{vector}.
#'
#' @param fieldPopInfAnc a \code{character} string representing the name of
#' the column that will contain the inferred ancestry for the specified
#' profile. Default: \code{"SuperPop"}.
#'
#' @param kList a \code{vector} of \code{integer} representing  the list of
#' values tested for the _K_ parameter. The _K_ parameter represents the
#' number of neighbors used in the K-nearest neighbor analysis. If \code{NULL},
#' the value \code{seq(2,15,1)} is assigned.
#' Default: \code{seq(2,15,1)}.
#'
#' @param pcaList a \code{vector} of \code{integer} representing  the list of
#' values tested for the _D_ parameter. The D parameter represents the
#' number of dimensions used in the PCA analysis.  If \code{NULL},
#' the value \code{seq(2, 15, 1)} is assigned.
#' Default: \code{seq(2, 15, 1)}.
#'
#' @return a \code{list} containing 4 entries:
#' \describe{
#' \item{\code{sample.id}}{ a \code{vector} of \code{character} strings
#' representing the identifier of the profile analysed.}
#' \item{\code{matKNN}}{ a \code{data.frame} containing the super population
#' inference for the profile for different values of PCA
#' dimensions \code{D} and k-neighbors values \code{K}. The fourth column title
#' corresponds to the \code{fieldPopInfAnc} parameter.
#' The \code{data.frame} contains 4 columns:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing
#' the identifier of the profile analysed.}
#' \item{\code{D}}{ a \code{numeric} strings representing
#' the value of the PCA dimension used to infer the ancestry.}
#' \item{\code{K}}{ a \code{numeric} strings representing
#' the value of the k-neighbors used to infer the ancestry..}
#' \item{\code{fieldPopInfAnc}}{ a \code{character} string representing
#' the inferred ancestry.}
#' }
#' }
#' }
#'
#' @examples
#'
#' ## Load the demo PCA on the synthetic profiles projected on the
#' ## demo 1KG reference PCA
#' data(demoPCASyntheticProfiles)
#'
#' ## Load the known ancestry for the demo 1KG reference profiles
#' data(demoKnownSuperPop1KG)
#'
#' ## The PCA with 1 profile projected on the 1KG reference PCA
#' ## Only one profile is retained
#' pca <- demoPCASyntheticProfiles
#' pca$sample.id <- pca$sample.id[1]
#' pca$eigenvector <- pca$eigenvector[1, , drop=FALSE]
#'
#' ## Projects profile on 1KG PCA
#' results <- computeKNNRefSample(listEigenvector=pca,
#'     listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"),
#'     spRef=demoKnownSuperPop1KG, fieldPopInfAnc="SuperPop",
#'     kList=seq(10, 15, 1), pcaList=seq(10, 15, 1))
#'
#' ## The assigned ancestry to the profile for different values of K and D
#' head(results$matKNN)
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom class knn
#' @encoding UTF-8
#' @export
computeKNNRefSample <- function(listEigenvector,
                            listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"),
                            spRef, fieldPopInfAnc="SuperPop",
                            kList=seq(2, 15, 1), pcaList=seq(2, 15, 1)) {

    if(is.null(kList)){
        kList <- seq(2, 15, 1)
    }
    if(is.null(pcaList)){
        pcaList <- seq(2, 15, 1)
    }

    ## Validate the input parameters
    validateComputeKNNRefSample(listEigenvector=listEigenvector,
        listCatPop=listCatPop, spRef=spRef, fieldPopInfAnc=fieldPopInfAnc,
        kList=kList, pcaList=pcaList)

    resMat <- data.frame(sample.id=rep(listEigenvector$sample.id,
                                        length(pcaList) * length(kList)),
                            D=rep(0,length(pcaList) * length(kList)),
                            K=rep(0,length(pcaList) * length(kList)),
                            stringsAsFactors=FALSE)
    resMat[[fieldPopInfAnc]] <- character(length(pcaList) * length(kList))

    eigenvect <- rbind(listEigenvector$eigenvector.ref,
                            listEigenvector$eigenvector)

    totR <- 1
    for(pcaD in pcaList) {
        for(kV in  seq_len(length(kList))) {
            dCur <- paste0("d", pcaD)
            kCur <- paste0("k", kList[kV])
            resMat[totR,c("D", "K")] <- c(pcaD, kList[kV])

            pcaND <- eigenvect[ ,seq_len(pcaD)]
            yPred <- knn(train=pcaND[rownames(eigenvect)[-1*nrow(eigenvect)],],
                    test=pcaND[rownames(eigenvect)[nrow(eigenvect)],,
                                drop=FALSE],
                    cl=factor(spRef[rownames(eigenvect)[-1*nrow(eigenvect)]],
                                    levels=listCatPop, labels=listCatPop),
                    k=kList[kV],
                    prob=FALSE)

            resMat[totR, fieldPopInfAnc] <- listCatPop[as.integer(yPred)]

            totR <- totR + 1
        } # end k
    } # end pca Dim

    listKNN <- list(sample.id=listEigenvector$sample.id, matKNN=resMat)

    return(listKNN)
}


#' @title Run a PCA analysis and a K-nearest neighbors analysis on a small set
#' of synthetic data using all 1KG profiles except the ones used to generate
#' the synthetic profiles
#'
#' @description The function runs a PCA analysis using 1 synthetic profile
#' from each sub-continental population. The reference profiles used to
#' create those synthetic profiles are first removed from the list
#' of 1KG reference profiles that generates the reference PCA. Then, the
#' retained synthetic
#' profiles are projected on the 1KG PCA space. Finally, a K-nearest neighbors
#' analysis using a range of K and D values is done.
#'
#' @param gdsProfile an object of class
#' \code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, the
#' opened Profile GDS file.
#'
#' @param sampleRM a \code{vector} of \code{character} strings representing
#' the identifiers of the 1KG reference profiles that should not be used to
#' create the reference PCA. There should be one per sub-continental
#' population. Those profiles are
#' removed because those have been used to generate the synthetic profiles
#' that are going to be analysed here. The sub-continental
#' identifiers are used as names for the \code{vector}.
#'
#' @param spRef \code{vector} of \code{character} strings representing the
#' known super population ancestry for the 1KG profiles. The 1KG profile
#' identifiers are used as names for the \code{vector}.
#'
#' @param studyIDSyn a \code{character} string corresponding to the study
#' identifier.
#' The study identifier must be present in the Profile GDS file.
#'
#' @param np a single positive \code{integer} representing the number of
#' threads. Default: \code{1L}.
#'
#' @param listCatPop a \code{vector} of \code{character} string
#' representing the list of possible ancestry assignations. Default:
#' \code{("EAS", "EUR", "AFR", "AMR", "SAS")}.
#'
#' @param fieldPopInfAnc a \code{character} string representing the name of
#' the column that will contain the inferred ancestry for the specified
#' dataset. Default: \code{"SuperPop"}.
#'
#' @param kList a \code{vector} of \code{integer} representing  the list of
#' values tested for the  _K_ parameter. The _K_ parameter represents the
#' number of neighbors used in the K-nearest neighbor analysis. If \code{NULL},
#' the value \code{seq(2,15,1)} is assigned.
#' Default: \code{seq(2,15,1)}.
#'
#' @param pcaList a \code{vector} of \code{integer} representing  the list of
#' values tested for the  _D_ parameter. The _D_ parameter represents the
#' number of dimensions used in the PCA analysis.  If \code{NULL},
#' the value \code{seq(2,15,1)} is assigned.
#' Default: \code{seq(2,15,1)}.
#'
#' @param algorithm a \code{character} string representing the algorithm used
#' to calculate the PCA. The 2 choices are "exact" (traditional exact
#' calculation) and "randomized" (fast PCA with randomized algorithm
#' introduced in Galinsky et al. 2016). Default: \code{"exact"}.
#'
#' @param eigenCount a single \code{integer} indicating the number of
#' eigenvectors that will be in the output of the \link[SNPRelate]{snpgdsPCA}
#' function; if 'eigenCount' <= 0, then all eigenvectors are returned.
#' Default: \code{32L}.
#'
#' @param missingRate a \code{numeric} value representing the threshold
#' missing rate at with the SNVs are discarded; the SNVs are retained in the
#' \link[SNPRelate]{snpgdsPCA} function
#' with "<= missingRate" only; if \code{NaN}, no missing threshold.
#' Default: \code{0.025}.
#'
#' @param verbose a \code{logical} indicating if message information should be
#' printed. Default: \code{FALSE}.
#'
#' @return a \code{list} containing the following entries:
#' \describe{
#' \item{sample.id}{ a \code{vector} of \code{character} strings representing
#' the identifiers of the synthetic profiles. }
#' \item{sample1Kg}{ a \code{vector} of \code{character} strings representing
#' the identifiers of the reference 1KG profiles used to generate the
#' synthetic profiles. }
#' \item{sp}{ a \code{vector} of \code{character} strings representing the
#' known ancestry for the reference 1KG profiles used to generate the
#' synthetic profiles. }
#' \item{matKNN}{ a \code{data.frame} containing 4 columns. The first column
#' 'sample.id' contains the name of the synthetic profile. The second column
#' 'D' represents the dimension D used to infer the ancestry. The third column
#' 'K' represents the number of neighbors K used to infer the ancestry. The
#' fourth column 'SuperPop' contains the inferred ancestry. }
#' }
#'
#' @references
#'
#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ,
#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution
#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72.
#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
#'
#' @examples
#'
#' ## Required library
#' library(gdsfmt)
#'
#' ## Load the known ancestry for the demo 1KG reference profiles
#' data(demoKnownSuperPop1KG)
#'
#'
#' # The name of the synthetic study
#' studyID <- "MYDATA.Synthetic"
#'
#' samplesRM <- c("HG00246", "HG00325", "HG00611", "HG01173", "HG02165",
#'     "HG01112", "HG01615", "HG01968", "HG02658", "HG01850", "HG02013",
#'     "HG02465", "HG02974", "HG03814", "HG03445", "HG03689", "HG03789",
#'     "NA12751", "NA19107", "NA18548", "NA19075", "NA19475", "NA19712",
#'     "NA19731", "NA20528", "NA20908")
#' names(samplesRM) <- c("GBR", "FIN", "CHS","PUR", "CDX", "CLM", "IBS",
#'     "PEL", "PJL", "KHV", "ACB", "GWD", "ESN", "BEB", "MSL", "STU", "ITU",
#'     "CEU", "YRI", "CHB", "JPT", "LWK", "ASW", "MXL", "TSI", "GIH")
#'
#' ## Path to the demo Profile GDS file is located in this package
#' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS")
#'
#' ## Open the Profile GDS file
#' gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds"))
#'
#' ## Run a PCA analysis and a K-nearest neighbors analysis on a small set
#' ## of synthetic data
#' results <- computePoolSyntheticAncestryGr(gdsProfile=gdsProfile,
#'     sampleRM=samplesRM, studyIDSyn=studyID, np=1L,
#'     spRef=demoKnownSuperPop1KG,
#'     kList=seq(10,15,1), pcaList=seq(10,15,1), eigenCount=15L)
#'
#' ## The ancestry inference for the synthetic data using
#' ## different K and D values
#' head(results$matKNN)
#'
#' ## Close Profile GDS file (important)
#' closefn.gds(gdsProfile)
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom rlang arg_match
#' @encoding UTF-8
#' @export
computePoolSyntheticAncestryGr <- function(gdsProfile, sampleRM, spRef,
                            studyIDSyn, np=1L,
                            listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"),
                            fieldPopInfAnc="SuperPop",
                            kList=seq(2,15,1),
                            pcaList=seq(2,15,1),
                            algorithm=c("exact", "randomized"),
                            eigenCount=32L, missingRate=0.025, verbose=FALSE) {

    ## Assign default value is kList is NULL
    if(is.null(kList)) {
        kList <- seq(2,15,1)
    }

    ## Assign default value is pcaList is NULL
    if(is.null(pcaList)) {
        pcaList <- seq(2,15,1)
    }

    ## Validate the input parameters
    validateComputePoolSyntheticAncestryGr(gdsProfile=gdsProfile,
        sampleRM=sampleRM, spRef=spRef, studyIDSyn=studyIDSyn, np=np,
        listCatPop=listCatPop, pcaList=pcaList,
        fieldPopInfAnc=fieldPopInfAnc, kList=kList,
        algorithm=algorithm, eigenCount=eigenCount, missingRate=missingRate,
        verbose=verbose)

    ## Set algorithm
    algorithm <- arg_match(algorithm)

    ## Calculate Principal Component Analysis (PCA) on SNV genotype dataset
    ## excluded the selected profiles used to generate the synthetic profiles
    pca1KG <- computePCARefRMMulti(gdsProfile=gdsProfile,
        refProfileIDs=names(spRef), listRM=sampleRM, np=np, algorithm=algorithm,
        eigenCount=eigenCount, missingRate=missingRate, verbose=verbose)

    ## Calculate PCA on the synthetic profiles using 1KG PCA results
    resPCA <- computePCAMultiSynthetic(gdsProfile=gdsProfile, listPCA=pca1KG,
                sampleRef=sampleRM, studyIDSyn=studyIDSyn, verbose=verbose)

    ## Calculate the k-nearest neighbor analyses on a subset of the
    ## synthetic data set
    synthKNN <- computeKNNRefSynthetic(gdsProfile=gdsProfile,
        listEigenvector=resPCA,
        listCatPop=listCatPop, studyIDSyn=studyIDSyn, spRef=spRef,
        fieldPopInfAnc=fieldPopInfAnc, kList=kList, pcaList=pcaList)

    return(synthKNN)
}



#' @title Select the optimal K and D parameters using the synthetic data and
#' infer the ancestry of a specific profile
#'
#' @description The function select the optimal K and D parameters for a
#' specific profile. The results on the synthetic data are used for the
#' parameter selection. Once the optimal parameters are selected, the
#' ancestry is inferred for the specific profile.
#'
#' @param gdsReference an object of class \link[gdsfmt]{gds.class} (a GDS
#' file), the opened 1KG GDS file.
#'
#' @param gdsProfile an object of class \code{\link[gdsfmt]{gds.class}}
#' (a GDS file), the opened Profile GDS file.
#'
#' @param listFiles a \code{vector} of \code{character} strings representing
#' the name of files that contain the results of ancestry inference done on
#' the synthetic profiles for multiple values of _D_ and _K_. The files must
#' exist.
#'
#' @param currentProfile a \code{character} string representing the profile
#' identifier of the current profile on which ancestry will be inferred.
#'
#' @param spRef a \code{vector} of \code{character} strings representing the
#' known super population ancestry for the 1KG profiles. The 1KG profile
#' identifiers are used as names for the \code{vector}.
#'
#' @param studyIDSyn a \code{character} string corresponding to the study
#' identifier. The study identifier must be present in the GDS Sample file.
#'
#' @param np a single positive \code{integer} representing the number of
#' threads. Default: \code{1L}.
#'
#' @param listCatPop a \code{vector} of \code{character} string
#' representing the list of possible ancestry assignations. Default:
#' \code{("EAS", "EUR", "AFR", "AMR", "SAS")}.
#'
#' @param fieldPopIn1KG a \code{character} string representing the name of the
#' column that contains the known ancestry for the reference profiles in
#' the Reference GDS file.
#'
#' @param fieldPopInfAnc a \code{character} string representing the name of
#' the column that will contain the inferred ancestry for the specified
#' profiles. Default: \code{"SuperPop"}.
#'
#' @param kList a \code{vector} of \code{integer} representing  the list of
#' values tested for the  _K_ parameter. The _K_ parameter represents the
#' number of neighbors used in the K-nearest neighbor analysis. If \code{NULL},
#' the value \code{seq(2,15,1)} is assigned.
#' Default: \code{seq(2,15,1)}.
#'
#' @param pcaList a \code{vector} of \code{integer} representing  the list of
#' values tested for the  _D_ parameter. The _D_ parameter represents the
#' number of dimensions used in the PCA analysis.  If \code{NULL},
#' the value \code{seq(2,15,1)} is assigned.
#' Default: \code{seq(2,15,1)}.
#'
#' @param algorithm a \code{character} string representing the algorithm used
#' to calculate the PCA. The 2 choices are "exact" (traditional exact
#' calculation) and "randomized" (fast PCA with randomized algorithm
#' introduced in Galinsky et al. 2016). Default: \code{"exact"}.
#'
#' @param eigenCount a single \code{integer} indicating the number of
#' eigenvectors that will be in the output of the \link[SNPRelate]{snpgdsPCA}
#' function; if 'eigenCount' <= 0, then all eigenvectors are returned.
#' Default: \code{32L}.
#'
#' @param missingRate a \code{numeric} value representing the threshold
#' missing rate at with the SNVs are discarded; the SNVs are retained in the
#' \link[SNPRelate]{snpgdsPCA}
#' with "<= missingRate" only; if \code{NaN}, no missing threshold.
#' Default: \code{NaN}.
#'
#' @param verbose a \code{logical} indicating if messages should be printed
#' to show how the different steps in the function. Default: \code{FALSE}.
#'
#' @return a \code{list} containing 4 entries:
#' \describe{
#' \item{\code{pcaSample}}{ a \code{list} containing the information related
#' to the eigenvectors. The \code{list} contains those 3 entries:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current profile.}
#' \item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing
#' the eigenvectors for the reference profiles.}
#' \item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the
#' eigenvectors for the current profile projected on the PCA from the
#' reference profiles.}
#' }
#' }
#' \item{\code{paraSample}}{ a \code{list} containing the results with
#' different \code{D} and \code{K} values that lead to optimal parameter
#' selection. The \code{list} contains those entries:
#' \describe{
#' \item{\code{dfPCA}}{ a \code{data.frame} containing statistical results
#' on all combined synthetic results done with a fixed value of \code{D} (the
#' number of dimensions). The \code{data.frame} contains those columns:
#' \describe{
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions).}
#' \item{\code{median}}{ a \code{numeric} representing the median of the
#' minimum AUROC obtained (within super populations) for all combination of
#' the fixed \code{D} value and all tested \code{K} values. }
#' \item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum
#' AUROC obtained (within super populations) for all combination of the fixed
#' \code{D} value and all tested \code{K} values. }
#' \item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile
#' of the minimum AUROC obtained (within super populations) for all
#' combination of the fixed \code{D} value and all tested \code{K} values. }
#' \item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value
#' (the number of neighbors) for a fixed \code{D} value. }
#' }
#' }
#' \item{\code{dfPop}}{ a \code{data.frame} containing statistical results on
#' all combined synthetic results done with different values of \code{D} (the
#' number of dimensions) and \code{K} (the number of neighbors).
#' The \code{data.frame} contains those columns:
#' \describe{
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions).}
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors).}
#' \item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy
#' obtained by grouping all the synthetic results by super-populations, for
#' the specified values of \code{D} and \code{K}.}
#' \item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained
#' by grouping all the synthetic results for the specified values of \code{D}
#' and \code{K}.}
#' \item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy
#' of the confusion matrix obtained by grouping all the synthetic results for
#' the specified values of \code{D} and \code{K}.}
#' }
#' }
#' \item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by
#' super-population. The \code{data.frame} contains
#' those columns:
#' \describe{
#' \item{\code{pcaD}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions).}
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors).}
#' \item{\code{Call}}{ a \code{character} string representing the
#' super-population.}
#' \item{\code{L}}{ a \code{numeric} representing the lower value of the 95%
#' confidence interval for the AUROC obtained for the fixed values of
#' super-population, \code{D} and \code{K}.}
#' \item{\code{AUR}}{ a \code{numeric} representing  the AUROC obtained for the
#' fixed values of super-population, \code{D} and \code{K}.}
#' \item{\code{H}}{ a \code{numeric} representing the higher value of the 95%
#' confidence interval for the AUROC obtained for the fixed values of
#' super-population, \code{D} and \code{K}.}
#' }
#' }
#' \item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value
#' (the number of dimensions) for the specific profile.}
#' \item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value
#' (the number of neighbors) for the specific profile.}
#' \item{\code{listD}}{ a \code{numeric} representing the optimal \code{D}
#' values (the number of dimensions) for the specific profile. More than one
#' \code{D} is possible.}
#' }
#' }
#' \item{\code{KNNSample}}{ a \code{list} containing the inferred ancestry
#' using different \code{D} and \code{K} values. The \code{list} contains
#' those entries:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current profile.}
#' \item{\code{matKNN}}{ a \code{data.frame} containing the inferred ancestry
#' for different values of \code{K} and \code{D}. The \code{data.frame}
#' contains those columns:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current profile.}
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions) used to infer the ancestry. }
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors) used to infer the ancestry. }
#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred
#' ancestry for the specified \code{D} and \code{K} values.}
#' }
#' }
#' }
#' }
#' \item{\code{Ancestry}}{ a \code{data.frame} containing the inferred
#' ancestry for the current profile. The \code{data.frame} contains those
#' columns:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current profile.}
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions) used to infer the ancestry.}
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors) used to infer the ancestry.}
#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred
#' ancestry.}
#' }
#' }
#' }
#'
#'
#' @references
#'
#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ,
#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution
#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72.
#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
#'
#' @examples
#'
#'
#' ## Required library
#' library(gdsfmt)
#'
#' ## Load the known ancestry for the demo 1KG reference profiles
#' data(demoKnownSuperPop1KG)
#'
#' ## The Reference GDS file
#' path1KG <- system.file("extdata/tests", package="RAIDS")
#'
#' ## Open the Reference GDS file
#' gdsRef <- snpgdsOpen(file.path(path1KG, "ex1_good_small_1KG.gds"))
#'
#' ## Path to the demo synthetic results files
#' ## List of the KNN result files from PCA run on synthetic data
#' dataDirRes <- system.file("extdata/demoAncestryCall/ex1", package="RAIDS")
#' listFilesName <- dir(file.path(dataDirRes), ".rds")
#' listFiles <- file.path(file.path(dataDirRes) , listFilesName)
#'
#' # The name of the synthetic study
#' studyID <- "MYDATA.Synthetic"
#'
#' ## Path to the demo Profile GDS file is located in this package
#' dataDir <- system.file("extdata/demoAncestryCall", package="RAIDS")
#'
#' ## Open the Profile GDS file
#' gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds"))
#'
#' ## Run the ancestry inference on one profile called 'ex1'
#' ## The values of K and D used for the inference are selected using the
#' ## synthetic results
#' resCall <- computeAncestryFromSyntheticFile(gdsReference=gdsRef,
#'                             gdsProfile=gdsProfile,
#'                             listFiles=listFiles,
#'                             currentProfile=c("ex1"),
#'                             spRef=demoKnownSuperPop1KG,
#'                             studyIDSyn=studyID, np=1L)
#'
#' ## The ancestry called with the optimal D and K values
#' resCall$Ancestry
#'
#' ## Close the GDS files (important)
#' closefn.gds(gdsProfile)
#' closefn.gds(gdsRef)
#'
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom rlang arg_match
#' @encoding UTF-8
#' @export
computeAncestryFromSyntheticFile <- function(gdsReference, gdsProfile,
                            listFiles, currentProfile, spRef,
                            studyIDSyn, np=1L,
                            listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"),
                            fieldPopIn1KG="superPop",
                            fieldPopInfAnc="SuperPop",
                            kList=seq(2, 15, 1),
                            pcaList=seq(2, 15, 1),
                            algorithm=c("exact", "randomized"),
                            eigenCount=32L,
                            missingRate=NaN, verbose=FALSE) {
    ## Set parameters when no values given by user
    if (is.null(pcaList)) {
        pcaList <- seq(2, 15, 1)
    }

    if (is.null(kList)) {
        kList <- seq(2, 15, 1)
    }

    ## Validate input parameters
    validateComputeAncestryFromSyntheticFile(gdsReference=gdsReference,
        gdsProfile=gdsProfile, listFiles=listFiles,
        currentProfile=currentProfile, spRef=spRef, studyIDSyn=studyIDSyn,
        np=np, listCatPop=listCatPop, fieldPopIn1KG=fieldPopIn1KG,
        fieldPopInfAnc=fieldPopInfAnc, kList=kList, pcaList=pcaList,
        algorithm=algorithm, eigenCount=eigenCount, missingRate=missingRate,
        verbose=verbose)

    ## Matches a character method against a table of candidate values
    algorithm <- arg_match(algorithm)

    ## Merge results from PCA run on synthetic data present in RDS files
    KNN.list <- list()
    for(j in seq_len(length(listFiles))) {
        if (file.exists(listFiles[j])) {
            KNN.list[[j]] <- readRDS(listFiles[j])
        } else {
            stop("The file \'", listFiles[j] ,"\' does not exist.")
        }
    }
    resultsKNN <- do.call(rbind, KNN.list)

    ## Extract the super-population information from the 1KG GDS file
    ## for profiles associated to the synthetic study
    pedSyn <- prepPedSynthetic1KG(gdsReference=gdsReference,
        gdsSample=gdsProfile, studyID=studyIDSyn, popName=fieldPopIn1KG)

    ## Compile all the inferred ancestry results for different values of
    ## D and K to select the optimal parameters
    listParaSample <- selParaPCAUpQuartile(matKNN=resultsKNN,
        pedCall=pedSyn, refCall=fieldPopIn1KG, predCall=fieldPopInfAnc,
        listCall=listCatPop)

    ## Project profile on the PCA created with the reference profiles
    listPCAProfile <- computePCARefSample(gdsProfile=gdsProfile,
        currentProfile=currentProfile, studyIDRef="Ref.1KG", np=np,
        algorithm=algorithm, eigenCount=eigenCount, missingRate=missingRate,
        verbose=verbose)

    ## Run a k-nearest neighbors analysis on one specific profile
    listKNNSample <- computeKNNRefSample(listEigenvector=listPCAProfile,
        listCatPop=listCatPop, spRef=spRef, fieldPopInfAnc=fieldPopInfAnc,
        kList=kList, pcaList=pcaList)

    ## The ancestry call for the current profile
    resCall <- listKNNSample$matKNN[
        which(listKNNSample$matKNN$D == listParaSample$D &
                        listKNNSample$matKNN$K == listParaSample$K ),]

    res <- list(pcaSample=listPCAProfile, # PCA of the profile + 1KG
                paraSample=listParaSample, # Result of the parameter selection
                KNNSample=listKNNSample, # KNN for the profile
                Ancestry=resCall) # the ancestry call fo the profile

    return(res)
}


#' @title Run most steps leading to the ancestry inference call on a specific
#' exome profile
#'
#' @description This function runs most steps leading to the ancestry inference
#' call on a specific exome profile. First, the function creates the
#' Profile GDS file for the specific profile using the information from a
#' RDS Sample description file and the Population reference GDS file.
#'
#' @param pedStudy a \code{data.frame} with those mandatory columns: "Name.ID",
#' "Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in
#' \code{character} strings (no factor). The \code{data.frame}
#' must contain the information for all the samples passed in the
#' \code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy}
#' can be defined.
#'
#' @param studyDF a \code{data.frame} containing the information about the
#' study associated to the analysed sample(s). The \code{data.frame} must have
#' those 3 columns: "study.id", "study.desc", "study.platform". All columns
#' must be in \code{character} strings (no factor).
#'
#' @param pathProfileGDS a \code{character} string representing the path to
#' the directory where the GDS Profile files will be created.
#' Default: \code{NULL}.
#'
#' @param pathGeno a \code{character} string representing the path to the
#' directory containing the VCF output of SNP-pileup for each sample. The
#' SNP-pileup files must be compressed (gz files) and have the name identifiers
#' of the samples. A sample with "Name.ID" identifier would have an
#' associated file called
#' if genoSource is "VCF", then "Name.ID.vcf.gz",
#' if genoSource is "generic", then "Name.ID.generic.txt.gz"
#' if genoSource is "snp-pileup", then "Name.ID.txt.gz".
#'
#' @param pathOut a \code{character} string representing the path to
#' the directory where the output files are created.
#'
#' @param fileReferenceGDS  a \code{character} string representing the file
#' name of the Reference GDS file. The file must exist.
#'
#' @param fileReferenceAnnotGDS a \code{character} string representing the
#' file name of the Population Reference GDS Annotation file. The file must
#' exist.
#'
#' @param chrInfo a \code{vector} of positive \code{integer} values
#' representing the length of the chromosomes. See 'details' section.
#'
#' @param syntheticRefDF a \code{data.frame} containing a subset of
#' reference profiles for each sub-population present in the Reference GDS
#' file. The \code{data.frame} must have those columns:
#' \describe{
#' \item{sample.id}{ a \code{character} string representing the sample
#' identifier. }
#' \item{pop.group}{ a \code{character} string representing the
#' subcontinental population assigned to the sample. }
#' \item{superPop}{ a \code{character} string representing the
#' super-population assigned to the sample. }
#' }
#'
#' @param genoSource a \code{character} string with two possible values:
#' 'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files
#' are generated by snp-pileup (Facets) or are a generic format CSV file
#' with at least those columns:
#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'.
#' The 'Count' is the depth at the specified position;
#' 'FileR' is the depth of the reference allele and
#' 'File1A' is the depth of the specific alternative allele.
#' Finally the file can be a VCF file with at least those genotype
#' fields: GT, AD, DP.
#'
#' @param np a single positive \code{integer} specifying the number of
#' threads to be used. Default: \code{1L}.
#'
#' @param verbose a \code{logical} indicating if messages should be printed
#' to show how the different steps in the function. Default: \code{FALSE}.
#'
#' @return The integer \code{0L} when successful. See details section for
#' more information about the generated output files.
#'
#' @details
#'
#' The runExomeAncestry() function generates 3 types of files
#' in the OUTPUT directory.
#' \describe{
#' \item{Ancestry Inference}{ The ancestry inference CSV file
#' (".Ancestry.csv" file)}
#' \item{Inference Informaton}{ The inference information RDS file
#' (".infoCall.rds" file)}
#' \item{Synthetic Information}{ The parameter information RDS files
#' from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)}
#' }
#'
#' In addition, a sub-directory (named using the profile ID) is
#' also created.
#'
#' @references
#'
#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ,
#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution
#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72.
#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
#'
#' @examples
#'
#' ## Required library for GDS
#' library(SNPRelate)
#'
#' ## Path to the demo 1KG GDS file is located in this package
#' dataDir <- system.file("extdata", package="RAIDS")
#'
#' #################################################################
#' ## Load the information about the profile
#' #################################################################
#' data(demoPedigreeEx1)
#' head(demoPedigreeEx1)
#'
#' #################################################################
#' ## The 1KG GDS file and the 1KG SNV Annotation GDS file
#' ## need to be located in the same directory
#' ## Note that the 1KG GDS file used for this example is a
#' ## simplified version and CANNOT be used for any real analysis
#' #################################################################
#' path1KG <- file.path(dataDir, "tests")
#'
#' fileReferenceGDS  <- file.path(path1KG, "ex1_good_small_1KG.gds")
#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds")
#'
#' #################################################################
#' ## The Sample SNP pileup files (one per sample) need
#' ## to be located in the same directory.
#' #################################################################
#' pathGeno <- file.path(dataDir, "example", "snpPileup")
#'
#' #################################################################
#' ## The path where the Profile GDS Files (one per sample)
#' ## will be created need to be specified.
#' #################################################################
#' pathProfileGDS <- file.path(tempdir(), "out.tmp")
#'
#' pathOut <- file.path(tempdir(), "res.out")
#'
#' #################################################################
#' ## A data frame containing general information about the study
#' ## is also required. The data frame must have
#' ## those 3 columns: "studyID", "study.desc", "study.platform"
#' #################################################################
#' studyDF <- data.frame(study.id="MYDATA",
#'                         study.desc="Description",
#'                         study.platform="PLATFORM",
#'                         stringsAsFactors=FALSE)
#'
#' ####################################################################
#' ## Fix seed to ensure reproducible results
#' ####################################################################
#' set.seed(3043)
#'
#' gds1KG <- snpgdsOpen(fileReferenceGDS)
#' dataRef <- select1KGPop(gds1KG, nbProfiles=2L)
#' closefn.gds(gds1KG)
#'
#' ## Required library for this example to run correctly
#' if (requireNamespace("Seqinfo", quietly=TRUE) &&
#'      requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) {
#'
#'     ## Chromosome length information
#'     ## chr23 is chrX, chr24 is chrY and chrM is 25
#'     chrInfo <- Seqinfo::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25]
#'
#'     \donttest{
#'
#'         runExomeAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF,
#'             pathProfileGDS=pathProfileGDS,
#'             pathGeno=pathGeno,
#'             pathOut=pathOut,
#'             fileReferenceGDS=fileReferenceGDS,
#'             fileReferenceAnnotGDS=fileAnnotGDS,
#'             chrInfo=chrInfo,
#'             syntheticRefDF=dataRef,
#'             genoSource="snp-pileup")
#'
#'         unlink(pathProfileGDS, recursive=TRUE, force=TRUE)
#'         unlink(pathOut, recursive=TRUE, force=TRUE)
#'     }
#' }
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom utils write.csv
#' @importFrom rlang arg_match
#' @encoding UTF-8
#' @export
runExomeAncestry <- function(pedStudy, studyDF, pathProfileGDS,
                    pathGeno, pathOut, fileReferenceGDS, fileReferenceAnnotGDS,
                    chrInfo, syntheticRefDF,
                    genoSource=c("snp-pileup", "generic", "VCF"), np=1L,
                    verbose=FALSE) {

    ## Validate parameters
    validateRunExomeOrRNAAncestry(pedStudy=pedStudy, studyDF=studyDF,
        pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut=pathOut,
        fileReferenceGDS=fileReferenceGDS,
        fileReferenceAnnotGDS=fileReferenceAnnotGDS, chrInfo=chrInfo,
        syntheticRefDF=syntheticRefDF, genoSource=genoSource, verbose=verbose)

    genoSource <- arg_match(genoSource)

    r <- runWrapperAncestry(pedStudy, studyDF, pathProfileGDS,
        pathGeno, pathOut, fileReferenceGDS, fileReferenceAnnotGDS,
        chrInfo, syntheticRefDF, genoSource, studyType="DNA", np=np, verbose)

    ## Successful
    return(r)
}

#' @title Run most steps leading to the ancestry inference call on a specific
#' DNA profile
#'
#' @description This function runs most steps leading to the ancestry inference
#' call on a specific RNA profile. First, the function creates the
#' Profile GDS file for the specific profile using the information from a
#' RDS Sample description file and the Population Reference GDS file.
#'
#' @param profileFile a \code{character} string representing the path and the
#' file name of the genotype file or the bam if genoSource is snp-pileup the
#' fine extension must be .txt.gz, if VCF the extension must be .vcf.gz
#'
#' @param pathProfileGDS a \code{character} string representing the path to
#' the directory where the GDS Profile files will be created.
#' Default: \code{NULL}.
#'
#' @param fileReferenceGDS  a \code{character} string representing the file
#' name of the Population Reference GDS file. The file must exist.
#'
#' @param fileReferenceAnnotGDS a \code{character} string representing the
#' file name of the Population Reference GDS Annotation file. The file
#' must exist.
#'
#' @param chrInfo a \code{vector} of positive \code{integer} values
#' representing the length of the chromosomes. See 'details' section.
#'
#' @param syntheticRefDF a \code{data.frame} containing a subset of
#' reference profiles for each sub-population present in the Reference GDS
#' file. The \code{data.frame} must have those columns:
#' \describe{
#' \item{sample.id}{ a \code{character} string representing the sample
#' identifier. }
#' \item{pop.group}{ a \code{character} string representing the
#' subcontinental population assigned to the sample. }
#' \item{superPop}{ a \code{character} string representing the
#' super-population assigned to the sample. }
#' }
#'
#' @param genoSource a \code{character} string with four possible values:
#' 'snp-pileup', 'generic', 'VCF' or 'bam'. It specifies if the genotype files
#' are generated by snp-pileup (Facets) or are a generic format CSV file
#' with at least those columns:
#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'.
#' The 'Count' is the depth at the specified position;
#' 'FileR' is the depth of the reference allele and
#' 'File1A' is the depth of the specific alternative allele.
#' Finally the file can be a VCF file with at least those genotype
#' fields: GT, AD, DP.
#'
#' @param np a single positive \code{integer} specifying the number of
#' threads to be used. Default: \code{1L}.
#'
#' @param verbose a \code{logical} indicating if messages should be printed
#' to show how the different steps in the function. Default: \code{FALSE}.
#'
#' @return a \code{list} containing 4 entries:
#' \describe{
#' \item{\code{pcaSample}}{ a \code{list} containing the information related
#' to the eigenvectors. The \code{list} contains those 3 entries:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current profile.}
#' \item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing
#' the eigenvectors for the reference profiles.}
#' \item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the
#' eigenvectors for the current profile projected on the PCA from the
#' reference profiles.}
#' }
#' }
#' \item{\code{paraSample}}{ a \code{list} containing the results with
#' different \code{D} and \code{K} values that lead to optimal parameter
#' selection. The \code{list} contains those entries:
#' \describe{
#' \item{\code{dfPCA}}{ a \code{data.frame} containing statistical results
#' on all combined synthetic results done with a fixed value of \code{D} (the
#' number of dimensions). The \code{data.frame} contains those columns:
#' \describe{
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions).}
#' \item{\code{median}}{ a \code{numeric} representing the median of the
#' minimum AUROC obtained (within super populations) for all combination of
#' the fixed \code{D} value and all tested \code{K} values. }
#' \item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum
#' AUROC obtained (within super populations) for all combination of the fixed
#' \code{D} value and all tested \code{K} values. }
#' \item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile
#' of the minimum AUROC obtained (within super populations) for all
#' combination of the fixed \code{D} value and all tested \code{K} values. }
#' \item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value
#' (the number of neighbors) for a fixed \code{D} value. }
#' }
#' }
#' \item{\code{dfPop}}{ a \code{data.frame} containing statistical results on
#' all combined synthetic results done with different values of \code{D} (the
#' number of dimensions) and \code{K} (the number of neighbors).
#' The \code{data.frame} contains those columns:
#' \describe{
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions).}
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors).}
#' \item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy
#' obtained by grouping all the synthetic results by super-populations, for
#' the specified values of \code{D} and \code{K}.}
#' \item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained
#' by grouping all the synthetic results for the specified values of \code{D}
#' and \code{K}.}
#' \item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy
#' of the confusion matrix obtained by grouping all the synthetic results for
#' the specified values of \code{D} and \code{K}.}
#' }
#' }
#' \item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by
#' super-population. The \code{data.frame} contains
#' those columns:
#' \describe{
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions).}
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors).}
#' \item{\code{Call}}{ a \code{character} string representing the
#' super-population.}
#' \item{\code{L}}{ a \code{numeric} representing the lower value of the 95%
#' confidence interval for the AUROC obtained for the fixed values of
#' super-population, \code{D} and \code{K}.}
#' \item{\code{AUROC}}{ a \code{numeric} representing  the AUROC obtained for the
#' fixed values of super-population, \code{D} and \code{K}.}
#' \item{\code{H}}{ a \code{numeric} representing the higher value of the 95%
#' confidence interval for the AUROC obtained for the fixed values of
#' super-population, \code{D} and \code{K}.}
#' }
#' }
#' \item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value
#' (the number of dimensions) for the specific profile.}
#' \item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value
#' (the number of neighbors) for the specific profile.}
#' \item{\code{listD}}{ a \code{numeric} representing the optimal \code{D}
#' values (the number of dimensions) for the specific profile. More than one
#' \code{D} is possible.}
#' }
#' }
#' \item{\code{KNNSample}}{  a \code{data.frame} containing the inferred ancestry
#' for different values of \code{K} and \code{D}. The \code{data.frame}
#' contains those columns:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current profile.}
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions) used to infer the ancestry. }
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors) used to infer the ancestry. }
#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred
#' ancestry for the specified \code{D} and \code{K} values.}
#' }
#' }
#' \item{\code{KNNSynthetic}}{  a \code{data.frame} containing the inferred ancestry
#' for each synthetic data for different values of \code{K} and \code{D}.
#' The \code{data.frame}
#' contains those columns: "sample.id", "D", "K", "infer.superPop", "ref.superPop"
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current synthetic data.}
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions) used to infer the ancestry. }
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors) used to infer the ancestry. }
#' \item{\code{infer.superPop}}{ a \code{character} string representing the inferred
#' ancestry for the specified \code{D} and \code{K} values.}
#' \item{\code{ref.superPop}}{ a \code{character} string representing the known
#' ancestry from the reference}
#' }
#' }
#' \item{\code{Ancestry}}{ a \code{data.frame} containing the inferred
#' ancestry for the current profile. The \code{data.frame} contains those
#' columns:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current profile.}
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions) used to infer the ancestry.}
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors) used to infer the ancestry.}
#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred
#' ancestry.}
#' }
#' }
#' }
#'
#' @references
#'
#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ,
#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution
#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72.
#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
#'
#' @examples
#'
#' ## Required library for GDS
#' library(SNPRelate)
#'
#' ## Path to the demo 1KG GDS file is located in this package
#' dataDir <- system.file("extdata", package="RAIDS")
#'
#' #################################################################
#' ## The 1KG GDS file and the 1KG SNV Annotation GDS file
#' ## need to be located in the same directory
#' ## Note that the 1KG GDS file used for this example is a
#' ## simplified version and CANNOT be used for any real analysis
#' #################################################################
#' path1KG <- file.path(dataDir, "tests")
#'
#' fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds")
#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds")
#'
#' #################################################################
#' ## The Sample SNP pileup files (one per sample) need
#' ## to be located in the same directory.
#' #################################################################
#' demoProfileEx1 <- file.path(dataDir, "example", "snpPileup", "ex1.txt.gz")
#'
#' #################################################################
#' ## The path where the Profile GDS Files (one per sample)
#' ## will be created need to be specified.
#' #################################################################
#' pathProfileGDS <- file.path(tempdir(), "out.tmp")
#'
#' ####################################################################
#' ## Fix seed to ensure reproducible results
#' ####################################################################
#' set.seed(3043)
#'
#' gds1KG <- snpgdsOpen(fileReferenceGDS)
#' dataRef <- select1KGPop(gds1KG, nbProfiles=2L)
#' closefn.gds(gds1KG)
#'
#' ## Required library for this example to run correctly
#' if (requireNamespace("Seqinfo", quietly=TRUE) &&
#'      requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) {
#'
#'     ## Chromosome length information
#'     ## chr23 is chrX, chr24 is chrY and chrM is 25
#'     chrInfo <- Seqinfo::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25]
#'
#'     \donttest{
#'
#'         res <- inferAncestry(profileFile=demoProfileEx1,
#'             pathProfileGDS=pathProfileGDS,
#'             fileReferenceGDS=fileReferenceGDS,
#'             fileReferenceAnnotGDS=fileAnnotGDS,
#'             chrInfo=chrInfo,
#'             syntheticRefDF=dataRef,
#'             genoSource="snp-pileup")
#'
#'         unlink(pathProfileGDS, recursive=TRUE, force=TRUE)
#'
#'     }
#' }
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom utils write.csv
#' @importFrom rlang arg_match
#' @encoding UTF-8
#' @export
inferAncestry <- function(profileFile, pathProfileGDS,
                    fileReferenceGDS, fileReferenceAnnotGDS,
                    chrInfo, syntheticRefDF,
                    genoSource=c("snp-pileup", "generic", "VCF", "bam"),
                    np=1L, verbose=FALSE) {

    profileBaseName <- basename(profileFile)
    pathGeno <- dirname(profileFile)

    genoSource <- arg_match(genoSource)

    ## BAM format is not yet implemented
    # if (genoSource == "bam") {
    #     stop("The bam is not release yet look to get a \'Devel\' version ",
    #             "or contact us")
    # }

    ## Extract the name of the profile(s)
    profileName <- gsub("\\.gz$", "", profileBaseName, ignore.case=TRUE)
    for (extCur in c( "\\.vcf$", "\\.txt$", "\\.bam", "\\.tsv", "\\.csv")) {
        profileName <- gsub(extCur, "", profileName, ignore.case = TRUE)
    }

    ## Create required data frames
    studyDF <- data.frame(study.id="NotDef", study.desc="NotDef",
                            study.platform="NotDef", stringsAsFactors=FALSE)
    pedStudy <- data.frame(Name.ID=c(profileName), Case.ID=c(profileName),
                            Sample.Type=c("DNA"), Diagnosis="NotDef",
                            Source=c("ENotDef"), stringsAsFactors=FALSE)
    row.names(pedStudy) <- pedStudy$Name.ID

    ## Validate parameters
    validateRunExomeOrRNAAncestry(pedStudy=pedStudy, studyDF=studyDF,
        pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut="./",
        fileReferenceGDS=fileReferenceGDS,
        fileReferenceAnnotGDS=fileReferenceAnnotGDS, chrInfo=chrInfo,
        syntheticRefDF=syntheticRefDF, genoSource=genoSource, verbose=verbose)

    ## Run ancestry inference
    if (genoSource %in% c("snp-pileup", "generic", "VCF", "bam")) {
        r <- wrapperAncestry(pedStudy, studyDF, pathProfileGDS,
                profileFile, fileReferenceGDS, fileReferenceAnnotGDS,
                chrInfo, syntheticRefDF, genoSource=genoSource, studyType="LD", np=np,
                verbose=verbose)
    }else{
        stop(paste0("The format ", genoSource," is not implemented yet\n"))
    }

    ## Successful
    return(r)
}

#' @title Run most steps leading to the ancestry inference call on a specific
#' DNA profile (alias for inferAncestry )
#'
#' @description This function runs most steps leading to the ancestry inference
#' call on a specific RNA profile. First, the function creates the
#' Profile GDS file for the specific profile using the information from a
#' RDS Sample description file and the Population Reference GDS file.
#'
#' @param profileFile a \code{character} string representing the path and the
#' file name of the genotype file or the bam if genoSource is snp-pileup the
#' fine extension must be .txt.gz, if VCF the extension must be .vcf.gz
#'
#' @param pathProfileGDS a \code{character} string representing the path to
#' the directory where the GDS Profile files will be created.
#' Default: \code{NULL}.
#'
#' @param fileReferenceGDS  a \code{character} string representing the file
#' name of the Population Reference GDS file. The file must exist.
#'
#' @param fileReferenceAnnotGDS a \code{character} string representing the
#' file name of the Population Reference GDS Annotation file. The file
#' must exist.
#'
#' @param chrInfo a \code{vector} of positive \code{integer} values
#' representing the length of the chromosomes. See 'details' section.
#'
#' @param syntheticRefDF a \code{data.frame} containing a subset of
#' reference profiles for each sub-population present in the Reference GDS
#' file. The \code{data.frame} must have those columns:
#' \describe{
#' \item{sample.id}{ a \code{character} string representing the sample
#' identifier. }
#' \item{pop.group}{ a \code{character} string representing the
#' subcontinental population assigned to the sample. }
#' \item{superPop}{ a \code{character} string representing the
#' super-population assigned to the sample. }
#' }
#'
#' @param genoSource a \code{character} string with four possible values:
#' 'snp-pileup', 'generic', 'VCF' or 'bam'. It specifies if the genotype files
#' are generated by snp-pileup (Facets) or are a generic format CSV file
#' with at least those columns:
#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'.
#' The 'Count' is the depth at the specified position;
#' 'FileR' is the depth of the reference allele and
#' 'File1A' is the depth of the specific alternative allele.
#' Finally the file can be a VCF file with at least those genotype
#' fields: GT, AD, DP.
#'
#' @param np a single positive \code{integer} specifying the number of
#' threads to be used. Default: \code{1L}.
#'
#' @param verbose a \code{logical} indicating if messages should be printed
#' to show how the different steps in the function. Default: \code{FALSE}.
#'
#' @return a \code{list} containing 4 entries:
#' \describe{
#' \item{\code{pcaSample}}{ a \code{list} containing the information related
#' to the eigenvectors. The \code{list} contains those 3 entries:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current profile.}
#' \item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing
#' the eigenvectors for the reference profiles.}
#' \item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the
#' eigenvectors for the current profile projected on the PCA from the
#' reference profiles.}
#' }
#' }
#' \item{\code{paraSample}}{ a \code{list} containing the results with
#' different \code{D} and \code{K} values that lead to optimal parameter
#' selection. The \code{list} contains those entries:
#' \describe{
#' \item{\code{dfPCA}}{ a \code{data.frame} containing statistical results
#' on all combined synthetic results done with a fixed value of \code{D} (the
#' number of dimensions). The \code{data.frame} contains those columns:
#' \describe{
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions).}
#' \item{\code{median}}{ a \code{numeric} representing the median of the
#' minimum AUROC obtained (within super populations) for all combination of
#' the fixed \code{D} value and all tested \code{K} values. }
#' \item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum
#' AUROC obtained (within super populations) for all combination of the fixed
#' \code{D} value and all tested \code{K} values. }
#' \item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile
#' of the minimum AUROC obtained (within super populations) for all
#' combination of the fixed \code{D} value and all tested \code{K} values. }
#' \item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value
#' (the number of neighbors) for a fixed \code{D} value. }
#' }
#' }
#' \item{\code{dfPop}}{ a \code{data.frame} containing statistical results on
#' all combined synthetic results done with different values of \code{D} (the
#' number of dimensions) and \code{K} (the number of neighbors).
#' The \code{data.frame} contains those columns:
#' \describe{
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions).}
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors).}
#' \item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy
#' obtained by grouping all the synthetic results by super-populations, for
#' the specified values of \code{D} and \code{K}.}
#' \item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained
#' by grouping all the synthetic results for the specified values of \code{D}
#' and \code{K}.}
#' \item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy
#' of the confusion matrix obtained by grouping all the synthetic results for
#' the specified values of \code{D} and \code{K}.}
#' }
#' }
#' \item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by
#' super-population. The \code{data.frame} contains
#' those columns:
#' \describe{
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions).}
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors).}
#' \item{\code{Call}}{ a \code{character} string representing the
#' super-population.}
#' \item{\code{L}}{ a \code{numeric} representing the lower value of the 95%
#' confidence interval for the AUROC obtained for the fixed values of
#' super-population, \code{D} and \code{K}.}
#' \item{\code{AUROC}}{ a \code{numeric} representing  the AUROC obtained for the
#' fixed values of super-population, \code{D} and \code{K}.}
#' \item{\code{H}}{ a \code{numeric} representing the higher value of the 95%
#' confidence interval for the AUROC obtained for the fixed values of
#' super-population, \code{D} and \code{K}.}
#' }
#' }
#' \item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value
#' (the number of dimensions) for the specific profile.}
#' \item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value
#' (the number of neighbors) for the specific profile.}
#' \item{\code{listD}}{ a \code{numeric} representing the optimal \code{D}
#' values (the number of dimensions) for the specific profile. More than one
#' \code{D} is possible.}
#' }
#' }
#' \item{\code{KNNSample}}{  a \code{data.frame} containing the inferred ancestry
#' for different values of \code{K} and \code{D}. The \code{data.frame}
#' contains those columns:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current profile.}
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions) used to infer the ancestry. }
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors) used to infer the ancestry. }
#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred
#' ancestry for the specified \code{D} and \code{K} values.}
#' }
#' }
#' \item{\code{KNNSynthetic}}{  a \code{data.frame} containing the inferred ancestry
#' for each synthetic data for different values of \code{K} and \code{D}.
#' The \code{data.frame}
#' contains those columns: "sample.id", "D", "K", "infer.superPop", "ref.superPop"
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current synthetic data.}
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions) used to infer the ancestry. }
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors) used to infer the ancestry. }
#' \item{\code{infer.superPop}}{ a \code{character} string representing the inferred
#' ancestry for the specified \code{D} and \code{K} values.}
#' \item{\code{ref.superPop}}{ a \code{character} string representing the known
#' ancestry from the reference}
#' }
#' }
#' \item{\code{Ancestry}}{ a \code{data.frame} containing the inferred
#' ancestry for the current profile. The \code{data.frame} contains those
#' columns:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current profile.}
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions) used to infer the ancestry.}
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors) used to infer the ancestry.}
#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred
#' ancestry.}
#' }
#' }
#' }
#'
#' @references
#'
#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ,
#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution
#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72.
#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
#'
#' @examples
#'
#' ## Required library for GDS
#' library(SNPRelate)
#'
#' ## Path to the demo 1KG GDS file is located in this package
#' dataDir <- system.file("extdata", package="RAIDS")
#'
#' #################################################################
#' ## The 1KG GDS file and the 1KG SNV Annotation GDS file
#' ## need to be located in the same directory
#' ## Note that the 1KG GDS file used for this example is a
#' ## simplified version and CANNOT be used for any real analysis
#' #################################################################
#' path1KG <- file.path(dataDir, "tests")
#'
#' fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds")
#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds")
#'
#' #################################################################
#' ## The Sample SNP pileup files (one per sample) need
#' ## to be located in the same directory.
#' #################################################################
#' demoProfileEx1 <- file.path(dataDir, "example", "snpPileup", "ex1.txt.gz")
#'
#' #################################################################
#' ## The path where the Profile GDS Files (one per sample)
#' ## will be created need to be specified.
#' #################################################################
#' pathProfileGDS <- file.path(tempdir(), "out.tmp")
#'
#' ####################################################################
#' ## Fix seed to ensure reproducible results
#' ####################################################################
#' set.seed(3043)
#'
#' gds1KG <- snpgdsOpen(fileReferenceGDS)
#' dataRef <- select1KGPop(gds1KG, nbProfiles=2L)
#' closefn.gds(gds1KG)
#'
#' ## Required library for this example to run correctly
#' if (requireNamespace("Seqinfo", quietly=TRUE) &&
#'      requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) {
#'
#'     ## Chromosome length information
#'     ## chr23 is chrX, chr24 is chrY and chrM is 25
#'     chrInfo <- Seqinfo::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25]
#'
#'     \donttest{
#'
#'         res <- inferAncestryDNA(profileFile=demoProfileEx1,
#'             pathProfileGDS=pathProfileGDS,
#'             fileReferenceGDS=fileReferenceGDS,
#'             fileReferenceAnnotGDS=fileAnnotGDS,
#'             chrInfo=chrInfo,
#'             syntheticRefDF=dataRef,
#'             genoSource="snp-pileup")
#'
#'         unlink(pathProfileGDS, recursive=TRUE, force=TRUE)
#'
#'     }
#' }
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom utils write.csv
#' @importFrom rlang arg_match
#' @encoding UTF-8
#' @export
inferAncestryDNA <- function(profileFile, pathProfileGDS,
                          fileReferenceGDS, fileReferenceAnnotGDS,
                          chrInfo, syntheticRefDF,
                          genoSource=c("snp-pileup", "generic", "VCF", "bam"),
                          np=1L, verbose=FALSE) {

    return(inferAncestry(profileFile=profileFile, pathProfileGDS=pathProfileGDS,
                            fileReferenceGDS=fileReferenceGDS,
                            fileReferenceAnnotGDS=fileReferenceAnnotGDS,
                            chrInfo=chrInfo,
                            syntheticRefDF=syntheticRefDF,
                            genoSource=genoSource,
                            np=np, verbose=verbose))
}


#' @title Run most steps leading to the ancestry inference call on a specific
#' RNA profile
#'
#' @description This function runs most steps leading to the ancestry inference
#' call on a specific RNA profile. First, the function creates the
#' Profile GDS file for the specific profile using the information from a
#' RDS Sample description file and the Population Reference GDS file.
#'
#' @param pedStudy a \code{data.frame} with those mandatory columns: "Name.ID",
#' "Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in
#' \code{character} strings (no factor). The \code{data.frame}
#' must contain the information for all the samples passed in the
#' \code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy}
#' can be defined.
#'
#' @param studyDF a \code{data.frame} containing the information about the
#' study associated to the analysed sample(s). The \code{data.frame} must have
#' those 3 columns: "study.id", "study.desc", "study.platform". All columns
#' must be in \code{character} strings (no factor).
#'
#' @param pathProfileGDS a \code{character} string representing the path to
#' the directory where the GDS Profile files will be created.
#' Default: \code{NULL}.
#'
#' @param pathGeno a \code{character} string representing the path to the
#' directory containing the VCF output of SNP-pileup for each sample. The
#' SNP-pileup files must be compressed (gz files) and have the name identifiers
#' of the samples. A sample with "Name.ID" identifier would have an
#' associated file called
#' if genoSource is "VCF", then "Name.ID.vcf.gz",
#' if genoSource is "generic", then "Name.ID.generic.txt.gz"
#' if genoSource is "snp-pileup", then "Name.ID.txt.gz".
#'
#' @param pathOut a \code{character} string representing the path to
#' the directory where the output files are created.
#'
#' @param fileReferenceGDS  a \code{character} string representing the file
#' name of the Population Reference GDS file. The file must exist.
#'
#' @param fileReferenceAnnotGDS a \code{character} string representing the
#' file name of the Population Reference GDS Annotation file. The file
#' must exist.
#'
#' @param chrInfo a \code{vector} of positive \code{integer} values
#' representing the length of the chromosomes. See 'details' section.
#'
#' @param syntheticRefDF a \code{data.frame} containing a subset of
#' reference profiles for each sub-population present in the Reference GDS
#' file. The \code{data.frame} must have those columns:
#' \describe{
#' \item{sample.id}{ a \code{character} string representing the sample
#' identifier. }
#' \item{pop.group}{ a \code{character} string representing the
#' subcontinental population assigned to the sample. }
#' \item{superPop}{ a \code{character} string representing the
#' super-population assigned to the sample. }
#' }
#'
#' @param genoSource a \code{character} string with two possible values:
#' 'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files
#' are generated by snp-pileup (Facets) or are a generic format CSV file
#' with at least those columns:
#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'.
#' The 'Count' is the depth at the specified position;
#' 'FileR' is the depth of the reference allele and
#' 'File1A' is the depth of the specific alternative allele.
#' Finally the file can be a VCF file with at least those genotype
#' fields: GT, AD, DP.
#'
#' @param np a single positive \code{integer} specifying the number of
#' threads to be used. Default: \code{1L}.
#'
#' @param blockTypeID a \code{character} string corresponding to the block
#' type used to extract the block identifiers. The block type must be
#' present in the GDS Reference Annotation file.
#'
#' @param verbose a \code{logical} indicating if messages should be printed
#' to show how the different steps in the function. Default: \code{FALSE}.
#'
#' @return The integer \code{0L} when successful. See details section for
#' more information about the generated output files.
#'
#' @details
#'
#' The runExomeAncestry() function generates 3 types of files
#' in the OUTPUT directory.
#' \describe{
#' \item{Ancestry Inference}{ The ancestry inference CSV file
#' (".Ancestry.csv" file)}
#' \item{Inference Informaton}{ The inference information RDS file
#' (".infoCall.rds" file)}
#' \item{Synthetic Information}{ The parameter information RDS files
#' from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)}
#' }
#'
#' In addition, a sub-directory (named using the profile ID) is
#' also created.
#'
#' @references
#'
#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ,
#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution
#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72.
#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
#'
#' @examples
#'
#' ## Required library for GDS
#' library(SNPRelate)
#'
#' ## Path to the demo 1KG GDS file is located in this package
#' dataDir <- system.file("extdata", package="RAIDS")
#'
#' #################################################################
#' ## Load the information about the profile
#' #################################################################
#' data(demoPedigreeEx1)
#' head(demoPedigreeEx1)
#'
#' #################################################################
#' ## The 1KG GDS file and the 1KG SNV Annotation GDS file
#' ## need to be located in the same directory
#' ## Note that the 1KG GDS file used for this example is a
#' ## simplified version and CANNOT be used for any real analysis
#' #################################################################
#' path1KG <- file.path(dataDir, "tests")
#'
#' fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds")
#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds")
#'
#' #################################################################
#' ## The Sample SNP pileup files (one per sample) need
#' ## to be located in the same directory.
#' #################################################################
#' pathGeno <- file.path(dataDir, "example", "snpPileup")
#'
#' #################################################################
#' ## The path where the Profile GDS Files (one per sample)
#' ## will be created need to be specified.
#' #################################################################
#' pathProfileGDS <- file.path(tempdir(), "out.tmp")
#'
#' pathOut <- file.path(tempdir(), "res.out")
#'
#' #################################################################
#' ## A data frame containing general information about the study
#' ## is also required. The data frame must have
#' ## those 3 columns: "studyID", "study.desc", "study.platform"
#' #################################################################
#' studyDF <- data.frame(study.id="MYDATA",
#'                         study.desc="Description",
#'                         study.platform="PLATFORM",
#'                         stringsAsFactors=FALSE)
#'
#' ####################################################################
#' ## Fix seed to ensure reproducible results
#' ####################################################################
#' set.seed(3043)
#'
#' gds1KG <- snpgdsOpen(fileReferenceGDS)
#' dataRef <- select1KGPop(gds1KG, nbProfiles=2L)
#' closefn.gds(gds1KG)
#'
#' ## Required library for this example to run correctly
#' if (requireNamespace("Seqinfo", quietly=TRUE) &&
#'      requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) {
#'
#'     ## Chromosome length information
#'     ## chr23 is chrX, chr24 is chrY and chrM is 25
#'     chrInfo <- Seqinfo::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25]
#'
#'     \donttest{
#'
#'         runRNAAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF,
#'             pathProfileGDS=pathProfileGDS,
#'             pathGeno=pathGeno,
#'             pathOut=pathOut,
#'             fileReferenceGDS=fileReferenceGDS,
#'             fileReferenceAnnotGDS=fileAnnotGDS,
#'             chrInfo=chrInfo,
#'             syntheticRefDF=dataRef,
#'             blockTypeID="GeneS.Ensembl.Hsapiens.v86",
#'             genoSource="snp-pileup")
#'
#'         unlink(pathProfileGDS, recursive=TRUE, force=TRUE)
#'         unlink(pathOut, recursive=TRUE, force=TRUE)
#'
#'     }
#' }
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom utils write.csv
#' @importFrom rlang arg_match
#' @encoding UTF-8
#' @export
runRNAAncestry <- function(pedStudy, studyDF, pathProfileGDS,
            pathGeno, pathOut, fileReferenceGDS, fileReferenceAnnotGDS,
            chrInfo, syntheticRefDF,
            genoSource=c("snp-pileup", "generic", "VCF"), np=1L,
            blockTypeID, verbose=FALSE) {

    ## Validate parameters
    validateRunExomeOrRNAAncestry(pedStudy=pedStudy, studyDF=studyDF,
        pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut=pathOut,
        fileReferenceGDS=fileReferenceGDS,
        fileReferenceAnnotGDS=fileReferenceAnnotGDS, chrInfo=chrInfo,
        syntheticRefDF=syntheticRefDF, genoSource=genoSource, verbose=verbose)

    genoSource <- arg_match(genoSource)

    r <- runWrapperAncestry(pedStudy, studyDF, pathProfileGDS,
        pathGeno, pathOut, fileReferenceGDS, fileReferenceAnnotGDS,
        chrInfo, syntheticRefDF, genoSource, studyType="RNA", np=np,
        blockTypeID=blockTypeID, verbose)

    ## Successful
    return(r)
}

#' @title Run most steps leading to the ancestry inference call on a specific
#' RNA profile
#'
#' @description This function runs most steps leading to the ancestry inference
#' call on a specific RNA profile. First, the function creates the
#' Profile GDS file for the specific profile using the information from a
#' RDS Sample description file and the Population Reference GDS file.
#'
#' @param profileFile a \code{character} string representing the path and the
#' file name of the genotype file or the bam if genoSource is snp-pileup the
#' fine extension must be .txt.gz, if VCF the extension must be .vcf.gz
#'
#' @param pathProfileGDS a \code{character} string representing the path to
#' the directory where the GDS Profile files will be created.
#' Default: \code{NULL}.
#'
#' @param fileReferenceGDS  a \code{character} string representing the file
#' name of the Population Reference GDS file. The file must exist.
#'
#' @param fileReferenceAnnotGDS a \code{character} string representing the
#' file name of the Population Reference GDS Annotation file. The file
#' must exist.
#'
#' @param chrInfo a \code{vector} of positive \code{integer} values
#' representing the length of the chromosomes. See 'details' section.
#'
#' @param syntheticRefDF a \code{data.frame} containing a subset of
#' reference profiles for each sub-population present in the Reference GDS
#' file. The \code{data.frame} must have those columns:
#' \describe{
#' \item{sample.id}{ a \code{character} string representing the sample
#' identifier. }
#' \item{pop.group}{ a \code{character} string representing the
#' subcontinental population assigned to the sample. }
#' \item{superPop}{ a \code{character} string representing the
#' super-population assigned to the sample. }
#' }
#'
#' @param genoSource a \code{character} string with four possible values:
#' 'snp-pileup', 'generic', 'VCF' or 'bam'. It specifies if the genotype files
#' are generated by snp-pileup (Facets) or are a generic format CSV file
#' with at least those columns:
#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'.
#' The 'Count' is the depth at the specified position;
#' 'FileR' is the depth of the reference allele and
#' 'File1A' is the depth of the specific alternative allele.
#' Finally the file can be a VCF file with at least those genotype
#' fields: GT, AD, DP.
#'
#' @param np a single positive \code{integer} specifying the number of
#' threads to be used. Default: \code{1L}.
#'
#' @param blockTypeID a \code{character} string corresponding to the block
#' type used to extract the block identifiers. The block type must be
#' present in the GDS Reference Annotation file.
#'
#' @param verbose a \code{logical} indicating if messages should be printed
#' to show how the different steps in the function. Default: \code{FALSE}.
#'
#' @return a \code{list} containing 4 entries:
#' \describe{
#' \item{\code{pcaSample}}{ a \code{list} containing the information related
#' to the eigenvectors. The \code{list} contains those 3 entries:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current profile.}
#' \item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing
#' the eigenvectors for the reference profiles.}
#' \item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the
#' eigenvectors for the current profile projected on the PCA from the
#' reference profiles.}
#' }
#' }
#' \item{\code{paraSample}}{ a \code{list} containing the results with
#' different \code{D} and \code{K} values that lead to optimal parameter
#' selection. The \code{list} contains those entries:
#' \describe{
#' \item{\code{dfPCA}}{ a \code{data.frame} containing statistical results
#' on all combined synthetic results done with a fixed value of \code{D} (the
#' number of dimensions). The \code{data.frame} contains those columns:
#' \describe{
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions).}
#' \item{\code{median}}{ a \code{numeric} representing the median of the
#' minimum AUROC obtained (within super populations) for all combination of
#' the fixed \code{D} value and all tested \code{K} values. }
#' \item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum
#' AUROC obtained (within super populations) for all combination of the fixed
#' \code{D} value and all tested \code{K} values. }
#' \item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile
#' of the minimum AUROC obtained (within super populations) for all
#' combination of the fixed \code{D} value and all tested \code{K} values. }
#' \item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value
#' (the number of neighbors) for a fixed \code{D} value. }
#' }
#' }
#' \item{\code{dfPop}}{ a \code{data.frame} containing statistical results on
#' all combined synthetic results done with different values of \code{D} (the
#' number of dimensions) and \code{K} (the number of neighbors).
#' The \code{data.frame} contains those columns:
#' \describe{
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions).}
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors).}
#' \item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy
#' obtained by grouping all the synthetic results by super-populations, for
#' the specified values of \code{D} and \code{K}.}
#' \item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained
#' by grouping all the synthetic results for the specified values of \code{D}
#' and \code{K}.}
#' \item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy
#' of the confusion matrix obtained by grouping all the synthetic results for
#' the specified values of \code{D} and \code{K}.}
#' }
#' }
#' \item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by
#' super-population. The \code{data.frame} contains
#' those columns:
#' \describe{
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions).}
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors).}
#' \item{\code{Call}}{ a \code{character} string representing the
#' super-population.}
#' \item{\code{L}}{ a \code{numeric} representing the lower value of the 95%
#' confidence interval for the AUROC obtained for the fixed values of
#' super-population, \code{D} and \code{K}.}
#' \item{\code{AUROC}}{ a \code{numeric} representing  the AUROC obtained for
#' the fixed values of super-population, \code{D} and \code{K}.}
#' \item{\code{H}}{ a \code{numeric} representing the higher value of the 95%
#' confidence interval for the AUROC obtained for the fixed values of
#' super-population, \code{D} and \code{K}.}
#' }
#' }
#' \item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value
#' (the number of dimensions) for the specific profile.}
#' \item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value
#' (the number of neighbors) for the specific profile.}
#' \item{\code{listD}}{ a \code{numeric} representing the optimal \code{D}
#' values (the number of dimensions) for the specific profile. More than one
#' \code{D} is possible.}
#' }
#' }
#' \item{\code{KNNSample}}{  a \code{data.frame} containing the inferred
#' ancestry for different values of \code{K} and \code{D}. The
#' \code{data.frame} contains those columns:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current profile.}
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions) used to infer the ancestry. }
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors) used to infer the ancestry. }
#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred
#' ancestry for the specified \code{D} and \code{K} values.}
#' }
#' }
#' \item{\code{KNNSynthetic}}{  a \code{data.frame} containing the inferred
#' ancestry for each synthetic data for different values of \code{K} and
#' \code{D}.
#' The \code{data.frame}
#' contains those columns:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current synthetic data.}
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions) used to infer the ancestry. }
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors) used to infer the ancestry. }
#' \item{\code{infer.superPop}}{ a \code{character} string representing the
#' inferred ancestry for the specified \code{D} and \code{K} values.}
#' \item{\code{ref.superPop}}{ a \code{character} string representing the known
#' ancestry from the reference}
#' }
#' }
#' \item{\code{Ancestry}}{ a \code{data.frame} containing the inferred
#' ancestry for the current profile. The \code{data.frame} contains those
#' columns:
#' \describe{
#' \item{\code{sample.id}}{ a \code{character} string representing the unique
#' identifier of the current profile.}
#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
#' number of dimensions) used to infer the ancestry.}
#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
#' number of neighbors) used to infer the ancestry.}
#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred
#' ancestry.}
#' }
#' }
#' }
#'
#' @details
#'
#' The runExomeAncestry() function generates 3 types of files
#' in the OUTPUT directory.
#' \describe{
#' \item{Ancestry Inference}{ The ancestry inference CSV file
#' (".Ancestry.csv" file)}
#' \item{Inference Informaton}{ The inference information RDS file
#' (".infoCall.rds" file)}
#' \item{Synthetic Information}{ The parameter information RDS files
#' from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)}
#' }
#'
#' In addition, a sub-directory (named using the profile ID) is
#' also created.
#'
#' @references
#'
#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ,
#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution
#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72.
#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
#'
#' @examples
#'
#' ## Required library for GDS
#' library(SNPRelate)
#'
#' ## Path to the demo 1KG GDS file is located in this package
#' dataDir <- system.file("extdata", package="RAIDS")
#'
#'
#' #################################################################
#' ## The 1KG GDS file and the 1KG SNV Annotation GDS file
#' ## need to be located in the same directory
#' ## Note that the 1KG GDS file used for this example is a
#' ## simplified version and CANNOT be used for any real analysis
#' #################################################################
#' path1KG <- file.path(dataDir, "tests")
#'
#' fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds")
#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds")
#'
#' #################################################################
#' ## The Sample SNP pileup files (one per sample) need
#' ## to be located in the same directory.
#' #################################################################
#' demoProfileEx1 <- file.path(dataDir, "example", "snpPileup", "ex1.txt.gz")
#'
#' #################################################################
#' ## The path where the Profile GDS Files (one per sample)
#' ## will be created need to be specified.
#' #################################################################
#' pathProfileGDS <- file.path(tempdir(), "out.tmp")
#'
#' ####################################################################
#' ## Fix seed to ensure reproducible results
#' ####################################################################
#' set.seed(3043)
#'
#' gds1KG <- snpgdsOpen(fileReferenceGDS)
#' dataRef <- select1KGPop(gds1KG, nbProfiles=2L)
#' closefn.gds(gds1KG)
#'
#' ## Required library for this example to run correctly
#' if (requireNamespace("Seqinfo", quietly=TRUE) &&
#'      requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) {
#'
#'     ## Chromosome length information
#'     ## chr23 is chrX, chr24 is chrY and chrM is 25
#'     chrInfo <- Seqinfo::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25]
#'
#'     \donttest{
#'
#'         res <- inferAncestryGeneAware(profileFile=demoProfileEx1,
#'             pathProfileGDS=pathProfileGDS,
#'             fileReferenceGDS=fileReferenceGDS,
#'             fileReferenceAnnotGDS=fileAnnotGDS,
#'             chrInfo=chrInfo,
#'             syntheticRefDF=dataRef,
#'             blockTypeID="GeneS.Ensembl.Hsapiens.v86",
#'             genoSource="snp-pileup")
#'
#'         unlink(pathProfileGDS, recursive=TRUE, force=TRUE)
#'
#'     }
#' }
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom utils write.csv
#' @importFrom rlang arg_match
#' @encoding UTF-8
#' @export
inferAncestryGeneAware <- function(profileFile, pathProfileGDS,
                fileReferenceGDS, fileReferenceAnnotGDS,
                chrInfo, syntheticRefDF,
                genoSource=c("snp-pileup", "generic", "VCF", "bam"), np=1L,
                blockTypeID, verbose=FALSE) {

    profileBaseName <- basename(profileFile)
    pathGeno <- dirname(profileFile)

    genoSource <- arg_match(genoSource)

    # if(genoSource == "bam"){
    #     stop("The bam is not release yet look to get a \'Devel\' version ",
    #             "or contact us")
    # }

    profileName <- gsub("\\.gz$", "", profileBaseName, ignore.case = TRUE)
    for(extCur in c( "\\.vcf$", "\\.txt$", "\\.bam", "\\.tsv", "\\.csv")){
        profileName <- gsub(extCur, "", profileName, ignore.case = TRUE)
    }
    #profileName <- "profile"
    studyDF <- data.frame(study.id="NotDef",
                          study.desc="NotDef",
                          study.platform="NotDef",
                          stringsAsFactors=FALSE)
    pedStudy <- data.frame(Name.ID=c(profileName),
                           Case.ID=c(profileName),
                           Sample.Type=c("RNA"),
                           Diagnosis="NotDef",
                           Source=c("NotDef"),
                           stringsAsFactors=FALSE)
    row.names(pedStudy) <- pedStudy$Name.ID

    ## Validate parameters
    validateRunExomeOrRNAAncestry(pedStudy=pedStudy, studyDF=studyDF,
        pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut="./",
        fileReferenceGDS=fileReferenceGDS,
        fileReferenceAnnotGDS=fileReferenceAnnotGDS, chrInfo=chrInfo,
        syntheticRefDF=syntheticRefDF, genoSource=genoSource, verbose=verbose)


    if(genoSource %in% c("snp-pileup", "generic", "VCF", "bam")){

        r <- wrapperAncestry(pedStudy, studyDF, pathProfileGDS,
            profileFile, fileReferenceGDS, fileReferenceAnnotGDS,
            chrInfo, syntheticRefDF, genoSource=genoSource, studyType="GeneAware", np=np,
            blockTypeID=blockTypeID, verbose=verbose)
    }else{
        stop(paste0("The format ", genoSource," is not implemented yet\n"))
    }
    ## Successful
    return(r)
}

