
#' Reading in allele quantification data into SingleCellAlleleExperiment object
#'
#' @description
#' Main read in function for reading in allele quantification data and
#' loading the data into a `SingleCellAlleleExperiment` object.
#'
#' @details
#' The SingleCellAlleleExperiment data structure serves as a data representation
#' for data generated with the `scIGD` workflow.
#' This workflow allows for the quantification of expression and interactive
#' exploration of donor-specific alleles of different immune genes.
#'
#' Input data generated by the `scIGD` workflow are stored in a shared folder.
#' Expected naming scheme of the files from the data generating method:
#'
#'    * quantification matrix: `cells_x_genes.mtx`
#'    * barcode information: `cells_x_genes.barcodes.txt`
#'    * feature information: `cells_x_genes.genes.txt`
#'    * allele lookup table: `lookup_table.csv`
#'
#' File identifiers can be specifically stated if renamed.
#'
#' Optional features:
#'
#' - Filtering: Used parameter is `filter_mode`. Default filtering is performed
#' with a threshold=0 UMIs. `filter_mode="yes"` performs advanced filtering based
#' on ranking the barcodes and inferring an inflection point of a
#' [**knee plot**](https://liorpachter.wordpress.com/tag/knee-plot/). Information
#' regarding the knee plot is exported in the `metadata(scae)[["knee_info"]]` slot
#' for later plotting (see vignette).
#' - Computing a `logcount` assay by normalizing the input data based on a
#' sizeFactor method recommended for single-cell data. Used parameter is
#' `log=TRUE/FALSE`.
#' - Computing additional gene symbols in case the input data only contains gene
#' identifiers represented as Ensembl ids. Used parameter is `gene_symbols=TRUE/FALSE`.
#'
#' @seealso [SingleCellAlleleExperiment]
## \code{\link{https://github.com/AGImkeller/scIGD/}},
## for information about the "single-cell ImmunoGenomic Diversity" **scIGD** workflow.
#'
#' @param samples_dir A character string determining the path to one directory
#' containing input files.
#' @param sample_names A character string for a sample identifier.
#' Can be used to describe the used dataset or sample.
#' @param filter_mode A vector containing three character strings that describe
#' different options for filtering. The value `"yes"` uses the inflection point
#' of the knee plot to filter out low-quality cells.
#' The default value `"no"` performs filtering on a `threshold=0`.
#' The value `"custom"` allows for setting a custom threshold in
#' the `filter_threshold` parameter.
#' @param lookup_file The allele lookup table. The examples below pass a
#' data.frame obtained by reading the lookup table file with `read.csv()`.
#' @param barcode_file A character string determining the name of the file
#' containing the barcode identifiers.
#' @param gene_file A character string determining the name of the file
#' containing the feature identifiers.
#' @param matrix_file A character string determining the name of the file
#' containing the count matrix.
#' @param filter_threshold An integer value used as a threshold for filtering
#' low-quality barcodes/cells. Standard value is `NULL` when using
#' `filter_mode="yes"` or `filter_mode="no"`.
#' Value must be provided when using `filter_mode="custom"`.
#' @param log A logical parameter to decide if logcounts assay should be computed
#' based on library factors computed with `scuttle::computeLibraryFactors()`.
#' @param gene_symbols A logical parameter to decide whether to compute additional
#' gene symbols in case the raw data only contains ENSEMBL gene identifiers.
#' @param verbose A logical parameter to decide if additional runtime-messages
#' should be shown during function execution. Use `FALSE` if no info
#' runtime-messages should be shown (default), and `TRUE` for showing runtime-messages.
#' @param BPPARAM A BiocParallelParam object specifying how loading should be
#' parallelized for multiple samples.
#'
#' @importFrom BiocParallel SerialParam bplapply
#' @importFrom S4Vectors DataFrame ROWNAMES
#' @return A SingleCellAlleleExperiment object.
#' @examples
#' example_data_5k <- scaeData::scaeDataGet(dataset="pbmc_5k")
#' lookup_name <- "pbmc_5k_lookup_table.csv"
#' lookup <- read.csv(system.file("extdata", lookup_name, package="scaeData"))
#'
#'
#' # preflight mode, default filtering with a threshold of 0 UMI counts
#' scae_preflight <- read_allele_counts(example_data_5k$dir,
#'                           sample_names="example_data",
#'                           filter_mode="no",
#'                           lookup_file=lookup,
#'                           barcode_file=example_data_5k$barcodes,
#'                           gene_file=example_data_5k$features,
#'                           matrix_file=example_data_5k$matrix,
#'                           filter_threshold=NULL)
#'
#' scae_preflight
#'
#' # automatic filtering mode, filtering out low-quality cells
#' # on the inflection point of the knee plot
#' #scae_filtered <- read_allele_counts(example_data_5k$dir,
#' #                         sample_names="example_data",
#' #                         filter_mode="yes",
#' #                         lookup_file=lookup,
#' #                         barcode_file=example_data_5k$barcodes,
#' #                         gene_file=example_data_5k$features,
#' #                         matrix_file=example_data_5k$matrix,
#' #                         filter_threshold=NULL,
#' #                         verbose=TRUE)
#'
#' # scae_filtered
#'
#' # custom filtering mode, setting up a custom filter threshold for filtering
#' # scae_custom_filter <- read_allele_counts(example_data_5k$dir,
#' #                         sample_names="example_data",
#' #                         filter_mode="custom",
#' #                         lookup_file=lookup,
#' #                         barcode_file=example_data_5k$barcodes,
#' #                         gene_file=example_data_5k$features,
#' #                         matrix_file=example_data_5k$matrix,
#' #                         filter_threshold=200)
#'
#' # scae_custom_filter
#' @export
read_allele_counts <- function(samples_dir,
                               sample_names=names(samples_dir),
                               filter_mode=c("no", "yes", "custom"),
                               lookup_file=lookup,
                               barcode_file="cells_x_genes.barcodes.txt",
                               gene_file="cells_x_genes.genes.txt",
                               matrix_file="cells_x_genes.mtx",
                               filter_threshold=NULL,
                               log=FALSE,
                               gene_symbols=FALSE,
                               verbose=FALSE,
                               BPPARAM=BiocParallel::SerialParam()){

  ## NOTE(review): the default `lookup_file=lookup` refers to an object called
  ## `lookup` that is not defined inside this function; callers are expected to
  ## pass the lookup table explicitly -- confirm whether a default is intended.

  ## Collapse the default choice vector to one validated string. Without this
  ## the scalar comparisons and the `switch()` below receive a length-3 vector
  ## whenever `filter_mode` is left at its default.
  filter_mode <- match.arg(filter_mode)

  ## 'DropletUtils' is an optional dependency that is only required for the
  ## knee-plot based filtering
  if (filter_mode == "yes" &&
      !requireNamespace("DropletUtils", quietly=TRUE)) {
    stop("Package 'DropletUtils' needed when using 'filter_mode=\"yes\"'.
           Install: BiocManager::install(\"DropletUtils\")")
  }

  ## input checks for optional package installation of optional functionalities
  check_valid_optional_package(log=log, gene_symbols=gene_symbols)

  ## fall back to the directory path if no sample identifier is available
  if (is.null(sample_names)) {
    sample_names <- samples_dir
  }

  ## custom filtering is meaningless without a user supplied threshold
  if (filter_mode == "custom" && is.null(filter_threshold)) {
    stop("For custom filtering you need to state a integer value >0
         in the 'filter_threshold' parameter.")
  }

  ## reading in files
  load_out <- BiocParallel::bplapply(samples_dir,
                                     FUN=read_from_sparse_allele,
                                     barcode_file=barcode_file,
                                     gene_file=gene_file,
                                     matrix_file=matrix_file,
                                     BPPARAM=BPPARAM)

  ## NOTE: only the first loaded sample is used from here on
  current <- load_out[[1]]
  full_data <- current$mat
  feature_info <- current$feature_info
  cell_names <- current$cell_names
  exp_type <- current$exp_type

  ## prepare colData; `cell_names` is a data.frame with one row per barcode,
  ## so the sample label has to be repeated once per row (not per column)
  cell_info_list <- S4Vectors::DataFrame(Sample=rep(sample_names,
                                                    nrow(cell_names)),
                                         Barcode=cell_names$V1,
                                         row.names=NULL)

  ## prepare rowData: features are identified by the first column
  rownames(feature_info) <- feature_info[, 1]

  ## columns of the count matrix are labelled with the barcodes
  colnames(full_data) <- cell_info_list$Barcode
  full_data <- as(full_data, "CsparseMatrix")

  lookup <- lookup_file

  ## defaults used for `filter_mode="no"`: threshold 0, no knee plot info
  inflection_threshold <- 0
  knee_list <- NULL

  switch(filter_mode,
         "no"={
           message("Filtering performed on default value at 0 UMI counts.")
         },
         "yes"={
           knee_list <- get_knee_info(full_data, feature_info, cell_names)
           inflection_threshold <- knee_list$inflection_point
           message("Filtering performed based on the inflection point at: ",
                   inflection_threshold, " UMI counts.")
         },
         "custom"={
           inflection_threshold <- filter_threshold
         })

  if (verbose){
    message("Data Read_in completed")
  }

  scae <- SingleCellAlleleExperiment(assays=list(counts=full_data),
                                     rowData=feature_info,
                                     colData=cell_info_list,
                                     metadata=knee_list,
                                     threshold=inflection_threshold,
                                     exp_type=exp_type,
                                     lookup=lookup,
                                     log=log,
                                     gene_symbols=gene_symbols,
                                     verbose=verbose)

  if (verbose){
    message("SingleCellAlleleExperiment object completed")
  }

  scae
}


# Inspired from https://github.com/MarioniLab/DropletUtils/blob/devel/R/read10xCounts.R
#' Reading in allele-aware quantification data
#'
#' @description
#' Internal function used in `read_allele_counts()` that reads in the data
#' stated in the given directory path.
#'
#' @param path A character string determining the path to the directory
#' containing the input files.
#' @param barcode_file A character string determining the name of the file
#' containing the barcode identifiers.
#' @param gene_file A character string determining the name of the file
#' containing the feature identifiers.
#' @param matrix_file A character string determining the name of the file
#' containing the count matrix.
#'
#' @importFrom utils read.delim read.csv
#' @importFrom Matrix readMM t
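#' @return A list with four elements: the transposed count matrix (`mat`),
#' the barcode identifiers (`cell_names`), the feature identifiers
#' (`feature_info`) and the detected identifier type (`exp_type`).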
read_from_sparse_allele <- function(path,
                                    barcode_file,
                                    gene_file,
                                    matrix_file){

  ## all three input files are looked up inside the sample directory `path`
  in_files <- list(features=file.path(path, gene_file),
                   barcodes=file.path(path, barcode_file),
                   counts=file.path(path, matrix_file))

  ## header-less feature and barcode tables plus the count matrix
  genes_df    <- utils::read.delim(in_files$features, header=FALSE)
  barcodes_df <- utils::read.csv(in_files$barcodes, sep="", header=FALSE)
  counts      <- Matrix::readMM(in_files$counts)

  ## the identifier type is decided from the first feature entry only:
  ## entries containing "ENS" are treated as Ensembl ids, anything else
  ## as gene symbols
  id_type <- if (grepl("ENS", genes_df$V1[1])) "ENS" else "noENS"
  colnames(genes_df) <- c(ENS="Ensembl_ID", noENS="Symbol")[[id_type]]

  ## the matrix is handed back transposed relative to the file layout
  list(mat=Matrix::t(counts),
       cell_names=barcodes_df,
       feature_info=genes_df,
       exp_type=id_type)
}

#' Knee plot info
#'
#' @description
#' Computes knee plot information, ranking the barcodes according to their
#' total UMI count. The information is later on passed to the
#' `metadata(scae)[["knee_info"]]` slot.
#'
#' @param matrix A sparse \code{\link[Matrix]{Matrix}} object containing
#' the quantification data.
#' @param genes A data.frame object containing gene identifiers.
#' @param barcodes A data.frame object containing barcode identifiers.
#'
#' @importFrom S4Vectors metadata
#'
#' @return A list including a data.frame with barcode rank information,
#' the corresponding knee and inflection point.
get_knee_info <- function(matrix, genes, barcodes){

  ## NOTE: `genes` and `barcodes` are kept in the signature for the existing
  ## callers but are not needed to compute the barcode ranks.

  ## advanced knee plot: barcode rank statistics computed on the count matrix
  br_out <- DropletUtils::barcodeRanks(matrix)

  ## knee and inflection point are read from the metadata of the rank result
  br_meta <- S4Vectors::metadata(br_out)
  knee_point <- br_meta$knee
  inflection_point <- br_meta$inflection

  ## combine the rank table with both thresholds; callers access the
  ## inflection point via `$inflection_point`
  knee_list <- c(knee_df=br_out,
                 knee_point=knee_point,
                 inflection_point=inflection_point)
  knee_list
}

