#' @title Processing Hi-C paired-end fastq files in R
#'
#' @name HiCool
#' @aliases importHiCoolFolder
#' @aliases getHiCoolArgs
#' @aliases getHicStats
#' 
#' @description 
#' `HiCool::HiCool()` automatically processes paired-end HiC sequencing files 
#' by performing the following steps: 
#' 
#' 1. Automatically setting up an appropriate conda environment using basilisk;  
#' 2. Mapping the reads to the provided genome reference using `hicstuff` and filtering of irrelevant pairs;  
#' 3. Filtering the resulting pairs file to remove unwanted chromosomes (e.g. chrM);  
#' 4. Binning the filtered pairs into a cool file at a chosen resolution;  
#' 5. Generating a multi-resolution mcool file;  
#' 6. Normalizing matrices at each resolution by iterative correction using cooler.
#' 
#' The filtering strategy used by `hicstuff` is described in Cournac et al., BMC Genomics 2012. 
#' 
#' @section HiCool utils:
#' - `importHiCoolFolder(folder, hash)` automatically finds the different processed files 
#' associated with a specific HiCool::HiCool() processing hash ID.  
#' - `getHiCoolArgs()` parses the log file generated by `HiCool::HiCool()` during 
#' processing to recover which arguments were used.  
#' - `getHicStats()` parses the log file generated by `HiCool::HiCool()` during 
#' processing to recover pre-computed stats about pair numbers, filtering 
#' thresholds, etc.  
#' 
#' @param log Path to log file generated by hicstuff/hicool
#' @param r1 Path to fastq file (R1 read)
#' @param r2 Path to fastq file (R2 read)
#' @param genome Genome used to map the reads on, provided either 
#'   as a fasta file (in which case the bowtie2 index will be automatically 
#'   generated), or as a prefix to a bowtie2 index (e.g. `mm10` for 
#'   `mm10.*.bt2` files). Genome can also be a unique ID for the following 
#'   references: `hg38`, `mm10`, `dm6`, `R64-1-1`, `GRZc10`, `WBcel235`, 
#'   `Galgal4`.
#' @param binning First resolution used to bin the final mcool file 
#'   (Default: 10000 for `hg38` and `mm10`, 1000 for `dm6`, `R64-1-1`, ...)
#' @param restriction Restriction enzyme(s) used in HiC (Default: "DpnII,HinfI")
#' @param iterative Should the read mapping be performed iteratively? 
#'   (Default: TRUE)
#' @param balancing_args Balancing arguments for cooler. 
#' See `cooler` documentation [here](https://cooler.readthedocs.io/en/latest/cli.html#cooler-balance)
#' for a list of all available balancing arguments. 
#' These defaults match those used by the 4DN consortium. 
#' @param threads Number of CPUs used for parallelization. (Default: 1)
#' @param exclude_chr Chromosomes excluded from the final .mcool file. This will 
#'   not affect the pairs file. (Default: "Mito|chrM|MT")
#' @param output Path to output directory where processed files will 
#'   be created. (Default: `./HiCool`)
#' @param keep_bam Should the bam files be kept? (Default: FALSE)
#' @param build_report Should an automated report be computed? (Default: TRUE)
#' @param scratch Path to temporary directory where processing will take place. 
#'   (Default: `tempdir()`)
#' @param output Output folder used by HiCool.
#' @param hash Unique 6-letter ID used to identify files from a specific 
#'   HiCool processing run.
#' @param resolution Resolution used to import the mcool file
#' 
#' @return A `CoolFile` object with prefilled `pairsFile` and `metadata` slots.
#' 
#' @importClassesFrom HiCExperiment CoolFile
#' @importFrom HiCExperiment CoolFile
#' @importFrom stringr str_replace_all
#' @importFrom utils read.delim
#' @importFrom basilisk.utils createEnvironment
#' @importFrom reticulate use_condaenv
#' @importFrom reticulate import 
#' @importFrom reticulate py_capture_output
#' @export
#' 
#' @examples 
#' r1 <- HiContactsData::HiContactsData(sample = 'yeast_wt', format = 'fastq_R1')
#' r2 <- HiContactsData::HiContactsData(sample = 'yeast_wt', format = 'fastq_R2')
#' hcf <- HiCool(r1, r2, genome = 'R64-1-1', output = './HiCool/')
#' hcf
#' getHiCoolArgs(metadata(hcf)$log)
#' getHicStats(metadata(hcf)$log)
#' readLines(metadata(hcf)$log)
NULL

#' @rdname HiCool
#' @export 

HiCool <- function(
    r1 = '~/repos/tinyMapper/tests/testHiC_R1.fq.gz', 
    r2 = '~/repos/tinyMapper/tests/testHiC_R2.fq.gz', 
    genome = 'R64-1-1', 
    restriction = 'DpnII,HinfI', 
    binning = NULL, 
    iterative = TRUE, 
    balancing_args = " --min-nnz 10 --mad-max 5 ", 
    threads = 1L, 
    exclude_chr = 'Mito|chrM|MT', 
    output = 'HiCool', 
    keep_bam = FALSE, 
    build_report = TRUE, 
    scratch = tempdir()  
)
{
    # Entry point of the package: resolves the input paths, sets up the
    # python environment, delegates the actual processing to .processFastq(),
    # imports the generated files with importHiCoolFolder() and optionally
    # builds a report with HiCReport(). Returns the object produced by
    # importHiCoolFolder().
    # NOTE(review): the default values of `r1`/`r2` point to a developer-local
    # path; they are kept here only to leave the signature unchanged.

    ###############################################
    ## ------------- Correct paths ------------- ##
    ###############################################
    # Fail early (before the conda environment is set up) if a fastq file is 
    # missing, rather than only emitting a normalizePath() warning.
    fastq_found <- file.exists(c(r1, r2))
    if (!all(fastq_found)) {
        stop(
            "Fastq file(s) not found: ", 
            paste(c(r1, r2)[!fastq_found], collapse = ", "), 
            call. = FALSE
        )
    }
    r1 <- normalizePath(r1, mustWork = TRUE)
    r2 <- normalizePath(r2, mustWork = TRUE)
    # The output folder may not exist yet: it is created during processing.
    output <- normalizePath(output, mustWork = FALSE)
    # NOTE(review): .checkGenome() is defined elsewhere; presumably it resolves 
    # `genome` into something hicstuff can use -- confirm.
    genome <- .checkGenome(genome)

    ###############################################
    ## -------- Get path to python bins -------- ##
    ###############################################
    # `HiCool_args` is defined elsewhere in the package and holds the arguments 
    # passed to basilisk.utils::createEnvironment().
    env_dir <- do.call(basilisk.utils::createEnvironment, HiCool_args)
    reticulate::use_condaenv(env_dir, required = TRUE)
    hs <- reticulate::import("hicstuff")

    ###############################################
    ## --------- Process reads ----------------- ##
    ###############################################
    # .processFastq() returns the hash identifying the files of this run.
    hash <- .processFastq(
        hs = hs,
        r1 = r1, 
        r2 = r2, 
        genome = genome, 
        binning = binning, 
        restriction = restriction, 
        iterative = iterative, 
        balancing_args = balancing_args, 
        threads = threads, 
        output = output, 
        exclude_chr = exclude_chr, 
        keep_bam = keep_bam, 
        scratch = scratch  
    )
    hcf <- importHiCoolFolder(output, hash)
    message("HiCool :: .fastq to .mcool processing done!")
    message("HiCool :: Check ", output, " folder to find the generated files")

    ##################################################
    ## --------- Generate report ------------------ ##
    ##################################################
    if (build_report) {
        message("HiCool :: Generating HiCool report. This might take a while.")
        HiCReport(hcf)
    }
    message("HiCool :: All processing successfully achieved. Congrats!")

    return(hcf)
}

# Internal worker called by HiCool(). 
# 
# Runs `hs$pipeline$full_pipeline()` in a run-specific temporary folder 
# (`<scratch>/<hash>`), prepends the HiCool arguments to the hicstuff log file, 
# then copies the generated files (mcool, optionally bam, pairs, plots, log) 
# into sub-folders of `output`. The temporary folder is removed on exit.
# 
# Arguments mirror those of HiCool(); `hs` is the imported `hicstuff` python 
# module. 
# NOTE(review): in this function `balancing_args` is only written to the log; 
# it is not forwarded to the hicstuff call -- confirm this is intended.
# 
# Returns the random 6-character hash identifying the files of this run.
.processFastq <- function(
    hs,
    r1, 
    r2, 
    genome, 
    binning, 
    restriction, 
    iterative, 
    balancing_args, 
    threads, 
    output, 
    exclude_chr, 
    keep_bam, 
    scratch  
) {

    ##############################################
    ## ----------- Define variables ----------- ##
    ##############################################

    hash <- paste0(sample(c(LETTERS, 0:9), 6, replace = TRUE), collapse = '')
    tmp_folder <- file.path(scratch, hash)
    message("HiCool :: Initializing processing of fastq files [tmp folder: ", tmp_folder, "]...")
    # Prefix shared by all output files: <sample>^mapped-<genome>^<hash>
    prefix <- paste0(
        gsub('[._][rR][12].*', '', basename(r1)), 
        '^mapped-', sub('\\.fa$', '', basename(genome)), 
        '^', hash
    )
    contact_map_mcool <- file.path(tmp_folder, paste0(prefix, '.mcool'))
    sinked_log <- file.path(tmp_folder, paste0(prefix, '.Rlog'))
    dir.create(tmp_folder, showWarnings = FALSE, recursive = TRUE)
    # `recursive = TRUE` is required: unlink() does not delete directories 
    # otherwise, and the temporary folder would be left behind.
    on.exit(unlink(tmp_folder, recursive = TRUE), add = TRUE)

    ###########################################################################
    ## ---- Automatically deduce appropriate binning if unspecified -------- ##
    ###########################################################################

    if (is.null(binning)) {
        binning <- if (grepl('hg38|mm10', genome, ignore.case = TRUE)) {
            10000
        } else if (grepl('dm6|R64-1-1|GRZc10|WBcel235|Galgal4', genome, ignore.case = TRUE)) {
            1000
        } else {
            stop(
                "Please specify a binning resolution using the `binning` argument."
            )
        }
    }
    # as.character() would turn e.g. 100000 into "1e+05"; format() keeps the 
    # plain digits and leaves character input untouched.
    binning_chr <- format(binning, scientific = FALSE, trim = TRUE)

    ###############################################
    ## -------- Map reads with hicstuff -------- ##
    ###############################################

    message("HiCool :: Mapping fastq files...")
    # The python output is captured and written to `sinked_log`. The call is 
    # evaluated lazily inside py_capture_output(), hence the pipe.
    hs$pipeline$full_pipeline(
        input1 = r1, 
        input2 = r2, 
        genome = genome, 
        enzyme = restriction, 
        filter_events = TRUE, 
        force = TRUE, 
        mapping = if (iterative) "iterative" else "normal",
        binning = binning_chr,
        # `exclude_chr` is a "|"-separated pattern; pass it comma-separated
        exclude = gsub("\\|", ",", exclude_chr),
        # keep intermediate files: they are copied to `output` below
        no_cleanup = TRUE,
        out_dir = tmp_folder, 
        pcr_duplicates = TRUE, 
        plot = TRUE, 
        prefix = prefix, 
        threads = threads,
        distance_law = TRUE
    ) |> reticulate::py_capture_output() |> write(sinked_log)

    log_file <- list.files(tmp_folder, pattern = paste0(hash, '.hicstuff_'), full.names = TRUE)
    if (length(log_file) != 1L) {
        stop(
            "Expected exactly one hicstuff log file in ", tmp_folder, 
            " but found ", length(log_file), ".", 
            call. = FALSE
        )
    }
    # Prepend the HiCool arguments to the hicstuff log, so that they can be 
    # recovered from the log file later on.
    hicstuff_log <- readLines(log_file)
    writeLines(c(
        paste0("HiCool working directory ::: ", getwd()),
        paste0("HiCool argument ::: r1: ", r1),
        paste0("HiCool argument ::: r2: ", r2),
        paste0("HiCool argument ::: genome: ", genome),
        paste0("HiCool argument ::: binning: ", binning),
        paste0("HiCool argument ::: restriction: ", restriction),
        paste0("HiCool argument ::: iterative: ", iterative),
        paste0("HiCool argument ::: balancing_args: ", balancing_args),
        paste0("HiCool argument ::: threads: ", threads),
        paste0("HiCool argument ::: output: ", output),
        paste0("HiCool argument ::: exclude_chr: ", exclude_chr),
        paste0("HiCool argument ::: keep_bam: ", keep_bam),
        paste0("HiCool argument ::: scratch: ", scratch),
        "----------------",
        hicstuff_log
    ), log_file)

    ##########################################
    ## -------- Tidy-up everything -------- ##
    ##########################################

    message("HiCool :: Tidying up everything for you...")

    # Matrices
    dir.create(file.path(output, 'matrices'), showWarnings = FALSE, recursive = TRUE)
    file.copy(
        contact_map_mcool, 
        file.path(output, 'matrices', paste0(prefix, '.mcool'))
    )

    # Bam (the forward bam is read as `.for.bam` and stored as `.fwd.bam`)
    if (keep_bam) {
        dir.create(file.path(output, 'bam'), showWarnings = FALSE, recursive = TRUE)
        file.copy(
            file.path(tmp_folder, 'tmp', paste0(prefix, '.for.bam')), 
            file.path(output, 'bam', paste0(prefix, '.fwd.bam'))
        )
        file.copy(
            file.path(tmp_folder, 'tmp', paste0(prefix, '.rev.bam')), 
            file.path(output, 'bam', paste0(prefix, '.rev.bam'))
        )
    }

    # Pairs: among all the pairs files found, the one with the longest path is kept
    # NOTE(review): presumably the longest name is the most processed pairs 
    # file -- confirm against hicstuff's naming scheme.
    dir.create(file.path(output, 'pairs'), showWarnings = FALSE, recursive = TRUE)
    pairs_files <- list.files(tmp_folder, pattern = '.pairs', recursive = TRUE, full.names = TRUE)
    file.copy(
        pairs_files[which.max(nchar(pairs_files))],
        file.path(output, 'pairs', paste0(prefix, '.pairs'))
    )

    # Plots
    dir.create(file.path(output, 'plots'), showWarnings = FALSE, recursive = TRUE)
    file.copy(
        list.files(tmp_folder, pattern = paste0(hash, '_event_distance.pdf'), recursive = TRUE, full.names = TRUE), 
        file.path(output, 'plots', paste0(prefix, '_event_distance.pdf'))
    )
    file.copy(
        list.files(tmp_folder, pattern = paste0(hash, '_event_distribution.pdf'), recursive = TRUE, full.names = TRUE), 
        file.path(output, 'plots', paste0(prefix, '_event_distribution.pdf'))
    )

    # Log
    dir.create(file.path(output, 'logs'), showWarnings = FALSE, recursive = TRUE)
    file.copy(
        log_file,
        file.path(output, 'logs', paste0(prefix, '.log'))
    )

    return(hash)
}
