#' Process binding profile inputs
#'
#' This internal function encapsulates the logic for loading binding profiles,
#' whether from file paths or from a list of GRanges objects. It performs
#' validation and returns a unified data.frame.
#'
#' @param binding_profiles_path Character vector. Path(s) to directories or file globs.
#' @param binding_profiles List of GRanges objects.
#' @return A dataframe containing the merged binding profiles.
#' @noRd
process_binding_profiles <- function(binding_profiles_path = NULL, binding_profiles = NULL) {
    # Validate that one and only one input type is provided
    if (is.null(binding_profiles_path) && is.null(binding_profiles)) {
        stop("Must supply one of binding_profiles_path or binding_profiles")
    }
    if (!is.null(binding_profiles_path) && !is.null(binding_profiles)) {
        stop("Provide only one of binding_profiles_path or binding_profiles, not both")
    }

    if (!is.null(binding_profiles_path)) {
        # Load from file paths
        message("Locating binding profile files")
        binding_files <- locate_files(binding_profiles_path, pattern = "\\.(bedgraph|bg)(\\.gz)?$")
        if (length(binding_files) == 0) {
            stop("No binding profile files (.bedgraph) found in the specified path(s).")
        }
        binding_profiles_data <- build_dataframes(binding_files)
    } else {
        # Load from a list of GRanges objects
        if (!is.list(binding_profiles) || is.null(names(binding_profiles))) {
            stop("binding_profiles must be a named list of GRanges objects")
        }
        if (!all(vapply(binding_profiles, function(x) is(x, "GRanges"), FUN.VALUE = logical(1)))) {
            stop("All binding_profiles list elements must be GRanges objects")
        }
        message("Building binding profile dataframe from supplied GRanges objects ...")
        binding_profiles_data <- build_dataframes_from_granges(binding_profiles)
    }

    # Convert back to GRanges post-merge
    binding_profiles_gr <- GenomicRanges::GRanges(
        seqnames = S4Vectors::Rle(binding_profiles_data$chr),
        ranges = IRanges::IRanges(start = binding_profiles_data$start, end = binding_profiles_data$end)
    )
    # Add the sample data to the metadata columns
    mcols(binding_profiles_gr) <- binding_profiles_data[, !(names(binding_profiles_data) %in% c("chr", "start", "end")), drop = FALSE]
    return(binding_profiles_gr)
}

#' Apply quantile normalisation
#'
#' This internal function applies quantile normalisation to a binding profile
#' data.frame if the user requests it.
#'
#' @param binding_profiles_data The data.frame of binding profiles.
#' @param quantile_norm Logical. If TRUE, normalisation is applied.
#' @return A data.frame, which is quantile-normalised if requested.
#' @noRd
apply_quantile_normalisation <- function(binding_profiles_data, quantile_norm) {
    if (isTRUE(quantile_norm)) {
        message("Applying quantile normalisation")
        signal_mat <- as.matrix(mcols(binding_profiles_data))

        if (ncol(signal_mat) == 0) {
            warning("No signal columns found for quantile normalisation. Returning original data.")
            return(binding_profiles_data)
        }

        qnorm_mat <- quantile_normalisation(signal_mat)
        colnames(qnorm_mat) <- paste0(colnames(signal_mat), "_qnorm")

        mcols(binding_profiles_data) <- as.data.frame(qnorm_mat)
    }
    return(binding_profiles_data)
}


#' Load genome-wide binding data and associated peak files or GRanges objects
#'
#' Reads DamID-seq log2 ratio binding data either from bedGraph files or
#' directly from a list of GRanges objects, and associated peak regions either
#' from GFF/bed files or from a list of GRanges objects.
#' This function is suitable for transcription factor binding analyses.
#' For peak discovery, use an external peak caller (e.g. 'find_peaks').
#'
#' One of `binding_profiles_path` or `binding_profiles` must be provided.
#' Similarly, one of `peaks_path` or `peaks` must be provided.
#'
#' When supplying GRanges lists, each GRanges should contain exactly one numeric
#' metadata column representing the binding signal, and all GRanges should be supplied
#' as a named list, with element names used as sample names.
#'
#' @param binding_profiles_path Character vector. Path(s) to directories or file
#'   globs containing log2 ratio binding tracks in bedGraph format. Wildcards ('*') supported.
#' @param peaks_path Character vector. Path(s) to directories or file globs containing
#'   the peak calls in GFF or BED format.
#' @param binding_profiles List of GRanges objects with binding profiles, one
#'   per sample.
#' @param peaks List of GRanges objects representing peak regions.
#' @param drop_samples A character vector of sample names or patterns to remove.
#'   Matching samples are removed from the analysis before normalisation and
#'   occupancy calculation. This can be useful for excluding samples that fail
#'   initial quality checks. Default: `NULL` (no samples are dropped).
#' @param maxgap_loci Integer, the maximum bp distance between a peak boundary
#'   and a gene to associate that peak with the gene. Default: 1000.
#' @param quantile_norm Logical (default: FALSE). If TRUE, quantile-normalise
#'   the signal columns across all datasets.
#' @param organism Organism string (lower case) to obtain genome annotation from
#'   (if not providing a custom `ensdb_genes` object)
#'   Default: "drosophila melanogaster".
#' @param ensdb_genes GRanges object: gene annotation. Automatically obtained
#'   from `organism` if NULL.
#' @param BPPARAM BiocParallel function (defaults to BiocParallel::bpparam())
#' @param plot_diagnostics Logical. If `TRUE` (the default in interactive sessions),
#'   diagnostic plots (PCA and correlation heatmap) will be generated and
#'   displayed for both the raw binding data and the summarised occupancy data.
#'
#' @return A list with components:
#'   \item{binding_profiles_data}{data.frame: Signal matrix for all regions, with columns chr, start, end, sample columns.}
#'   \item{peaks}{list(GRanges): All loaded peak regions from input files or directly supplied.}
#'   \item{pr}{GRanges: Reduced (union) peak regions across samples.}
#'   \item{occupancy}{data.frame: Binding values summarised over reduced peaks, with overlap annotations.}
#'   \item{test_category}{Character scalar; will be "bound".}
#'
#'
#' @examples
#' # Create a mock GRanges object for gene annotation
#' # This object, based on the package's unit tests, avoids network access
#' # and includes a very long gene to ensure overlaps with sample data.
#' mock_genes_gr <- GenomicRanges::GRanges(
#'     seqnames = S4Vectors::Rle("2L", 7),
#'     ranges = IRanges::IRanges(
#'         start = c(1000, 2000, 3000, 5000, 6000, 7000, 8000),
#'         end = c(1500, 2500, 3500, 5500, 6500, 7500, 20000000)
#'     ),
#'     strand = S4Vectors::Rle(GenomicRanges::strand(c("+", "-", "+", "+", "-", "-", "+"))),
#'     gene_id = c("FBgn001", "FBgn002", "FBgn003", "FBgn004", "FBgn005", "FBgn006", "FBgn007"),
#'     gene_name = c("geneA", "geneB", "geneC", "geneD", "geneE", "geneF", "LargeTestGene")
#' )
#'
#' # Get path to sample data files included with the package
#' data_dir <- system.file("extdata", package = "damidBind")
#'
#' # Run loading function using sample files and mock gene annotations
#' loaded_data <- load_data_peaks(
#'     binding_profiles_path = data_dir,
#'     peaks_path = data_dir,
#'     ensdb_genes = mock_genes_gr,
#'     quantile_norm = TRUE
#' )
#'
#' # View the structure of the output
#' str(loaded_data, max.level = 1)
#'
#' @export
load_data_peaks <- function(
        binding_profiles_path = NULL,
        peaks_path = NULL,
        binding_profiles = NULL,
        peaks = NULL,
        drop_samples = NULL,
        maxgap_loci = 1000,
        quantile_norm = FALSE,
        organism = "drosophila melanogaster",
        ensdb_genes = NULL,
        BPPARAM = BiocParallel::bpparam(),
        plot_diagnostics = interactive()) {
    if (is.null(ensdb_genes)) {
        ensdb_genes <- get_ensdb_genes(organism_keyword = organism)$genes
    }
    if (!is(ensdb_genes, "GRanges")) {
        stop("ensdb_genes must be supplied as a GRanges object.")
    }

    binding_profiles_data <- process_binding_profiles(binding_profiles_path, binding_profiles)

    # Validate and load peaks
    if (is.null(peaks_path) && is.null(peaks)) {
        stop("Must supply one of peaks_path or peaks")
    }
    if (!is.null(peaks_path) && !is.null(peaks)) {
        stop("Provide only one of peaks_path or peaks, not both")
    }

    if (!is.null(peaks_path)) {
        message("Locating peak files")
        peaks_files <- locate_files(peaks_path, pattern = "\\.(gff|bed)(\\.gz)?$")
        if (length(peaks_files) == 0) {
            stop("No peak files (.gff or .bed) found in the specified path(s).")
        }
        # Helper to strip all extensions for consistent naming with binding profiles
        strip_all_exts_recursive <- function(filepath) {
            lastpath <- ""
            while (filepath != lastpath) {
                lastpath <- filepath
                filepath <- file_path_sans_ext(filepath)
            }
            filepath
        }
        # Name the files so the resulting list is named.
        names(peaks_files) <- vapply(peaks_files, function(p) basename(strip_all_exts_recursive(p)), character(1))
        peaks <- lapply(peaks_files, import_peaks)
    } else {
        if (!is.list(peaks) || is.null(names(peaks))) {
            stop("peaks must be a named list of GRanges objects")
        }
        if (!all(vapply(peaks, function(x) is(x, "GRanges"), FUN.VALUE = logical(1)))) {
            stop("All peaks list elements must be GRanges objects")
        }
        message("Using supplied peaks GRanges list.")
    }

    # Consolidate loaded data for potential dropping
    temp_data_list <- list(
        binding_profiles_data = binding_profiles_data,
        peaks = peaks
    )

    # Drop samples if requested by the user.
    # This must happen before normalisation or peak reduction.
    if (!is.null(drop_samples)) {
        filtered_data <- ._drop_input_samples(temp_data_list, drop_samples)
        binding_profiles_data <- filtered_data$binding_profiles_data
        peaks <- filtered_data$peaks
    }

    binding_profiles_data <- apply_quantile_normalisation(binding_profiles_data, quantile_norm)

    # Process peaks and calculate occupancy
    pr <- reduce_regions(peaks)
    message("Calculating occupancy over peaks")
    occupancy <- calculate_occupancy(binding_profiles_data, pr, BPPARAM = BPPARAM)

    gene_overlaps <- all_overlaps_to_original(pr, ensdb_genes, maxgap = maxgap_loci)
    occupancy$gene_name <- gene_overlaps$genes
    if (!is.null(gene_overlaps$ids)) {
        occupancy$gene_id <- gene_overlaps$ids
    }

    result_list <- list(
        binding_profiles_data = binding_profiles_data,
        peaks = peaks,
        pr = pr,
        occupancy = occupancy,
        test_category = "bound"
    )
    if (isTRUE(plot_diagnostics)) {
        plot_input_diagnostics(result_list)
    }
    return(result_list)
}


#' Load genome-wide binding data for gene expression (RNA polymerase occupancy)
#'
#' Reads RNA Polymerase DamID binding profiles either from bedGraph files or
#' directly from a named list of GRanges objects. Calculates binding occupancy
#' summarised over genes.
#'
#' One of `binding_profiles_path` or `binding_profiles` must be provided.
#'
#' When supplying GRanges lists, each GRanges should contain exactly one numeric
#' metadata column representing the signal, and `binding_profiles` must be a
#' named list, with element names used as sample names.
#'
#' @param binding_profiles_path Character vector of directories or file globs
#'   containing log2 ratio binding tracks in bedGraph format. Wildcards ('*') supported.
#' @param binding_profiles Named list of GRanges objects representing binding profiles.
#' @param drop_samples A character vector of sample names or patterns to remove.
#'   Matching samples are removed from the analysis before normalisation and
#'   occupancy calculation. This can be useful for excluding samples that fail
#'   initial quality checks. Default: `NULL` (no samples are dropped).
#' @param quantile_norm Logical (default: FALSE) quantile-normalise across all
#'   signal columns if TRUE.
#' @param organism Organism string (lower case) to obtain genome annotation from (if not providing a custom `ensdb_genes` object)
#'   Defautls to "drosophila melanogaster".
#' @param calculate_fdr Calculate FDR based on RNA Pol occupancy (see details) (default: FALSE)
#' @param fdr_iterations Number of iterations to use to determine null model for FDR (default: 50000)
#' @param ensdb_genes GRanges object: gene annotation. Automatically obtained
#'   from `organism` if NULL.
#' @param BPPARAM BiocParallel function (defaults to BiocParallel::bpparam())
#' @param plot_diagnostics Logical. If `TRUE` (the default in interactive sessions),
#'   diagnostic plots (PCA and correlation heatmap) will be generated and
#'   displayed for both the raw binding data and the summarised occupancy data.
#'
#' @return List with elements:
#'   \item{binding_profiles_data}{data.frame of merged binding profiles, with chr, start, end, sample columns.}
#'   \item{occupancy}{data.frame of occupancy values summarised over genes.}
#'   \item{test_category}{Character scalar; will be "expressed".}
#'
#' @details
#' The algorithm for determining gene occupancy FDR (as a proxy for gene expression)
#' is based on `polii.gene.call`, which in turn was based on that described in
#' Southall et al. (2013). Dev Cell, 26(1), 101–12. doi:10.1016/j.devcel.2013.05.020.
#' Briefly, the algorithm establishes a null model by simulating the distribution of mean occupancy scores
#' from random fragments.  It fits a two-tiered regression to predict the False Discovery Rate (FDR), based
#' on fragment count and score. For each gene, the true weighted mean occupancy and fragment count are
#' calculated from the provided binding profile. Finally, the pre-computed regression models are used
#' to assign a specific FDR to each gene based on its observed occupancy and fragment count.
#'
#' @examples
#' # Create a mock GRanges object for gene annotations
#' # This object, based on the package's unit tests, avoids network access
#' # and includes a very long gene to ensure overlaps with sample data.
#' mock_genes_gr <- GenomicRanges::GRanges(
#'     seqnames = S4Vectors::Rle("2L", 7),
#'     ranges = IRanges::IRanges(
#'         start = c(1000, 2000, 3000, 5000, 6000, 7000, 8000),
#'         end = c(1500, 2500, 3500, 5500, 6500, 7500, 20000000)
#'     ),
#'     strand = S4Vectors::Rle(GenomicRanges::strand(c("+", "-", "+", "+", "-", "-", "+"))),
#'     gene_id = c("FBgn001", "FBgn002", "FBgn003", "FBgn004", "FBgn005", "FBgn006", "FBgn007"),
#'     gene_name = c("geneA", "geneB", "geneC", "geneD", "geneE", "geneF", "LargeTestGene")
#' )
#'
#' # Get path to sample data files included with the package
#' data_dir <- system.file("extdata", package = "damidBind")
#'
#' # Run loading function using sample files and mock gene annotations
#' # This calculates occupancy over genes instead of peaks.
#' loaded_data_genes <- load_data_genes(
#'     binding_profiles_path = data_dir,
#'     ensdb_genes = mock_genes_gr,
#'     quantile_norm = FALSE
#' )
#'
#' # View the head of the occupancy table
#' head(loaded_data_genes$occupancy)
#'
#' @export
load_data_genes <- function(
        binding_profiles_path = NULL,
        binding_profiles = NULL,
        drop_samples = NULL,
        quantile_norm = FALSE,
        organism = "drosophila melanogaster",
        calculate_fdr = FALSE,
        fdr_iterations = 50000,
        ensdb_genes = NULL,
        BPPARAM = BiocParallel::bpparam(),
        plot_diagnostics = interactive()) {
    if (is.null(ensdb_genes)) {
        ensdb_genes <- get_ensdb_genes(organism_keyword = organism)$genes
    }
    if (!is(ensdb_genes, "GRanges")) {
        stop("ensdb_genes must be supplied as a GRanges object.")
    }

    binding_profiles_data <- process_binding_profiles(binding_profiles_path, binding_profiles)

    # Drop samples if requested by the user.
    # This must happen before normalisation.
    if (!is.null(drop_samples)) {
        temp_data_list <- list(binding_profiles_data = binding_profiles_data)
        filtered_data <- ._drop_input_samples(temp_data_list, drop_samples)
        binding_profiles_data <- filtered_data$binding_profiles_data
    }

    binding_profiles_data <- apply_quantile_normalisation(binding_profiles_data, quantile_norm)

    # Calculate occupancy over genes
    occupancy <- calculate_occupancy(binding_profiles_data, ensdb_genes, BPPARAM = BPPARAM)

    # Optionally, calculate and add FDR columns
    if (isTRUE(calculate_fdr)) {
        occupancy <- calculate_and_add_fdr(
            binding_data = binding_profiles_data,
            occupancy_df = occupancy,
            fdr_iterations = fdr_iterations,
            BPPARAM = BPPARAM
        )
    }

    result_list <- list(
        binding_profiles_data = binding_profiles_data,
        occupancy = occupancy,
        test_category = "expressed"
    )
    if (isTRUE(plot_diagnostics)) {
        plot_input_diagnostics(result_list)
    }
    return(result_list)
}


#' Build data frames of binding profiles from bedGraph files
#'
#' Each file must be in bedGraph format: chr, start, end, value.
#' @param bedgraphs Character vector of file paths.
#' @return data.frame with merged intervals and all sample columns.
#' @noRd
build_dataframes <- function(bedgraphs) {
    message("Building binding profile dataframe from input files ...")
    if (length(bedgraphs) < 1) stop("No bedGraph files supplied.")

    # Helper function to deal with multiple periods in files
    strip_all_exts_recursive <- function(filepath) {
        lastpath <- ""
        while (filepath != lastpath) {
            lastpath <- filepath
            filepath <- file_path_sans_ext(filepath)
        }
        filepath
    }

    data.df <- NULL
    for (bf in bedgraphs) {
        # Name sample by filename (remove extension)
        sample_name <- basename(strip_all_exts_recursive(bf))
        gr <- import_bedgraph_as_df(bf, colname = sample_name)
        if (is.null(data.df)) {
            data.df <- gr
        } else {
            data.df <- merge(data.df, gr, by = c("chr", "start", "end"), all = FALSE)
        }
        message(" - Loaded: ", sample_name)
    }
    # Order by chromosome and location
    data.df <- data.df[order(data.df$chr, data.df$start), ]
    data.df
}


#' Build a merged binding profile dataframe from a named list of GRanges objects
#'
#' Each GRanges should have exactly one numeric metadata column representing the binding signal.
#' The list must be named, with element names used as sample names.
#'
#' @param gr_list Named list of GRanges objects with binding signal in numeric metadata column.
#' @return data.frame with merged intervals and columns: chr, start, end, sample columns.
#' @noRd
build_dataframes_from_granges <- function(gr_list) {
    if (length(gr_list) == 0) stop("Empty GRanges list supplied to build_dataframes_from_granges.")

    df_list <- lapply(seq_along(gr_list), function(i) {
        gr <- gr_list[[i]]
        sample_name <- names(gr_list)[i]

        if (!is(gr, "GRanges")) {
            stop("Element ", i, " of gr_list is not a GRanges object.")
        }

        # Detect numeric metadata columns
        mcols_gr <- mcols(gr)
        numeric_cols <- names(mcols_gr)[vapply(mcols_gr, is.numeric, FUN.VALUE = logical(1))]

        if (length(numeric_cols) == 0) {
            stop("GRanges object '", sample_name, "' has no numeric metadata column for binding signal.")
        } else if (length(numeric_cols) > 1) {
            stop("GRanges object '", sample_name, "' has multiple numeric metadata columns; please provide exactly one.")
        }
        value_col <- numeric_cols[1]

        df <- data.frame(
            chr = as.character(seqnames(gr)),
            start = start(gr),
            end = end(gr),
            value = mcols_gr[[value_col]],
            stringsAsFactors = FALSE
        )
        colnames(df)[4] <- sample_name
        return(df)
    })

    # Merge profiles
    merged_df <- Reduce(function(x, y) merge(x, y, by = c("chr", "start", "end"), all = FALSE), df_list)

    # Sort and return
    merged_df <- merged_df[order(merged_df$chr, merged_df$start), , drop = FALSE]
    return(merged_df)
}


#' Locate files with optional wildcard expansion
#' @param paths Character vector of directories or patterns (e.g., "/mypath/sample*").
#' @param pattern Optional regex for file extensions (e.g., "\\.gff$").  Case is ignored.
#' @return Character vector of file paths.
#' @noRd
locate_files <- function(paths, pattern = NULL) {
    files <- character()
    for (p in paths) {
        # Expand wildcards
        expanded <- Sys.glob(p)
        # List files in directory, if p is a bare directory
        if (dir.exists(p)) {
            dir_files <- list.files(p, pattern = pattern, full.names = TRUE)
            files <- c(files, dir_files)
        }
        # Check if expanded globs were dirs:
        for (checkexpdir in expanded) {
            if (dir.exists(checkexpdir)) {
                dir_files <- list.files(checkexpdir, pattern, full.names = TRUE)
                files <- c(files, dir_files)
            }
        }
        # Include expanded gene_name
        files <- c(files, expanded[grepl(pattern, expanded, ignore.case = TRUE)])
    }
    files <- unique(files)
    return(files[file.exists(files)])
}

#' Import a GFF file as a GRanges object
#' @param path File path (GFF/GTF)
#' @return GRanges
#' @noRd
import_peaks <- function(path) {
    tryCatch(
        {
            import(path)
        },
        error = function(e) {
            stop(sprintf("Failed to read peaks file '%s':\n%s", path, conditionMessage(e)))
        }
    )
}

#' Import a bedGraph file as a data.frame (chr, start, end, value)
#' @param path File path (bedGraph)
#' @param colname Name of the value column (usually sample name).
#' @return data.frame
#' @noRd
import_bedgraph_as_df <- function(path, colname = "score") {
    gr <- tryCatch(
        {
            import(path, format = "bedGraph")
        },
        error = function(e) {
            stop(sprintf("Failed to read bedGraph file '%s':\n%s", path, conditionMessage(e)))
        }
    )
    df <- as.data.frame(gr)[, c("seqnames", "start", "end", "score")]
    names(df) <- c("chr", "start", "end", colname)

    # Test for gapped offset (caused when loading closed rather than half-open datasets) and correct if present
    gaps <- df$start[-1] - df$end[-nrow(df)]
    tab <- table(gaps)
    if (as.integer(names(which.max(tab))) == 2) {
        # start of fragment(n+1) is always 2bp away from end of fragment(n): needs correcting
        df$end <- df$end + 1
    }

    df
}
