#' Import a VCF file and extract read count
#' @description
#' Import a VCF file and extract read-count and variant allele frequencies. Currently VCF files generated by `mutect2`, `strelka2` and `dkfz` are supported.
#'
#' @param vcf Input indexed VCF file.
#' @param ignore.XY Ignore allosomes. Default TRUE
#' @param t.sample Sample name for tumor. Must be same as in VCF. Strelka hardcodes tumor sample name to "TUMOR"
#' @param vcf.source Tool used for generating VCF file. Can be `strelka` or `mutect` or `dkfz` or `sentieon`
#' @param min.vaf Remove variants with vcf below threshold. Default 0.01
#' @param min.depth Minimum required depth for a variant to be considered. Default 30.
#' @param info.af The string encoding the allele frequency field in the FORMAT column. Defaults to `AF`and will be ignored if `vcf.source` != `sentieon`.
#' @param info.dp The string encoding the read depth field in the FORMAT column. Defaults to `DP`and will be ignored if `vcf.source` != `sentieon`.
#' @param filter.value The FILTER column value for variants that passed the filtering, defaults to PASS
#' @param filter.biallelic Remove biallelic variants. Default TRUE
#' @param filter.indels Remove indels. Default TRUE
#' @param ... further arguments and parameters passed to other
#' LACHESIS functions.
#' @examples
#' mutect_vcf <- system.file("extdata", "mutect.somatic.vcf.gz", package = "LACHESIS")
#' m_data <- readVCF(vcf = mutect_vcf, vcf.source = "mutect", filter.value = ".")
#' strelka_vcf <- system.file("extdata", "strelka2.somatic.snvs.vcf.gz", package = "LACHESIS")
#' s_data <- readVCF(vcf = strelka_vcf, vcf.source = "strelka")
#' dkfz_vcf <- system.file("extdata", "NBE15", "snvs_NBE15_somatic_snvs_conf_8_to_10.vcf", package = "LACHESIS")
#' d_data <- readVCF(vcf = dkfz_vcf, vcf.source = "dkfz")
#' @import data.table vcfR
#' @return a data.table with chrom, pos, ref, alt, t_ref_count, t_alt_count, t_depth, t_vaf
#' @export

readVCF <- function(vcf = NULL, ignore.XY = TRUE, vcf.source = "strelka",
                    min.vaf = 0.01, min.depth = 30, t.sample = NULL,
                    info.af = "AF", info.dp = "DP", filter.value = "PASS",
                    filter.biallelic = TRUE, filter.indels = TRUE, ...) {
    chrom <- t_vaf <- t_depth <- . <- pos <- ref <- alt <- t_ref_count <-
        t_alt_count <- NULL

    if (is.null(vcf)) {
        stop("Missing input VCF file!")
    }

    vcf.sources <- c("strelka", "mutect", "dkfz", "sentieon")
    vcf.source <- match.arg(
        arg = vcf.source, choices = vcf.sources,
        several.ok = FALSE
    )

    if (vcf.source == "dkfz") {
        dt <- .parse_dkfz(FNAME = vcf)

        if (is.null(t.sample)) {
            t.sample <- unlist(data.table::tstrsplit(
                x = basename(vcf),
                split = "snvs", keep = 2
            ))
            t.sample <- gsub(
                x = t.sample, pattern = "^_|_somatic_$|_$",
                replacement = ""
            )
            message("Assuming ", t.sample, " as tumor")
        }
    } else {
        if (is.null(t.sample)) {
            t.sample <- .get_t_SM(vcf = vcf)
            message("Assuming ", t.sample, " as tumor")
        } else {
            vcf.cols <- .get_vcf_cols(vcf = vcf)
            if (!any(grepl(t.sample, vcf.cols))) {
                t.sample <- .get_t_SM(vcf = vcf)
                message("Assuming ", t.sample, " as tumor")
            } else {
                t.sample <- vcf.cols[grepl(t.sample, vcf.cols)]
            }
        }

        message("Importing VCF..")
        v <- vcfR::read.vcfR(file = vcf, verbose = FALSE)
        message("Total variants         : ", nrow(v@fix))

        if (filter.value == ".") { # vcfR converts "." to NA
            v <- v[is.na(getFILTER(v)), ]
        } else {
            v <- v[getFILTER(v) == filter.value, ]
        }

        if (nrow(v@fix) == 0) {
            stop("No variants passed filtering!")
        }
        message("Variants passing filter: ", nrow(v@fix))

        if (filter.biallelic) {
            v <- v[vcfR::is.biallelic(x = v)] # Only keep biallelic variants
            if (nrow(v) == 0) {
                stop("No bi-allelic variants found!")
            }
            message("Bi-allelic variants    : ", nrow(v@fix))
        }

        if (filter.indels) {
            v <- v[!vcfR::is.indel(v)] # Remove INDELS (only SNVs)
            if (nrow(v) == 0) {
                stop("No single nucelotide variants found!")
            }
            message("single nucl. variants  : ", nrow(v@fix))
        }

        # convert vcf data to a data.frame
        if ("FORMAT" %in% colnames(v@gt)) {
            tum_format <- v@gt[, t.sample]
            vcf_df <- as.data.frame(data.table::tstrsplit(tum_format, split = ":"))
            colnames(vcf_df) <- unlist(data.table::tstrsplit(
                x =
                    v@gt[1, "FORMAT"],
                split = ":"
            ))
        } else if ("INFO" %in% colnames(v@fix)) {
            info_column <- vcfR::getINFO(v)
            vcf_df <- as.data.frame(data.table::tstrsplit(info_column,
                split = ";",
                type.convert = TRUE
            ))
            colnames(vcf_df) <- vapply(
                vcf_df[1, ],
                function(x) strsplit(x, "=")[[1]][1],
                character(1)
            )
            vcf_df <- as.data.frame(lapply(vcf_df, function(col) {
                vapply(
                    col, function(x) ifelse(grepl("=", x), sub(".*?=", "", x), NA),
                    character(1)
                )
            }), stringsAsFactors = FALSE)
        } else {
            stop("Please provide vcf file with FORMAT or INFO column.")
        }

        # Make a df of all necessary columns
        dt <- data.table::data.table(
            chrom = vcfR::getCHROM(v),
            pos = vcfR::getPOS(v), ref = vcfR::getREF(v),
            alt = vcfR::getALT(v)
        )
        dt <- cbind(dt, vcf_df)

        # Parse FORMAT field and get vaf, etc
        dt <- .get_depth_dt(d = dt, source = vcf.source)
    }

    # remove chr prefix
    if (grepl(pattern = "^chr", x = dt$chrom[1])) {
        dt$chrom <- gsub(pattern = "^chr", replacement = "", x = dt$chrom)
    }

    # Only analyze primary contigs (either with or without the chr prefix)
    primary_contigs <- c(seq_len(22), c("X", "Y"))
    dt <- dt[chrom %in% primary_contigs]
    message("Primary contig vars.   : ", nrow(dt))

    # Remove X and Y contigs
    if (ignore.XY) {
        dt <- dt[!chrom %in% c("chrX", "chrY", "X", "Y")]
        message("Autosomal variants     : ", nrow(dt))
    }

    # Filter for VAF and depth. Return only necessary columns
    message("Filtering for min.depth and VAF..")
    dt <- dt[t_vaf >= min.vaf][t_depth >= min.depth][, .(
        chrom, pos, ref,
        alt, t_ref_count,
        t_alt_count, t_depth,
        t_vaf
    )]
    data.table::setattr(x = dt, name = "t.sample", value = t.sample) # Add sample name as an attribute
    dt
}

.get_depth_dt <- function(d, source = "strelka") {
    AU <- CU <- GU <- TU <- t_vaf <- A <- t_depth <- t_ref_count <-
        t_alt_count <- G <- C <- chrom <- pos <- ref <- alt <- t_depth <-
        t_ref_count <- t_alt_count <- t_vaf <- . <- NULL

    if (source == "strelka") {
        d_dp <- apply(X = d[, .(AU, CU, GU, TU)], 2, function(x) {
            as.numeric(unlist(data.table::tstrsplit(x = x, split = ",", keep = 1)))
        })
        colnames(d_dp) <- gsub(pattern = "U$", replacement = "", x = colnames(d_dp))
        d_dp <- as.data.frame(d_dp)
        d_dp$t_depth <- rowSums(d_dp)
        d <- cbind(d, d_dp)

        # Strelka doesnt provide VAF or ref/alt count. Instead contains A,T,G,C depth
        # Match with alt allele and estimate VAF
        d <- split(d, d$alt)

        d <- lapply(seq_along(d), function(alt_idx) {
            alt_base <- names(d)[alt_idx]
            dalt <- d[[alt_idx]]

            if (alt_base == "A") {
                dalt[, t_vaf := A / t_depth]
                dalt[, t_ref_count := t_depth - A]
                dalt[, t_alt_count := A]
            } else if (alt_base == "T") {
                dalt[, t_vaf := `T` / t_depth]
                dalt[, t_ref_count := t_depth - `T`]
                dalt[, t_alt_count := `T`]
            } else if (alt_base == "G") {
                dalt[, t_vaf := G / t_depth]
                dalt[, t_ref_count := t_depth - G]
                dalt[, t_alt_count := G]
            } else if (alt_base == "C") {
                dalt[, t_vaf := C / t_depth]
                dalt[, t_ref_count := t_depth - C]
                dalt[, t_alt_count := C]
            }
            dalt
        })
        d <- data.table::rbindlist(l = d, use.names = TRUE, fill = TRUE)
    } else if (source == "mutect") {
        d_ad <- as.data.frame(data.table::tstrsplit(d$AD, split = ","))
        colnames(d_ad) <- c("t_ref_count", "t_alt_count")
        d_ad <- as.data.frame(apply(d_ad, 2, as.numeric))
        d_ad$t_depth <- rowSums(d_ad)
        d_ad$t_vaf <- d_ad$t_alt_count / d_ad$t_depth
        d <- cbind(d, d_ad)
    } else if (source == "sentieon") {
        d_ad <- data.frame(t_depth = d$DP, t_vaf = d$AF)
        d_ad <- as.data.frame(apply(d_ad, 2, as.numeric))
        d_ad$t_ref_count <- round(d_ad$t_depth * (1 - d_ad$t_vaf))
        d_ad$t_alt_count <- round(d_ad$t_depth * d_ad$t_vaf)
        d <- cbind(d, d_ad)
    } else {
        stop("Unknown format!")
    }

    d <- d[, .(chrom, pos, ref, alt, t_depth, t_ref_count, t_alt_count, t_vaf)]
    return(d[order(chrom, pos)])
}


# retrieve tumor sample ID
.get_t_SM <- function(vcf) {
    temp <- data.table::fread(file = vcf, skip = "#CHROM", nrows = 1)
    stdcols <- c(
        "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER",
        "INFO", "FORMAT"
    )
    sm_ids <- setdiff(colnames(temp), stdcols)
    # In case matched normal is used, second entry will always be the tumor sample, if not first sample is assumed to be tumor
    ifelse(length(sm_ids) > 1, yes = sm_ids[2], no = sm_ids[1])
}

# retrieve available column names in vcf file
.get_vcf_cols <- function(vcf) {
    temp <- data.table::fread(file = vcf, skip = "#CHROM", nrows = 1)
    stdcols <- c(
        "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER",
        "INFO", "FORMAT"
    )
    sm_ids <- setdiff(colnames(temp), stdcols)
    return(sm_ids)
}

# Parse DKFZ weird data format
.parse_dkfz <- function(FNAME) {
    . <- Chr <- Start <- Ref <- Alt <- t_ref_count <- t_alt_count <-
        t_depth <- t_vaf <- NULL

    v <- data.table::fread(
        file = FNAME, fill = TRUE, sep = "\t",
        header = TRUE, skip = "CHROM"
    )
    v <- v[, c(1, 2, 4, 5, 8)]
    colnames(v) <- c("Chr", "Start", "Ref", "Alt", "info_t")

    # Parse depth and vaf info from 6th column (tumor)
    tum_info_dt <- lapply(v$info_t, function(info_tum) {
        info_tum_spl <- data.table::tstrsplit(x = info_tum, split = ";")
        names(info_tum_spl) <- unlist(lapply(
            info_tum_spl,
            function(y) {
                data.table::tstrsplit(
                    y,
                    split = "=", keep = 1
                )
            }
        ))
        info_tum <- unlist(lapply(info_tum_spl, function(y) {
            data.table::tstrsplit(
                y,
                split = "=", keep = 2
            )
        }))

        t_dp4 <- as.numeric(unlist(data.table::tstrsplit(info_tum[c("DP4")],
            split = ","
        )))
        data.table::data.table(
            t_depth = sum(t_dp4),
            t_ref_count = sum(t_dp4[c(1, 2)]),
            t_alt_count = sum(t_dp4[c(3, 4)]),
            t_vaf = sum(t_dp4[c(3, 4)]) / sum(t_dp4)
        )
    })
    tum_info_dt <- data.table::rbindlist(tum_info_dt)

    v <- cbind(v, tum_info_dt)

    v <- v[, .(Chr, Start, Ref, Alt, t_ref_count, t_alt_count, t_depth, t_vaf)]
    v[, Chr := as.character(Chr)]
    v[, Start := as.numeric(Start)]
    colnames(v) <- c(
        "chrom", "pos", "ref", "alt", "t_ref_count", "t_alt_count",
        "t_depth", "t_vaf"
    )
    v
}
