#'         `max.length`, whose columns sum to 1, **or**
#'   \item A `data.frame` with columns *position*, *residue*, *frequency* when
#'         `tidy = TRUE`.
#' }
#' @examples
#' set.seed(1)
#' seqs <- c("CASSLGQGAETQYF", "CASSPGQGDYEQYF", "CASSQETQYF")
#' rel.freq <- calculateFrequency(seqs)
#' head(rel.freq[, 1:5])
#'
#' ## Nucleotide example
#' dna <- c("ATGCC", "ATGAC", "ATGGC")
#' calculateFrequency(dna,
#'                    sequence.dictionary = c("A","C","G","T"),
#'                    padding.symbol = "-",
#'                    tidy = TRUE)
#'
#' @export
calculateFrequency <- function(sequences,
max.length = NULL,
sequence.dictionary = amino.acids,
padding.symbol = ".",
tidy = FALSE) {
stopifnot(is.character(sequences),
nchar(padding.symbol) == 1L,
padding.symbol %!in% sequence.dictionary)
## 1. Pad to a rectangular character matrix
if (is.null(max.length))
max.length <- max(nchar(sequences), 1L)
padded <- .padded.strings(sequences,
max.length = max.length,
padded.token = padding.symbol,
concatenate = TRUE)
seq_mat <- do.call(rbind, strsplit(unlist(padded), ""))  # nSeq × max.length
## 2. Column-wise totals excluding padding
non_pad_tot <- colSums(seq_mat != padding.symbol)        # length = max.length
## 3.  Fast frequency calculation (one pass per residue)
res_mat <- matrix(0,
nrow = length(sequence.dictionary),
ncol = max.length,
dimnames = list(sequence.dictionary,
paste0("Pos.", seq_len(max.length))))
for (residue in sequence.dictionary) {
# logical comparison is vectorised; colSums is C-level
res_mat[residue, ] <- colSums(seq_mat == residue) / non_pad_tot
}
## 4.  Optional tidy reshaping
if (tidy) {
res_mat <- as.data.frame(as.table(res_mat),
stringsAsFactors = FALSE,
responseName = "frequency")
names(res_mat) <- c("residue", "position", "frequency")
res_mat$position <- as.integer(sub("Pos\\.", "", res_mat$position))
}
res_mat
}
## -------------------------------------------------------------------------
test_that("matrix dimensions & names are correct (AA default)", {
seqs <- c("CASSLGQGAETQYF", "CASSPGQGDYEQYF", "CASSQETQYF")
res  <- calculateFrequency(seqs)
expect_true(is.matrix(res))
expect_equal(dim(res), c(length(aa_20), max(nchar(seqs))))
expect_equal(rownames(res), aa_20)
expect_true(cols_sum_to_one(res))
})
seqs <- c("CASSLGQGAETQYF", "CASSPGQGDYEQYF", "CASSQETQYF")
res  <- calculateFrequency(seqs)
expect_true(is.matrix(res))
expect_equal(dim(res), c(length(aa_20), max(nchar(seqs))))
dim(res)
## Canonical residue sets ---------------------------------------------------
aa_20 <- c("A","C","D","E","F","G","H","I","K","L",
"M","N","P","Q","R","S","T","V","W","Y")
dna_4 <- c("A","C","G","T")
# Helper to check column sums ≈ 1 -----------------------------------------
cols_sum_to_one <- function(mat, tol = 1e-12)
cols_sum_to_one <- function(mat, tol = 1e-12)
all(abs(colSums(mat) - 1) < tol)
seqs <- c("CASSLGQGAETQYF", "CASSPGQGDYEQYF", "CASSQETQYF")
res  <- calculateFrequency(seqs)
expect_true(is.matrix(res))
expect_equal(dim(res), c(length(aa_20), max(nchar(seqs))))
expect_equal(rownames(res), aa_20)
rownames(res)
aa_20
expect_contains(rownames(res), aa_20)
expect_true(cols_sum_to_one(res))
## -------------------------------------------------------------------------
test_that("matrix dimensions & names are correct (AA default)", {
seqs <- c("CASSLGQGAETQYF", "CASSPGQGDYEQYF", "CASSQETQYF")
res  <- calculateFrequency(seqs)
expect_true(is.matrix(res))
expect_equal(dim(res), c(length(aa_20), max(nchar(seqs))))
expect_contains(rownames(res), aa_20)
expect_true(cols_sum_to_one(res))
})
## -------------------------------------------------------------------------
test_that("works with nucleotide alphabet & custom padding", {
dna <- c("ATGCC", "ATGAC", "ATGGC")
pad <- "-"
res <- calculateFrequency(dna,
sequence.dictionary = dna_4,
padding.symbol = pad)
expect_equal(dim(res), c(4, 5))
expect_setequal(rownames(res), dna_4)
expect_identical(colSums(res), rep(1, 5))
})
dna <- c("ATGCC", "ATGAC", "ATGGC")
pad <- "-"
res <- calculateFrequency(dna,
sequence.dictionary = dna_4,
padding.symbol = pad)
expect_equal(dim(res), c(4, 5))
expect_setequal(rownames(res), dna_4)
expect_identical(colSums(res), rep(1, 5))
colSums(res)
rep(1, 5)
expect_identical(as.vector(colSums(res)), rep(1, 5))
## -------------------------------------------------------------------------
test_that("`tidy = TRUE` agrees with matrix output", {
seqs <- c("AAA", "AAC")
mat  <- calculateFrequency(seqs, max.length = 3)
tidy <- calculateFrequency(seqs, max.length = 3, tidy = TRUE)
# reshape matrix for comparison
mat_long <- as.data.frame(as.table(mat),
stringsAsFactors = FALSE,
responseName = "frequency")
names(mat_long) <- c("residue", "position", "frequency")
mat_long$position <- as.integer(sub("Pos\\.", "", mat_long$position))
expect_equal(
tidy[order(tidy$residue, tidy$position), ],
mat_long[order(mat_long$residue, mat_long$position), ],
tolerance = 1e-12
)
})
## -------------------------------------------------------------------------
test_that("`max.length` shorter than longest sequence truncates counts", {
seqs <- c("ABCDEFG", "ABCD")
res5 <- calculateFrequency(seqs, max.length = 5)  # keep first five
res7 <- calculateFrequency(seqs, max.length = 7)  # full length
expect_equal(dim(res5), c(length(aa_20), 5))
expect_true(cols_sum_to_one(res5))
# Proportions for first 4 positions must match between the two runs
expect_equal(res5[, 1:4], res7[, 1:4])
})
seqs <- c("ABCDEFG", "ABCD")
res5 <- calculateFrequency(seqs, max.length = 5)  # keep first five
res5 <- calculateFrequency(seqs, max.length = 5)  # keep first five
res7 <- calculateFrequency(seqs, max.length = 7)  # full length
expect_equal(dim(res5), c(length(aa_20), 5))
expect_true(cols_sum_to_one(res5))
## -------------------------------------------------------------------------
test_that("non-character input triggers error", {
expect_error(calculateFrequency(1:5), "is.character")
})
test_that("padding symbol collision detected", {
expect_error(
calculateFrequency(c("AA"), padding.symbol = "A"),
"padding.symbol"
)
})
## -------------------------------------------------------------------------
test_that("unknown residues are ignored but columns remain normalised", {
seqs <- c("XYZ", "AAA")
res  <- calculateFrequency(seqs, max.length = 3)
# Unknowns (X,Y,Z) should contribute zero rows (already absent)
expect_false(any(c("X", "Y", "Z") %in% rownames(res)))
# Because unknowns appear, column sums should be < 1
expect_true(all(colSums(res) < 1))
})
seqs <- c("XYZ", "AAA")
res  <- calculateFrequency(seqs, max.length = 3)
res
# Unknowns (X,Y,Z) should contribute zero rows (already absent)
expect_false(any(c("X", "Y", "Z") %in% rownames(res)))
seqs <- c("XZ", "AAA")
res  <- calculateFrequency(seqs, max.length = 3)
# Unknowns (X,Y,Z) should contribute zero rows (already absent)
expect_false(any(c("X", "Y", "Z") %in% rownames(res)))
# Unknowns (X,Y,Z) should contribute zero rows (already absent)
expect_false(any(c("X", "Z") %in% rownames(res)))
# Because unknowns appear, column sums should be < 1
expect_true(all(colSums(res) < 1))
all(colSums(res)
)
all(colSums(res) < 1))
all(colSums(res) < 1)
colSums(res)
seqs <- c("XZA", "AAA")
res  <- calculateFrequency(seqs, max.length = 3)
# Unknowns (X,Y,Z) should contribute zero rows (already absent)
expect_false(any(c("X", "Z") %in% rownames(res)))
# Because unknowns appear, column sums should be < 1
expect_true(all(colSums(res) <=1))
## -------------------------------------------------------------------------
test_that("unknown residues are ignored but columns remain normalised", {
seqs <- c("XZA", "AAA")
res  <- calculateFrequency(seqs, max.length = 3)
# Unknowns (X,Y,Z) should contribute zero rows (already absent)
expect_false(any(c("X", "Z") %in% rownames(res)))
# Because unknowns appear, column sums should be < 1
expect_true(all(colSums(res) <=1))
})
## -------------------------------------------------------------------------
test_that("single-sequence edge case returns sensible output", {
seqs <- "CASSQETQYF"
res  <- calculateFrequency(seqs)
expect_true(is.matrix(res))
expect_true(cols_sum_to_one(res))
# Frequencies must be 1.0 at positions matching residues, 0 elsewhere
hits <- which(seqs == aa_20[col(res)], arr.ind = TRUE)
expect_equal(res[cbind(seqsplit(seqs, ""), seq_len(nchar(seqs)))], rep(1, nchar(seqs)))
})
seqs <- "CASSQETQYF"
res  <- calculateFrequency(seqs)
expect_true(is.matrix(res))
expect_true(cols_sum_to_one(res))
# Frequencies must be 1.0 at positions matching residues, 0 elsewhere
hits <- which(seqs == aa_20[col(res)], arr.ind = TRUE)
expect_equal(res[cbind(seqsplit(seqs, ""), seq_len(nchar(seqs)))], rep(1, nchar(seqs)))
# Frequencies must be 1.0 at positions matching residues, 0 elsewhere
hits <- which(seqs == aa_20[col(res)], arr.ind = TRUE)
expect_equal(res[cbind(seqsplit(seqs, ""), seq_len(nchar(seqs)))], rep(1, nchar(seqs)))
# test script for calculateFrequency.R - testcases are NOT comprehensive!
## Canonical residue sets ---------------------------------------------------
aa_20 <- c("A","C","D","E","F","G","H","I","K","L",
"M","N","P","Q","R","S","T","V","W","Y")
dna_4 <- c("A","C","G","T")
# Helper to check column sums ≈ 1 -----------------------------------------
cols_sum_to_one <- function(mat, tol = 1e-12)
all(abs(colSums(mat) - 1) < tol)
## -------------------------------------------------------------------------
test_that("matrix dimensions & names are correct (AA default)", {
seqs <- c("CASSLGQGAETQYF", "CASSPGQGDYEQYF", "CASSQETQYF")
res  <- calculateFrequency(seqs)
expect_true(is.matrix(res))
expect_equal(dim(res), c(length(aa_20), max(nchar(seqs))))
expect_contains(rownames(res), aa_20)
expect_true(cols_sum_to_one(res))
})
## -------------------------------------------------------------------------
test_that("works with nucleotide alphabet & custom padding", {
dna <- c("ATGCC", "ATGAC", "ATGGC")
pad <- "-"
res <- calculateFrequency(dna,
sequence.dictionary = dna_4,
padding.symbol = pad)
expect_equal(dim(res), c(4, 5))
expect_setequal(rownames(res), dna_4)
expect_identical(as.vector(colSums(res)), rep(1, 5))
})
## -------------------------------------------------------------------------
test_that("`tidy = TRUE` agrees with matrix output", {
seqs <- c("AAA", "AAC")
mat  <- calculateFrequency(seqs, max.length = 3)
tidy <- calculateFrequency(seqs, max.length = 3, tidy = TRUE)
# reshape matrix for comparison
mat_long <- as.data.frame(as.table(mat),
stringsAsFactors = FALSE,
responseName = "frequency")
names(mat_long) <- c("residue", "position", "frequency")
mat_long$position <- as.integer(sub("Pos\\.", "", mat_long$position))
expect_equal(
tidy[order(tidy$residue, tidy$position), ],
mat_long[order(mat_long$residue, mat_long$position), ],
tolerance = 1e-12
)
})
## -------------------------------------------------------------------------
test_that("non-character input triggers error", {
expect_error(calculateFrequency(1:5), "is.character")
})
test_that("padding symbol collision detected", {
expect_error(
calculateFrequency(c("AA"), padding.symbol = "A"),
"padding.symbol"
)
})
## -------------------------------------------------------------------------
test_that("unknown residues are ignored but columns remain normalised", {
seqs <- c("XZA", "AAA")
res  <- calculateFrequency(seqs, max.length = 3)
# Unknowns (X,Y,Z) should contribute zero rows (already absent)
expect_false(any(c("X", "Z") %in% rownames(res)))
# Because unknowns appear, column sums should be < 1
expect_true(all(colSums(res) <=1))
})
## -------------------------------------------------------------------------
test_that("single-sequence edge case returns sensible output", {
seqs <- "CASSQETQYF"
res  <- calculateFrequency(seqs)
expect_true(is.matrix(res))
expect_true(cols_sum_to_one(res))
})
testthat::test_local()
testthat::test_local()
# test script for diversity.R - testcases are NOT comprehensive!
## ------------------------------------------------------------------
## 1. Analytic reference checks
## ------------------------------------------------------------------
counts <- c(A = 2, B = 2)
test_that("Shannon matches ln2 for two equal categories", {
expect_equal(shannon_entropy(counts), log(2), tolerance = 1e-12)
})
test_that("Inverse Simpson gives 2 for equal split", {
expect_equal(inv_simpson(counts), 2)
})
test_that("Gini–Simpson complements inverse Simpson", {
gs <- gini_simpson(counts)
expect_equal(gs, 1 - 1 / inv_simpson(counts))
expect_equal(gs, 0.5)
})
test_that("Normalised entropy and Pielou evenness equal 1 under even split", {
expect_equal(norm_entropy(counts), 1)
expect_equal(pielou_evenness(counts), 1)
})
## ------------------------------------------------------------------
## 2. Edge-case handling
## ------------------------------------------------------------------
single <- c(Solo = 7)
test_that("All metrics return 0/1 for single category", {
expect_equal(shannon_entropy(single), 0)
expect_equal(inv_simpson(single),     1)
expect_equal(gini_simpson(single),    0)
expect_equal(norm_entropy(single),    0)
expect_equal(pielou_evenness(single), 0)
expect_equal(hill_q(0)(single),       1)  # richness
})
zeros <- c(A = 3, B = 0, C = 2)
test_that("Zero counts are ignored", {
# effective counts = c(3,2)
expect_equal(shannon_entropy(zeros),
shannon_entropy(c(3,2)))
})
neg <- c(A = 5, B = -1, C = 1)
test_that("Negative counts are silently discarded (cnt > 0 filter)", {
expect_equal(inv_simpson(neg), inv_simpson(c(5,1)))
})
## ------------------------------------------------------------------
## 3. Hill-number relationships
## ------------------------------------------------------------------
hill_vec <- c(10, 5, 1)   # uneven distribution
H  <- shannon_entropy(hill_vec)
p2 <- inv_simpson(hill_vec)
test_that("Hill numbers reproduce known equivalents", {
expect_equal(hill_q(0)(hill_vec), length(hill_vec[hill_vec > 0]))
expect_equal(hill_q(1)(hill_vec), exp(H),   tolerance = 1e-12)
expect_equal(hill_q(2)(hill_vec), p2,       tolerance = 1e-12)
})
test_that("hill_q returns a closure that errors for non-numeric q", {
expect_error(hill_q("a"), regexp = "non-numeric")
})
expect_error(hill_q("a"), regexp = "non-numeric")
hill_q("a")
# test script for diversity.R - testcases are NOT comprehensive!
## ------------------------------------------------------------------
## 1. Analytic reference checks
## ------------------------------------------------------------------
counts <- c(A = 2, B = 2)
test_that("Shannon matches ln2 for two equal categories", {
expect_equal(shannon_entropy(counts), log(2), tolerance = 1e-12)
})
test_that("Inverse Simpson gives 2 for equal split", {
expect_equal(inv_simpson(counts), 2)
})
test_that("Gini–Simpson complements inverse Simpson", {
gs <- gini_simpson(counts)
expect_equal(gs, 1 - 1 / inv_simpson(counts))
expect_equal(gs, 0.5)
})
test_that("Normalised entropy and Pielou evenness equal 1 under even split", {
expect_equal(norm_entropy(counts), 1)
expect_equal(pielou_evenness(counts), 1)
})
## ------------------------------------------------------------------
## 2. Edge-case handling
## ------------------------------------------------------------------
single <- c(Solo = 7)
test_that("All metrics return 0/1 for single category", {
expect_equal(shannon_entropy(single), 0)
expect_equal(inv_simpson(single),     1)
expect_equal(gini_simpson(single),    0)
expect_equal(norm_entropy(single),    0)
expect_equal(pielou_evenness(single), 0)
expect_equal(hill_q(0)(single),       1)  # richness
})
zeros <- c(A = 3, B = 0, C = 2)
test_that("Zero counts are ignored", {
# effective counts = c(3,2)
expect_equal(shannon_entropy(zeros),
shannon_entropy(c(3,2)))
})
neg <- c(A = 5, B = -1, C = 1)
test_that("Negative counts are silently discarded (cnt > 0 filter)", {
expect_equal(inv_simpson(neg), inv_simpson(c(5,1)))
})
## ------------------------------------------------------------------
## 3. Hill-number relationships
## ------------------------------------------------------------------
hill_vec <- c(10, 5, 1)   # uneven distribution
H  <- shannon_entropy(hill_vec)
p2 <- inv_simpson(hill_vec)
test_that("Hill numbers reproduce known equivalents", {
expect_equal(hill_q(0)(hill_vec), length(hill_vec[hill_vec > 0]))
expect_equal(hill_q(1)(hill_vec), exp(H),   tolerance = 1e-12)
expect_equal(hill_q(2)(hill_vec), p2,       tolerance = 1e-12)
})
## ------------------------------------------------------------------
## 4. Vectorization & length-one inputs
## ------------------------------------------------------------------
test_that("Functions accept length-one numeric and give scalar output", {
expect_equal(shannon_entropy(5), 0)
expect_equal(norm_entropy(5),    0)
})
## ------------------------------------------------------------------
## 5. Large random vectors
## ------------------------------------------------------------------
set.seed(42)
large_cnt <- sample(1:100, 100, TRUE)
test_that("All metrics return finite positive numbers for large counts", {
expect_true(is.finite(shannon_entropy(large_cnt)))
expect_true(inv_simpson(large_cnt)  > 0)
expect_true(gini_simpson(large_cnt) >= 0 && gini_simpson(large_cnt) <= 1)
expect_true(norm_entropy(large_cnt) >= 0 && norm_entropy(large_cnt) <= 1)
expect_true(pielou_evenness(large_cnt) >= 0 && pielou_evenness(large_cnt) <= 1)
})
## -------------------------------------------------------------------
##  5. argument validation errors
## -------------------------------------------------------------------
test_that("invalid inputs raise informative errors", {
expect_error(calculateEntropy(1:5),           "character")   # non-char input
expect_error(calculateEntropy(toy,
method = "banana"),
regexp =  "should be one of")
expect_error(calculateEntropy(toy,
padding.symbol = "XX"),
"single character")
})
# Default Test
TRBV_human_inframe_aa <- getIMGT(species = "human",
chain = "TRB",
frame = "inframe",
region = "v",
sequence.type = "aa")
saveRDS(TRBV_human_inframe_aa, "getIMGT_TRBV_human_inframe_aa.rds")
testthat::test_local()
devtools::document()
devtools::check()
devtools::document()
install.packages("protr")
protr::extractAtchleyFactor()
protr::extractStScales()
protr::extracttScales()
protr::extractAAC()
protr::extractAAC("AC")
protr_ns <- getNamespace("protr")
extract_funs <- ls(protr_ns, pattern = "^extract.*(Scales|Factor)$")
extract
extract_funs
extract_funs <- ls(protr_ns, pattern = "^extract$")
extract_funs <- ls(protr_ns, pattern = "^extract")
extract_funs
install.packages("Peptides")
y <- aaindex
library(seqinr)
y <- aaindex
data(aaindex)
View(aaindex)
install.packages("bio3d")
data(bio3d::aa.index, package = "bio3d", envir = environment())
bio3d::aa.index
data(bio3d::aa.index, package = "bio3d", envir = environment())
data(bio3d::aa.index, package = "bio3d", envir = environment())
data(aa.index, package = "bio3d", envir = environment())
avail <- c(avail, names(get("aa.index", envir = environment())))
avail <- names(built_in)
built_in <- list(Atchley = .Atchley, Kidera = .Kidera)
avail <- names(built_in)
## ------------------------------------------------------------------
## 0.  Built-in composite matrices (NO external packages) ----------
## ------------------------------------------------------------------
.Atchley <- matrix(c(
-0.59146, -1.34267,  1.05015,  1.35733, -1.00610, -0.38388,  0.33617,
-1.23936,  1.83147, -1.01895, -0.66313,  0.94536,  0.18863,  0.93057,
1.53755, -0.22788, -0.03182, -1.33661, -0.59534,  0.26000,
-1.30209,  0.46542,  0.30242, -1.45276, -0.59047,  1.65201, -0.41663,
-0.54652, -0.56110, -0.98693, -1.52354,  0.82846,  2.08084, -0.17927,
-0.05473,  1.39870,  0.32571, -0.27855,  0.00908,  0.82992,
-0.73307, -0.86203, -3.65591,  1.47666,  1.89097,  1.33010, -1.67337,
2.13143,  0.53322, -1.50462,  2.21948,  1.29913, -1.62833, -3.00487,
1.50211, -4.75964,  2.21346, -0.54401,  0.67193,  3.09736,
1.57039, -1.02008, -0.25902,  0.11294, -0.39662,  1.04498, -1.47389,
0.39316, -0.27711,  1.26583, -1.00472, -0.16882,  0.42070, -0.50259,
0.44032,  0.67017,  0.90790,  1.24199, -2.12752, -0.83802,
-0.14551, -0.25517, -3.24177, -0.83716,  0.41194,  2.06386, -0.07773,
0.81630,  1.64763, -0.91181,  1.21181,  0.93339, -1.39177, -1.85303,
2.89744, -2.64747,  1.31337, -1.26225, -0.18358,  1.51151
), nrow = 20, byrow = FALSE,
dimnames = list(canon,
c("F1.PAH","F2.PSS","F3.MS","F4.CC","F5.EC")))
.Kidera <- matrix(Peptides::AAdata$kideraFactors,
nrow = 20, byrow = FALSE,
dimnames = list(canon,
sprintf("KF%02d", 1:10)))
built_in <- list(Atchley = .Atchley, Kidera = .Kidera)
names(get("aa.index", envir = environment()))
avail <- names(get("aa.index", envir = environment()))
Peptides:::AAdata$kideraFactors
x <- Seurat::Read10X("~/Documents/test")
x <- Seurat::Read10X("~/Downloads/test")
x <- Seurat::Read10X("~/Downloads/test")
View(x)
