% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/make_conv.R
\name{make_conv}
\alias{make_conv}
\title{Create reference genome conversion table}
\usage{
make_conv(
  reference_list = NULL,
  ref_path_A = NULL,
  ref_path_B = NULL,
  ref_path_C = NULL,
  output = "tibble",
  skip_after = NULL
)
}
\arguments{
\item{reference_list}{List containing 2-3 file paths (character strings) to
references in fasta format. The names of the list items will be imported
into the final conversion table. The function may also take each reference
path as single character strings (ref_path_A, ref_path_B, ref_path_C), but
then any user-defined reference name will be lost (see example below).}

\item{ref_path_A}{Character string with path to 1st fasta reference (leading)}

\item{ref_path_B}{Character string with path to 2nd fasta reference}

\item{ref_path_C}{Character string with path to 3rd fasta reference}

\item{output}{Character indicating what type of output to provide. As default
output="tibble" will result in a table in the tibble format as described in
tibble/tidyverse packages. Anything else will result in a data frame.}

\item{skip_after}{Character or named list with character strings. If a
character string, only the part of the seqnames prior to that string will
be returned. Thus, if skip_after=" ", seqnames will be trimmed after first
white space. If a list of character strings, then each string will will be
applied to each provided reference. Thus, if skip_after=list(ensembl=" ",
ucsc="," , ncbi=";"), then seqnames will be trimmed at white space for the
reference named "ensembl", and at "," for the reference named "ucsc", etc.
If skip_after=NULL then the full seqnames are returned.}
}
\value{
A name conversion table either as a tibble or a data frame (see
  output)
}
\description{
Generates a table that can be used for converting seqnames across referens
genomes.
}
\details{
Given the file paths to 2-3 fasta reference genomes, this function will
quickly match the the sequences between these reference genomes and return
seqnames of each matching sequence. Thus, given the paths to the fasta for
the Ensembl, UCSC and NCBI assemblies of a given genome version (e.g. hg38 or
dm6), will result in a name conversion table between these databases. 

Only perfect matches or no matches will be reported. Thus, in case an entire
sequence is missing (e.g. a sex chromosomes), a table will be returned with a
warning, but if a sequence is partly missing (e.g. only half a chromosome),
then an error is returned. 

Sequence matching is done using md5 hashes, which dramatically increases the
speed for perfect matches. Matching will always be done from reference A
against the other references. Thus, if reference A (=1st reference in
reference_list) contains less sequences than the other references, only the
reference A sequences will be reported in the output, having the same order
as in reference A.
}
\examples{

## Only for testing:
fasta_path <- system.file("extdata/trna", "tRNA.fa", 
                          package = "seqpac", mustWork = TRUE) 
ref1 <- Biostrings::readDNAStringSet(fasta_path) 
ref1 <- ref1[1:295]
sqnames <- do.call("rbind",(strsplit(names(ref1), "\\\\d chr")))[,2]
names(ref1) <- do.call("rbind",(strsplit(sqnames, " \\\\(")))[,1]
logi_dup <- duplicated(do.call("rbind", strsplit(names(ref1),"\\\\:"))[,1])
ref1 <- ref1[!logi_dup]
ref2 <- ref1
names(ref2) <- paste0("chr", names(ref2))
# Save new reference in temporary folder 
if(grepl("windows", .Platform$OS.type)){
  tmpdr <- paste0(tempdir(), "\\\\seqpac")
}else{
  tmpdr <- paste0(tempdir(), "/seqpac")}
dir.create(tmpdr, showWarnings=FALSE) 
ref_path1 <- paste0(tmpdr, "/ref1.fa")
ref_path2 <- paste0(tmpdr, "/ref2.fa") 
Biostrings::writeXStringSet(ref1, filepath=ref_path1, format="fasta")
Biostrings::writeXStringSet(ref2, filepath=ref_path2, format="fasta")
ref_list <- list(ensembl=ref_path1, ucsc=ref_path2)
conv_table <- make_conv(reference_list=ref_list)
conv_table
              

## The principles:
#
#ref_path_A <- "/some/path/to/ensembl.fa"
#ref_path_B <- "/some/path/to/ucsc.fa"
#ref_path_C <- "/some/path/to/refseq.fa"
#
#ref_list <- list(ensembl=ref_path_A, UCSC=ref_path_B, NCBI=ref_path_C)
#
## Best (user defined names):
#conv_table <- make_conv(reference_list=ref_list) 
#
## But also (no names)
#conv_table <- make_conv(ref_path_A, ref_path_B, ref_path_C)
#conv_table <- make_conv(ref_path_A, ref_path_C)
#
#
## Make short names (skip everything after white space)
#conv_table <- make_conv(reference_list=reference_list, skip_after=" ") 

}
\seealso{
\url{https://github.com/OestLab/seqpac} for updates.

Other PAC reannotation: 
\code{\link{PAC_annotate}()},
\code{\link{add_reanno}()},
\code{\link{as.reanno}()},
\code{\link{import_reanno}()},
\code{\link{make_reanno}()},
\code{\link{map_reanno}()},
\code{\link{simplify_reanno}()}
}
\concept{PAC reannotation}
