% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/parseDoc.R
\name{parseDoc}
\alias{parseDoc}
\title{Parse a CSV document and place its content in a DocSet}
\usage{
parseDoc(csv, DocSetInstance = new("DocSet"), doctitle = NA_character_,
  docabst = NA_character_, rec_id_field = "experiment.accession",
  exclude_fields = c("study.accession"),
  substrings_to_omit = c("http://purl.obolibrary.org/obo/"),
  patterns_to_kill = "....-..-..|.*...,...",
  token_fixups = list(c("t''", "t'"), c(":$", "")), max_tok_nchar = 25,
  min_tok_nchar = 4, cleanFields = list("..*id$", ".name$", "_name$",
  "checksum", "isolate", "filename", "^ID$", "barcode", "Sample.Name"))
}
\arguments{
\item{csv}{a character(1) CSV file path}

\item{DocSetInstance}{if missing, DocSet is initialized in this
function, otherwise the instance is updated with new content}

\item{doctitle}{character(1) document title}

\item{docabst}{character(1) a string: the document abstract}

\item{rec_id_field}{character(1) field in CSV identifying records}

\item{exclude_fields}{character vector of fields to ignore while parsing}

\item{substrings_to_omit}{character vector of strings to remove from candidate keywords via gsub}

\item{patterns_to_kill}{character(1) regexp that identifies tokens to be omitted from keyword set}

\item{token_fixups}{a list of character(2) vectors that will be
used with gsub() to repair irregularities.  For
example `c("t''", "t'")` will transform `Burkitt''s` to `Burkitt's`}

\item{max_tok_nchar}{numeric(1) defaults to 25, tokens with more characters will be truncated to this length and suffixed with ellipsis}

\item{min_tok_nchar}{numeric(1) defaults to 4, tokens shorter than this are omitted from the index}

\item{cleanFields}{list of regular expressions identifying fields to ignore}
}
\value{
instance of DocSet
}
\description{
Parse a CSV document and place its content in a DocSet.
}
\note{
The expected use case has `DocSetInstance` being updated in a loop.
Sharing of environments across multiple DocSetInstances can occur and unexpected
behaviors may ensue.  Note also that many of the parameter defaults to parseDoc are
for the use case of processing SRA metadata.
}
\examples{
# Start from the docset_cancer68 object shipped with ssrch and pull out
# the first document it lists.
myob = ssrch::docset_cancer68
alld = ls(docs2kw(myob))
r1 = retrieve_doc(alld[1], myob)
# Build the CSV path once; write.csv is called only for its side effect
# (it returns NULL, so there is nothing useful to assign).
csvpath = file.path(tempdir(), "expo.csv")
write.csv(r1, csvpath)
# Parse the CSV into a fresh DocSet, attaching a title and a dummy abstract.
pd = parseDoc(csvpath, doctitle=ssrch::titles68[alld[1]],
    docabst="qwerty")
pd
searchDocs("quer", pd) # query will fail
searchDocs("qwer", pd) # should succeed
}
