% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/prepare_data.R
\name{prepare_data}
\alias{prepare_data}
\alias{prepare_data.data.frame}
\alias{prepare_data.character}
\title{Prepare data for corpus exploration}
\usage{
prepare_data(dataset, ...)

\method{prepare_data}{data.frame}(
  dataset,
  date_based_corpus = TRUE,
  grouping_variable = NULL,
  within_group_identifier = "Seq",
  columns_doc_info = c("Date", "Title", "URL"),
  corpus_name = NULL,
  use_matrix = TRUE,
  matrix_without_punctuation = TRUE,
  tile_length_range = c(1, 10),
  columns_for_ui_checkboxes = NULL,
  ...
)

\method{prepare_data}{character}(
  dataset,
  corpus_name = NULL,
  use_matrix = TRUE,
  matrix_without_punctuation = TRUE,
  ...
)
}
\arguments{
\item{dataset}{Object to convert to corporaexplorerobject:
  \itemize{
  \item A data frame with a column "Text" (class
  character), and optionally other columns.
  If \code{date_based_corpus} is \code{TRUE} (the default),
  \code{dataset} must contain a column "Date" (of class Date).
  \item Or a non-empty character vector.
}}

\item{...}{Other arguments to be passed to \code{prepare_data}.}

\item{date_based_corpus}{Logical. Set to \code{FALSE} if the corpus
is not to be organised according to document dates.}

\item{grouping_variable}{Character string.
If \code{date_based_corpus} is \code{TRUE}, this argument is ignored.
If \code{date_based_corpus} is \code{FALSE}, this argument can be used
to group the documents, e.g. if \code{dataset} is organised by chapters
belonging to different books.}

\item{within_group_identifier}{Character string indicating column name in \code{dataset}.
\code{"Seq"}, the default, means the rows in each group are assigned
a numeric sequence 1:n where n is the number of rows in the group.
Used in document tab title in non-date based corpora.
If \code{date_based_corpus} is \code{TRUE}, this argument is ignored.}

\item{columns_doc_info}{Character vector. The columns from \code{dataset} to display in
the "document information" tab in the corpus exploration app. By default
"Date", "Title" and "URL" will be
displayed, if included. If \code{columns_doc_info} includes a column which is not
present in dataset, it will be ignored.}

\item{corpus_name}{Character string with name of corpus.}

\item{use_matrix}{Logical. Should the function create a document term matrix
for fast searching? If \code{TRUE}, data preparation will run longer and demand
more memory. If \code{FALSE}, the returned corporaexplorerobject will be more lightweight, but
searching will be slower.}

\item{matrix_without_punctuation}{Should punctuation and digits be stripped
  from the text before constructing the document term matrix? If \code{TRUE},
  the default:
\itemize{
    \item The corporaexplorer object will be lighter and most searches in
    the corpus exploration app will be faster.
    \item Searches including punctuation and digits will be carried out in
    the full text documents.
    \item The only "risk" with this strategy is that the corpus exploration
    app in some cases can produce false positives. E.g. searching for the
    term "donkey" will also find the term "don\%key".
This should not be a problem for the vast majority of use cases, but if
one so desires, there are three different solutions: set this parameter to
\code{FALSE}, create a corporaexplorerobject without a matrix by setting
the \code{use_matrix} parameter to \code{FALSE}, or run
\code{\link[corporaexplorer]{explore}} with the
\code{use_matrix} parameter set to \code{FALSE}.
}
 If \code{FALSE}, the corporaexplorer object will be larger, and most
 simple searches will be slower.}

\item{tile_length_range}{Numeric vector of length two.
Fine-tune the tile lengths in document wall
and day corpus view. Tile length is calculated by
\code{scales::rescale(nchar(dataset$Text),
to = tile_length_range,
from = c(0, max(.)))}.
Default is \code{c(1, 10)}.}

\item{columns_for_ui_checkboxes}{Character. Character or factor column(s) in dataset.
Include sets of checkboxes in the app sidebar for
convenient filtering of corpus.
Typically useful for columns with a small set of unique
(and short) values.
Checkboxes will be arranged by \code{sort()},
unless \code{columns_for_ui_checkboxes}
is a vector of factors, in which case the order will be according to
factor level order (easy relevelling with \code{forcats::fct_relevel()}).
To use a different
label in the sidebar than the column name,
simply pass a named character vector to \code{columns_for_ui_checkboxes}.
If \code{columns_for_ui_checkboxes} includes a column which is not
present in dataset, it will be ignored.}
}
\value{
A \code{corporaexplorer} object to be passed as argument to
  \code{\link[corporaexplorer]{explore}} and
  \code{\link[corporaexplorer]{run_document_extractor}}.
}
\description{
Convert data frame or character vector to a ‘corporaexplorerobject’
  for subsequent exploration.
}
\details{
For data.frame: Each row in \code{dataset} is treated as a base differentiating unit in the corpus,
  typically a chapter in a book, or a single document in a document collection.
  The following column names are reserved and cannot be used in \code{dataset}:
  "ID",
  "Text_original_case",
  "Tile_length",
  "Year",
  "Seq",
  "Weekday_n",
  "Day_without_docs",
  "Invisible_fake_date".

A character vector will be converted to a simple corporaexplorerobject
  with no metadata.
}
\examples{
## From data.frame
# Constructing test data frame:
dates <- as.Date(paste(2011:2020, 1:10, 21:30, sep = "-"))
texts <- paste0(
  "This is a document about ", month.name[1:10], ". ",
  "This is not a document about ", rev(month.name[1:10]), "."
)
titles <- paste("Text", 1:10)
test_df <- tibble::tibble(Date = dates, Text = texts, Title = titles)

# Converting to corporaexplorerobject:
corpus <- prepare_data(test_df, corpus_name = "Test corpus")

if(interactive()){
# Running exploration app:
explore(corpus)

# Running app to extract documents:
run_document_extractor(corpus)
}

## From character vector
alphabet_corpus <- prepare_data(LETTERS)

if(interactive()){
# Running exploration app:
explore(alphabet_corpus)
}
}
