# MiningTransparencyManuscript/deps.R

# =============================================================================
# Configuration file
# =============================================================================
# This file contains necessary variables that will be used in the whole process
# and all necessary library imports.
# See the configuration section for theming, debug mode and so on.
# =============================================================================

# ---- Helper functions -------------------------------------------------------

# Install from CRAN every package in `pkgs` that cannot currently be loaded.
#
# pkgs: character vector of CRAN package names.
#
# Availability is probed with requireNamespace(), so packages that are already
# usable are left alone. Returns NULL invisibly; nothing is installed (and no
# message is emitted) when every package is already available.
install_if_missing <- function(pkgs) {
  is_available <- vapply(
    pkgs,
    function(pkg) requireNamespace(pkg, quietly = TRUE),
    FUN.VALUE = logical(1)
  )
  absent <- pkgs[!is_available]

  # Guard: nothing to install.
  if (length(absent) == 0) {
    return(invisible(NULL))
  }

  message("Installing missing CRAN packages: ", paste(absent, collapse = ", "))
  install.packages(absent, repos = "https://cloud.r-project.org/")
}

# Install a single package from GitHub unless it can already be loaded.
#
# pkg:  name of the package as it is loaded in R (used for the presence check).
# repo: GitHub repository spec handed to remotes::install_github().
#
# The `remotes` package is installed from CRAN first when it is not available.
# Returns NULL invisibly when `pkg` is already present; otherwise returns
# whatever remotes::install_github() returns.
install_github_if_missing <- function(pkg, repo) {
  # Guard: the package is already usable, so there is nothing to do.
  if (requireNamespace(pkg, quietly = TRUE)) {
    return(invisible(NULL))
  }

  has_remotes <- requireNamespace("remotes", quietly = TRUE)
  if (!has_remotes) {
    install.packages("remotes", repos = "https://cloud.r-project.org/")
  }

  message("Installing GitHub package ", pkg, " from ", repo)
  remotes::install_github(repo)
}

# =============================================================================
# Install & Load
# =============================================================================

# Define all required packages with comments
# All required CRAN packages, grouped by purpose. They are attached further
# down in exactly this order.
pkgs <- c(
  # Data handling & plotting
  "tidyverse",    # data manipulation, ggplot2, etc.
  "patchwork",    # combine multiple plots
  "ggthemes",
  "ggpmisc",
  "scales",

  # Tables & summaries
  "summarytools",
  "gtsummary",
  "gt",
  "knitr",
  "kableExtra",
  "flextable",    # docx tables

  # Misc helpers
  "rlang",
  "rmarkdown",
  "qs2",          # presumably reads/writes the .qs2 data files listed under "Files" -- verify
  "presize",      # sample size calculations
  "survey"
)

# Install whatever is missing, then attach everything.
# install_if_missing() already performs the availability check (via
# requireNamespace()), so a separate installed.packages() pass is redundant:
# it scans every installed package, and its documentation advises against
# using it merely to find out whether a package is installed.
install_if_missing(pkgs)

# library() stops with an error if a package still cannot be loaded, so a
# failed installation surfaces here rather than later in the analysis.
invisible(lapply(pkgs, library, character.only = TRUE))

# ---- GitHub packages ---------------------------------------------------

# serosurvey: installed from GitHub (avallecam/serosurvey) when missing, then
# attached.
install_github_if_missing(pkg = "serosurvey", repo = "avallecam/serosurvey")
library("serosurvey")

# ggthemr: installed from GitHub (Mikata-Project/ggthemr) when missing, then
# attached. It provides the ggthemr() call used in the theme section.
install_github_if_missing(pkg = "ggthemr", repo = "Mikata-Project/ggthemr")
library("ggthemr")

# =============================================================================
# Configuration
# =============================================================================

# ---- Output format ----------------------------------------------------------
# It is important to set the correct pandoc/quarto output format, as knitr tables don't work in docx.
# possible formats:
# - docx
# - pdf/tex

# Read OUTPUT_FORMAT from the environment; default to "pdf/tex".
# Sys.getenv() only applies its `unset` fallback when the variable does not
# exist at all, so a variable that is exported but empty (OUTPUT_FORMAT="")
# would otherwise yield "" instead of the default. Treat empty as unset.
output_format <- Sys.getenv("OUTPUT_FORMAT", unset = "pdf/tex")
if (!nzchar(output_format)) {
  output_format <- "pdf/tex"
}
# Background: some tables need special treatment for docx rendering.

# ---- Debug Mode -------------------------------------------------------------
# Global debug switch.
debug_mode <- TRUE

# debug_info starts out as an empty list and is only created while debug mode
# is on; it does not exist when debug_mode is anything other than TRUE.
if (isTRUE(debug_mode)) {
  debug_info <- list()
}

# ---- Theme & Colors ---------------------------------------------------------
# Theme
ggthemr("fresh")

# Consistent colors for the open science practices (OSP) across all plots,
# keyed by display label.
osp_cols <- c(
  "Preregistration"       = "#ee5927",
  "Open Data"             = "#321c3d",
  "Open Materials"        = "#005c5c",
  "Open Access"           = "#bf1869",
  "Statistical Inference" = "#f2a900"
)

# Display label -> `is_*` identifier (presumably the indicator column names in
# the data -- verify against the analysis code).
name_mapping <- c(
  "Preregistration"       = "is_prereg",
  "Open Data"             = "is_open_data",
  "Open Materials"        = "is_open_materials",
  "Open Access"           = "is_open_access",
  "Statistical Inference" = "is_statistical_inference"
)

# Same colors in the same order as osp_cols, but keyed by the mapped `is_*`
# identifiers instead of the display labels.
osp_cols2 <- setNames(osp_cols, name_mapping[names(osp_cols)])

# =============================================================================
# DIRECTORY PATHS
# =============================================================================

# Base directories
# All paths are relative, so they resolve against the working directory of
# whatever session sources this file. This section only defines the paths.
dir_data <- "data"
dir_output <- "output"
dir_logs <- "logs"
dir_models <- "models"
dir_code <- "code"
dir_docs <- "docs"
dir_renv <- "renv"
dir_freeze <- "_freeze"
dir_output_quarto <- "_output"  # Quarto's rendered output

# Data subdirectories
dir_data_meta <- file.path(dir_data, "meta")
# Note: crossref is nested under meta (data/meta/crossref), not directly under data.
dir_data_crossref <- file.path(dir_data_meta, "crossref")
dir_data_fulltext <- file.path(dir_data, "fulltext")
dir_data_keywords <- file.path(dir_data, "keyword_dicts")

# Output subdirectories
dir_output_plots <- file.path(dir_output, "plots")
dir_output_tables <- file.path(dir_output, "tables")

# =============================================================================
# Files
# =============================================================================

# Primary data files (.qs2)
# All of these live under dir_data_meta except file_sample_analysis, which is
# stored directly under dir_data.
file_journals <- file.path(dir_data_meta, "journals.qs2")
file_meta_combined <- file.path(dir_data_meta, "meta_combined.qs2")
file_meta_final <- file.path(dir_data_meta, "meta_final.qs2")
file_sample_final <- file.path(dir_data_meta, "sample_final.qs2")
file_sample_stat_final <- file.path(dir_data_meta, "sample_stat_final.qs2") # statistical inference classification sample
file_sample_analysis <- file.path(dir_data, "sample_analysis.qs2") # final sample, ready for analysis

# Additional data files (not used yet)
file_dicts <- file.path(dir_data, "dicts.csv")

# Statistical inference sample files

# Statistical inference classification training sample
file_train_stat <- file.path(dir_data, "train_stat.csv")
# Statistical inference classification hand coded training sample
file_train_stat_coded <- file.path(dir_data, "train_stat_coded.csv")
# Statistical inference downloader metadata (containing file_path)
file_train_stat_downloader_meta <- file.path(
  dir_data,
  "train_stat_downloader_meta_export.csv"
)

# Statistical inference classification, hand coded
file_train_stat_coded_hand <- file.path(dir_data, "train_stat_coded_hand.csv")
# Statistical inference classification, hand coded, corrected
file_train_stat_coded_hand_corrected <- file.path(
  dir_data,
  "train_stat_coded_hand_corrected.csv"
)
# Statistical inference classification, hand coded, corrected, GPT variant.
# NOTE(review): this previously pointed at the very same file as
# file_train_stat_coded_hand_corrected (copy-paste slip), so writing one would
# have overwritten the other. The "_gpt" suffix follows the naming of
# file_train_coded_hand_corrected_gpt -- confirm the file on disk uses it.
file_train_stat_coded_hand_corrected_gpt <- file.path(
  dir_data,
  "train_stat_coded_hand_corrected_gpt.csv"
)

# OSP classification files

# Full sample of all years
file_sample <- file.path(dir_data, "full-sample.csv")
# Full sample of statistical inference papers
file_sample_statistical <- file.path(dir_data, "sample_statistical.csv")
# Training subset of statistical inference papers for OSP classification
file_train <- file.path(dir_data, "train.csv")
# Training subset (OSP), hand-coded
file_train_coded_hand <- file.path(dir_data, "train_coded_hand.csv")
# Training subset (OSP), hand-coded, corrected
file_train_coded_hand_corrected <- file.path(
  dir_data,
  "train_coded_hand_corrected.csv"
)
# Training subset (OSP), corrected, hand-coded + remaining cases coded with gpt
file_train_coded_hand_corrected_gpt <- file.path(
  dir_data,
  "train_coded_hand_corrected_gpt.csv"
)
# Downloader metadata export, presumably for the OSP training sample (by
# analogy with file_train_stat_downloader_meta) -- verify.
# NOTE(review): the variable says "sample" while the file name says "train".
file_sample_downloader_meta <- file.path(
  dir_data,
  "train_downloader_meta_export.csv"
)

# Classified data (classification results, one CSV per classifier)
file_classified_stat <- file.path(dir_data, "classified_stat.csv") # statistical inference
file_classified_oa <- file.path(dir_data, "classified_oa.csv") # open access
file_classified_osp <- file.path(dir_data, "classified_osp.csv") # other OSPs
file_classified_osp_probs <- file.path(dir_data, "classified_osp_probs.csv") # other OSPs; judging by the name, class probabilities -- verify

file_sample_fully_classified <- file.path(dir_data, "full-sample_classified.csv") # combination of the above

# Downloader import CSV files
file_downloader_full <- file.path(dir_data, "download_full.csv")
file_downloader_sample <- file.path(dir_data, "download_sample.csv")

# Codebook (the only file in this section located under dir_output)
file_codebook <- file.path(dir_output, "codebook.docx")