moves stuff to supplements

2025-12-11 14:13:11 +01:00
parent 6c2d02b8e0
commit a2f42ee061
5 changed files with 595 additions and 472 deletions
@@ -1,21 +1,20 @@
-QUARTO = quarto
-QMD    = index.qmd
+QUARTO ?= quarto

+.PHONY: all pdf docx clean
+
+# Build both formats for both documents
 all: pdf docx

-# PDF
+# Aggregate targets
+pdf: index.pdf Supplements.pdf
+docx: index.docx Supplements.docx

-pdf: OUTPUT_FORMAT = pdf/tex
-pdf: index.pdf
+# Pattern rules for either format
+%.pdf: %.qmd
+	OUTPUT_FORMAT=pdf/tex $(QUARTO) render $< --to pdf

-index.pdf: $(QMD)
-	OUTPUT_FORMAT="$(OUTPUT_FORMAT)" $(QUARTO) render $(QMD) --to pdf
-
-# DOCX
-
-docx: OUTPUT_FORMAT = docx
-docx: index.docx
-
-index.docx: $(QMD)
-	OUTPUT_FORMAT="$(OUTPUT_FORMAT)" $(QUARTO) render $(QMD) --to docx
+%.docx: %.qmd
+	OUTPUT_FORMAT=docx $(QUARTO) render $< --to docx

+clean:
+	rm -f index.pdf Supplements.pdf index.docx Supplements.docx
@@ -0,0 +1,514 @@
+---
+title: "Mining Transparency - Supplementary Materials"
+top-level-division: section 
+prefer-html: true
+execute:
+  freeze: auto
+---
+
+```{r}
+#| label: setup
+#| include: false
+source("deps.R")
+```
+
+# Sample Size
+
+The sample size was determined by a precision-based calculation to ensure a $\pm$ 1.5 percentage point confidence interval for the SI prevalence as a precision-based sample size calculation was deemed more suitable for an exploratory prevalence study [@blandTyrannyPowerThere2009]. The calculations were based on prevalences arbitrarily estimated using the results of the literature review described in @sec-osp-in-crim.
+
+```{r}
+#| echo: false
+#| results: asis
+#| tbl-cap: Estimated Minimum Sample Size
+
+# Worst-case sample size for the SI prevalence: solve for the n that yields a
+# 95% Agresti-Coull interval of total width 2 * d at p = 0.5.
+# `d` is reused by the following chunk for the per-practice calculations.
+p_max <- 0.50     # is_statistical prevalence ~50%
+d      <- 0.015    # +-1.5 percentage points, full CI width = 0.03
+
+# compute required total n for 95% CI at that precision
+result <- prec_prop(
+  p          = p_max,
+  conf.width = 2 * d,
+  conf.level = 0.95,
+  method     = "agresti-coull"
+)
+
+n_total <- result$n
+
+# Reshape the result into a two-column "name / value" table.
+# as_tibble() replaces the deprecated as.tibble(), which emits a deprecation
+# warning that can leak into the rendered output.
+table <- result %>%
+  as_tibble() %>%
+  select(-padj) %>%
+  mutate(n = ceiling(n)) %>%   # sample sizes are reported as whole papers
+  rename(
+    `Minimum Sample Size` = n,
+    `Confidence Interval Width` = conf.width,
+    `Confidence Level` = conf.level,
+  ) %>%
+  mutate(
+    `Expected Prevalence` = paste0(p, " (", lwr, ", ", upr, ")")
+  ) %>%
+  select(-lwr, -upr, -p) %>%
+  t()
+
+# Both output formats rendered this table identically, so no branching on
+# `output_format` is needed here.
+table %>% kable()
+
+# Record which rendering path knitr took for this chunk (debugging aid).
+if (isTRUE(debug_mode)) {
+  debug_info[[knitr::opts_current$get("label")]] <-
+    if (knitr::is_html_output()) "HTML" else "LaTeX"
+}
+```
+
+```{r}
+#| echo: false
+#| results: asis
+#| label: tbl-cap-estimated-sample-sizes-osp
+#| tbl-cap: Estimated Minimum Sample Sizes - Open Science Practices
+
+# Assumed prevalence of each open science practice.
+expected_prev <- c(
+  `Open Access`     = 0.25,
+  `Open Data`       = 0.15,
+  `Open Materials`  = 0.05,
+  `Preregistration` = 0.05
+)
+
+# Required n per practice for a 95% Agresti-Coull interval of total width
+# 2 * d (`d` is defined in the preceding chunk).
+# vapply() pins the result to one numeric value per practice, so an
+# unexpected return shape fails loudly instead of silently changing type.
+required_ns <- vapply(
+  expected_prev,
+  function(p) {
+    prec_prop(
+      p = p,
+      conf.width = 2 * d,
+      conf.level = 0.95,
+      method = "agresti-coull"
+    )$n
+  },
+  numeric(1)
+)
+
+summary_tbl <- tibble(
+  Category = names(required_ns),
+  `Required Sample Size` = required_ns
+)
+
+# Only the PDF build renders the table; other formats emit a placeholder line.
+if (output_format == "pdf/tex") {
+  summary_tbl %>% kable(digits = 0)
+} else {
+  print("Table: Estimated Minimum Sample Sizes - Open Science Practices")
+}
+
+# Record which rendering path knitr took for this chunk (debugging aid).
+if (isTRUE(debug_mode)) {
+  debug_info[[knitr::opts_current$get("label")]] <-
+    if (knitr::is_html_output()) "HTML" else "LaTeX"
+}
+```
+
+The minimum calculated total sample size equals 4265 (rounded) publications to achieve a 95% confidence interval with a half-width of $\pm$ 1.5 pp using the @agrestiApproximateBetterExact1998 method. When applying the assumed prevalence values for each OSP, the required sample sizes to achieve a 95% confidence interval with a half-width of $\pm$ 1.5 pp vary substantially. As shown in @tbl-cap-estimated-sample-sizes-osp, approximately 3,200 publications are needed to estimate OA at 25%, about 2,180 publications for OD at 15%, and only about 840 publications for OM or Preregistration at 5%. 
+
+These values are all below the worst-case requirement of 4,265, reflecting the lower variance at prevalences farther from 50%. At the assumed prevalences, 2,182 SI papers would be required to estimate OD at 15% with $\pm$ 1.5 percentage-points precision. This equals the OD requirement but is below the OA requirement, which on the other hand can be measured for the whole population, not just SI publications. Thus, while the sample is sufficiently large for OD, OM, and Preregistration, it falls slightly short of the target precision for OA, which could be measured on a larger scale.
+
+```{r}
+#| echo: false
+#| results: asis
+#| label: tbl-cap-estimated-min-sample-sizes-osp
+#| tbl-cap: Expected 95% CI for Open Access
+
+# Reverse calculation: given a fixed n and the assumed Open Access prevalence,
+# report the confidence-interval width that can be expected.
+n_total <- 2182
+p_exp   <- 0.25
+
+# CI estimation with Agresti-Coull, given n and p
+result <- prec_prop(
+  p          = p_exp,
+  n          = n_total,
+  conf.width = NULL, # ask for CI width
+  conf.level = 0.95,
+  method     = "agresti-coull"
+)
+
+# Reshape into a two-column "name / value" table.
+# as_tibble() replaces the deprecated as.tibble(), which emits a deprecation
+# warning that can leak into the rendered output.
+table_sampl_est <- result %>%
+  as_tibble() %>%
+  select(-padj) %>%
+  rename(
+    `Sample Size` = n,
+    `Confidence Interval Width` = conf.width,
+    `Confidence Level` = conf.level,
+  ) %>%
+  mutate(
+    `Confidence Interval Width` = percent(`Confidence Interval Width`, accuracy = 0.01),
+    `Expected Prevalence` = paste0(p, " (", round(lwr, 2), ", ", round(upr, 2), ")")
+  ) %>%
+  select(-lwr, -upr, -p) %>%
+  t()
+
+# Only the PDF build renders the table; other formats emit a placeholder line.
+if (output_format == "pdf/tex") {
+  table_sampl_est %>% kable(digits = 2)
+} else {
+  print("Table: Expected 95% CI for Open Access")
+}
+
+# Record which rendering path knitr took for this chunk (debugging aid).
+if (isTRUE(debug_mode)) {
+  debug_info[[knitr::opts_current$get("label")]] <-
+    if (knitr::is_html_output()) "HTML" else "LaTeX"
+}
+```
+
+An overestimation of the prevalence of each OSP in the population can lead to potential problems with all following steps. The true prevalences and confidence intervals along with performance diagnostics of trained models were assessed after all classification tasks were processed. An estimation of the prevalences per year was not suitable as no detailed information about those proportions was available. Instead, the established approach to stratify the sample proportionally to the population was used [@larsenProportionalAllocationStrata2008]. 
+
+
+# Model Training
+
+For hyperparameter tuning and training of the ML models, the coded datasets were split into a training sample of 80% and a validation sample of 20%, stratified by the target variable as this improves training in scenarios with high class imbalance [@hilbertModelle2025]. K-fold cross-validation was used during hyperparameter tuning to further improve model performance and reduce overfitting. 
+
+![Evaluation Metrics: Statistical Inference Classification](figures/combined_plot_is_statistical.pdf){#fig-evaluation-stat}
+
+The features differed in the feature construction: "TF" feature sets contained simple term frequencies of the keywords in each category whereas "n-gram" feature sets were constructed containing term frequencies of multi-word-phrases. Using ngrams has proven to enhance results in comparison to simple term frequencies in other contexts [e.g. @jandotInteractiveSemanticFeaturing2016; @ahmedDetectionOnlineFake2017], which is why I chose to include multi-gram (2 or 3 word phrases) feature sets as well as term-frequency and ngram combined feature sets in the evaluations. Multiple machine learning models were trained on those feature sets, resulting in multiple model-featureset combinations for each OSP assessed. An example of those combinations and the evaluation can be seen in @fig-jobs-osp.
+
+```{r}
+#| fig-height: 5
+#| fig-width: 10
+#| label: fig-jobs-osp
+#| fig-cap: Model, Feature and Variable Combinations
+#| fig-pos: h
+
+# Legend labels: internal target-variable names -> display names.
+axis_mapping <- c(
+  "is_prereg"         = "Preregistration",
+  "is_open_data"      = "Open Data",
+  "is_open_materials" = "Open Materials",
+  "is_open_access"    = "Open Access"
+)
+
+# Load the stored plot object, blank out its titles and axis labels, and
+# apply the shared OSP fill palette keyed by variable name.
+jobsplot <- readRDS("figures/jobs_osp.rds")
+jobsplot <- jobsplot +
+  labs(title = "", subtitle = "", x = "", y = "")
+jobsplot <- jobsplot +
+  scale_fill_manual(values = osp_cols2, labels = axis_mapping)
+
+print(jobsplot)
+
+# Record which rendering path knitr took for this chunk (debugging aid).
+if (isTRUE(debug_mode)) {
+  debug_info[[knitr::opts_current$get("label")]] <-
+    if (knitr::is_html_output()) "HTML" else "LaTeX"
+}
+```
+
+The two top-left graphs in @fig-evaluation-stat show the performance of different feature set and model combinations measured by ROC-AUC [@fawcettIntroductionROCAnalysis2006]. The top graph identifies the XGBoost classifier combined with a simple term frequencies dataset as the top-performing model. The top-right graph shows the most important terms for the XGBoost classifier, which are primarily statistical. The confusion matrix shows that the model is quite precise, with a 91.7% accuracy and a Cohen's Kappa of 0.832. This performance is good compared to hand-coded cases. Model calibration was not highly successful as the model's probabilities were already well-calibrated, mostly at the extremes of 0 and 1. A probability threshold of 0.25 was chosen based on three different metrics. This threshold is used for the final classification, where any case with a predicted probability greater than 0.25 is classified as 1. It's also important to note that the OSP classifiers performed much worse, as detailed in @sec-evaluation-metrics.
+
+```{r}
+#| fig-cap: Confusion Matrices - Manual vs ChatGPT Labels for Open Science Practices and Statistical Inference (design-weighted)
+#| label: fig-cfm-osp
+#| fig-height: 12
+#| fig-width: 11
+
+# Stored confusion-matrix plot objects, one per classification target.
+cfm_gpt_open_material_corrected <- readRDS("figures/cfm_gpt_open_material_corrected.rds")
+cfm_gpt_pre_registration_corrected <- readRDS("figures/cfm_gpt_pre_registration_corrected.rds")
+cfm_gpt_open_data_corrected <- readRDS("figures/cfm_gpt_open_data_corrected.rds")
+cfm_gpt_is_statistical_corrected <- readRDS("figures/cfm_gpt_is_statistical_corrected.rds") + labs(caption = paste0("n = 225"))
+
+# Pair each plot with its display title directly. The title doubles as the key
+# into `osp_cols`, so no get() lookup through parallel string vectors is needed.
+cfm_plots <- list(
+  "Open Materials"        = cfm_gpt_open_material_corrected,
+  "Preregistration"       = cfm_gpt_pre_registration_corrected,
+  "Open Data"             = cfm_gpt_open_data_corrected,
+  "Statistical Inference" = cfm_gpt_is_statistical_corrected
+)
+
+# Apply the title, axis labels and a white-to-practice-colour fill gradient to
+# every panel.
+plotlist <- lapply(names(cfm_plots), function(panel_title) {
+  cfm_plots[[panel_title]] +
+    labs(title = panel_title) +
+    ylab("ChatGPT") +
+    xlab("Manual") +
+    scale_fill_gradient(high = "white", low = osp_cols[[panel_title]])
+})
+
+# combine plots using patchwork; `&` applies the theme to every panel, which
+# removes the legends
+combined_plot <- wrap_plots(plotlist, ncol = 2) +
+  plot_layout(guides = "collect") & theme(legend.position = "none")
+print(combined_plot)
+
+# Record which rendering path knitr took for this chunk (debugging aid).
+if (isTRUE(debug_mode)) {
+  debug_info[[knitr::opts_current$get("label")]] <-
+    if (knitr::is_html_output()) "HTML" else "LaTeX"
+}
+```
+
+$$
+\text{Accuracy} = \frac{TP + TN}{N} \quad \text{and} \quad
+\kappa = \frac{p_o - p_e}{1 - p_e}
+$$
+
+As expected, $\kappa$ is typically lower than Accuracy due to chance-agreement correction [@naiduReviewEvaluationMetrics2023].
+
+OM (@fig-plt-eval-om) tells a different story: despite nominal Accuracy of $94.3\%$, balanced accuracy drops to $60.0\%$ and $\kappa$ to $31.7\%$. Sensitivity is $20.0\%$ while specificity is $100.0\%$, yielding $F_1 = 33.3\%$. High nominal accuracy with a large miss rate indicates accuracy inflation under imbalance, and the p-value of $0.434$ confirms that accuracy does not exceed the no-information rate meaningfully.
+
+OD (@fig-plt-eval-od) sits between these extremes: accuracy $= 88.6\%$, balanced accuracy $= 93.7\%$, sensitivity $= 100.0\%$, specificity $= 87.3\%$. The classifier captures all positives but at the cost of eight false positives against seven true positives and 55 true negatives, which depresses precision and yields $F_1 = 63.6\%$. $\kappa = 57.9\%$ indicates moderate agreement beyond chance, and $p = 0.736$ again signals that nominal accuracy is uninformative under imbalance.
+
+In short, Preregistration appears comparatively reliable, OM is recall-limited, and OD is precision-limited. These profiles motivate reporting metrics suited to extreme class imbalance --- Precision $P = \frac{TP}{TP+FP}$, Recall (sensitivity) $R = \frac{TP}{TP+FN}$, specificity $S = \frac{TN}{TN+FP}$, and balanced accuracy $BA = \frac{R+S}{2}$ --- and anticipating how errors propagate into downstream estimates [@murphyMachineLearningProbabilistic2012; @fawcettIntroductionROCAnalysis2006].
+
+Category-specific results highlight class-imbalance constraints. Preregistration has only two positives in the validation sample, which makes any estimate imprecise, also resulting in a very undesirably large p-value of the accuracy-no-information-rate assumption[^1]. OM shows one false negative among six positives, and OD shows one false negative among eight positives. The SI classifier shows five false positives alongside one hundred twelve true positives and no false negatives, with all metrics indicating excellent performance.
+
+[^1]: The accuracy-no-information-rate p-value tests the null hypothesis that the accuracy is equal to the no-information rate or the accuracy when always predicting the most frequent class [@kuhnBuildingPredictiveModels2008].
+
+The ML classifiers trained on GPT labels inherit GPT's strengths and the data's sparsity. For the relatively small 20% validation set coded by GPT, the open-science practice classifiers are less precise and less reliable than the Statistical-Inference classifier. Preregistration (@fig-plt-eval-pr) appears strongest (balanced accuracy $= 99.2\%$, $F_1 = 88.9\%$, $\kappa = 88.1\%$), but the counts are sparse (four true positives, one false negative, no false positives), and the p-value versus the no-information rate ($p = 0.0853$) is not conventionally significant --- an expected consequence of the very low base rate rather than a systematic error.
+
+
+# Tables: OSP Adoption Over Time Among Statistical Inference Papers
+
+## OSP Adoption Over Time Among Statistical Inference Papers {#sec-osp-adoption-tables}
+
+```{r}
+#| label: osp-adoption-tables
+#| tbl-caption: Open Data
+# Yearly, design-weighted OSP prevalence among statistical-inference papers.
+# Builds a year-stratified, post-stratified survey design, estimates one
+# proportion with confidence interval per practice and year, and prints the
+# Open Data table. `tbl_osp_prev_overall_dsadj` holds the long-format results.
+# NOTE(review): other chunks in this file spell the caption option `tbl-cap`;
+# the caption shown here comes from kable(caption = ...) below. Confirm whether
+# `tbl-caption` is intentional before renaming it.
+df <- qs_read(file_sample_analysis)
+population <- qs_read(file_meta_final)
+
+df <- df %>% mutate(published_year = as.integer(published_year))
+population <- population %>% mutate(published_year = as.integer(published_year))
+
+# Binary recodes for all targets 
+targets <- c("is_open_access","is_open_data","is_open_materials","is_prereg")
+
+df_bin <- df %>%
+  mutate(
+    across(
+      all_of(targets),
+      ~ ifelse(. == "Yes", 1, ifelse(. == "No", 0, NA_real_)),
+      .names = "{.col}_bin"
+    ))
+    
+# Frame totals by year (the ~40k post-keyword frame)
+pop_year <- population %>%
+  count(published_year, name = "Freq") %>%
+  arrange(published_year)
+
+# add counts
+df_bin <- df_bin %>%
+  left_join(pop_year %>% rename(N_y = Freq), by = "published_year")
+
+# Base design on all sampled records, stratified by year, finite population correction
+des0 <- svydesign(ids = ~1, strata = ~published_year, fpc = ~N_y, data = df_bin)
+des_ps <- postStratify(design = des0, strata = ~published_year, population = pop_year)
+
+# Make sure the *_bin fields are truly numeric 0/1 inside the design
+# (this is also where is_statistical_bin is created).
+# NOTE(review): this overwrites the across() recodes above with a slightly
+# different rule: any value other than "Yes"/NA becomes 0 here, whereas the
+# recode above maps everything except "Yes"/"No" to NA. Confirm this is intended.
+des_ps <- des_ps %>% update(
+  is_open_access_bin    = as.numeric(df_bin$is_open_access == "Yes"),
+  is_open_data_bin      = as.numeric(df_bin$is_open_data == "Yes"),
+  is_open_materials_bin = as.numeric(df_bin$is_open_materials == "Yes"),
+  is_prereg_bin         = as.numeric(df_bin$is_prereg == "Yes"),
+  is_statistical_bin    = as.numeric(df_bin$is_statistical == "Yes")
+)
+
+# restrict to statistical inference pubs at analysis time
+des_stat <- subset(des_ps, is_statistical_bin == 1)
+
+# Design variable -> display name of the practice.
+vars <- c(
+  "is_prereg_bin"         = "Preregistration",
+  "is_open_data_bin"      = "Open Data",
+  "is_open_materials_bin" = "Open Materials",
+  "is_open_access_bin"    = "Open Access"
+)
+
+# Loop through each variable, run svyby, and collect results in a list
+results_list <- lapply(names(vars), function(var_name) {
+  
+  # Create a formula for the specific variable, e.g., ~is_prereg_bin
+  form <- as.formula(paste0("~", var_name))
+  
+  # Run svyby for this single variable; vartype = "ci" also returns the
+  # confidence interval.
+  # method = "beta" is used instead of the planned "logit": with logit a
+  # warning about "observations with zero weight not used for calculating
+  # dispersion" appears, the iterative fit fails, and one estimate
+  # (open access in year 2013) cannot be calculated.
+  res_by_year <- svyby(
+    formula = form,
+    by = ~published_year,
+    design = des_stat,
+    FUN = svyciprop,
+    method = "beta",
+    vartype = "ci",
+    na.rm = TRUE
+  )
+  
+  # Add a column with the "pretty" variable name (e.g., "Preregistration")
+  res_by_year$variable <- vars[var_name]
+  
+  # The second column is named after the variable (e.g., is_prereg_bin);
+  # give it a common name so the per-variable results can be stacked.
+  colnames(res_by_year)[2] <- "prop"
+  
+  res_by_year
+})
+
+# Combine the list of results into a single data frame
+yearly_long <- bind_rows(results_list) %>%
+  rename(prop_low = ci_l, prop_upp = ci_u)
+
+tbl_osp_prev_overall_dsadj <- yearly_long
+
+# Open Data table: one row per year, estimates formatted as percentages.
+tbl_osp_prev_overall_dsadj_b <- tbl_osp_prev_overall_dsadj %>% 
+  select(published_year, prop, prop_low, prop_upp, variable) %>%
+  filter(variable == "Open Data") %>%
+  select(-variable) %>%
+  mutate(
+    prop = percent(prop, accuracy = 0.1),
+    prop_low = percent(prop_low, accuracy = 0.1),
+    prop_upp = percent(prop_upp, accuracy = 0.1),
+  ) %>%
+  rename(
+    `Year` = published_year,
+    `Proportion` = prop,
+    `.95 CI (Lower)` = prop_low,
+    `.95 CI (Upper)` = prop_upp
+  ) %>%
+  kable(digits = 2, row.names = FALSE, booktabs = TRUE, caption = "Open Data") %>%
+  kable_styling(position = "center", full_width = FALSE)
+
+# Only the PDF build renders the table; other formats emit a placeholder line.
+if (output_format == "pdf/tex") {
+  tbl_osp_prev_overall_dsadj_b
+} else {
+  print("Table: Open Data")
+}
+
+# Record which rendering path knitr took for this chunk (debugging aid).
+if (isTRUE(debug_mode)) {
+  debug_info[[knitr::opts_current$get("label")]] <- 
+    if (knitr::is_html_output()) "HTML" else "LaTeX"
+}
+```
+
+```{=latex}
+\clearpage
+\newpage
+```
+
+```{r}
+# Open Materials: yearly design-weighted prevalence with its 95% CI, formatted
+# as percentages for the supplement table.
+tbl_osp_prev_overall_dsadj_c <- tbl_osp_prev_overall_dsadj %>%
+  filter(variable == "Open Materials") %>%
+  select(published_year, prop, prop_low, prop_upp) %>%
+  mutate(
+    across(c(prop, prop_low, prop_upp), ~ percent(.x, accuracy = 0.1))
+  ) %>%
+  rename(
+    `Year` = published_year,
+    `Proportion` = prop,
+    `.95 CI (Lower)` = prop_low,
+    `.95 CI (Upper)` = prop_upp
+  ) %>%
+  kable(digits = 2, row.names = FALSE, booktabs = TRUE, caption = "Open Materials") %>%
+  kable_styling(position = "center", full_width = FALSE)
+
+# Every format other than the PDF build gets a placeholder line instead of
+# the styled table.
+if (output_format != "pdf/tex") {
+  print("Table: Open Materials")
+} else {
+  tbl_osp_prev_overall_dsadj_c
+}
+
+# Record which rendering path knitr took for this chunk (debugging aid).
+if (isTRUE(debug_mode)) {
+  debug_info[[knitr::opts_current$get("label")]] <-
+    if (knitr::is_html_output()) "HTML" else "LaTeX"
+}
+```
+
+```{r}
+# Open Access: yearly design-weighted prevalence with its 95% CI, formatted
+# as percentages for the supplement table.
+tbl_osp_prev_overall_dsadj_d <- tbl_osp_prev_overall_dsadj %>%
+  filter(variable == "Open Access") %>%
+  select(published_year, prop, prop_low, prop_upp) %>%
+  mutate(
+    across(c(prop, prop_low, prop_upp), ~ percent(.x, accuracy = 0.1))
+  ) %>%
+  rename(
+    `Year` = published_year,
+    `Proportion` = prop,
+    `.95 CI (Lower)` = prop_low,
+    `.95 CI (Upper)` = prop_upp
+  ) %>%
+  kable(digits = 2, row.names = FALSE, booktabs = TRUE, caption = "Open Access") %>%
+  kable_styling(position = "center", full_width = FALSE)
+
+# Every format other than the PDF build gets a placeholder line instead of
+# the styled table.
+if (output_format != "pdf/tex") {
+  print("Table: Open Access")
+} else {
+  tbl_osp_prev_overall_dsadj_d
+}
+
+# Record which rendering path knitr took for this chunk (debugging aid).
+if (isTRUE(debug_mode)) {
+  debug_info[[knitr::opts_current$get("label")]] <-
+    if (knitr::is_html_output()) "HTML" else "LaTeX"
+}
+```
+
+```{=latex}
+\clearpage
+```
+
+# Evaluation Metrics {#sec-evaluation-metrics}
+
+![Evaluation Metrics: Open Data](figures/combined_plot_is_open_data.pdf){#fig-plt-eval-od fig-pos=H}
+
+![Evaluation Metrics: Open Data](figures/combined_plot_is_open_data.pdf){#fig-plt-eval-od fig-pos=H}
+
+![Evaluation Metrics: Open Materials](figures/combined_plot_is_open_materials.pdf){#fig-plt-eval-om fig-pos=H}
+
+![Evaluation Metrics: Preregistration](figures/combined_plot_is_prereg.pdf){#fig-plt-eval-pr fig-pos=H}
@@ -17,15 +17,19 @@ format:
    pdf-engine: xelatex
    keep-tex: true
    latex-max-runs: 3
+    toc: true
+    toc-depth: 3
+    lot: false
+    lof: false
  docx:
    prefer-html: true
+    toc: true
+    toc-depth: 3
+    lot: false
+    lof: false

 always_allow_html: true

-toc: true
-toc-depth: 3
-lot: false
-lof: false
 number-sections: true

 citeproc: true
@@ -3,6 +3,7 @@
 # =============================================================================
 # This file contains necessary variables that will be used in the whole process
 # and all necessary library imports.
+# See the configuration section for theming, debug mode and so on.
 # =============================================================================

 # ---- Helper functions -------------------------------------------------------
@@ -64,7 +65,7 @@ if (length(missing)) {
 install_if_missing(pkgs)
 invisible(lapply(pkgs, library, character.only = TRUE))

-# ---- GitHub packages --------------------------------------------------------
+# ---- GitHub packages ---------------------------------------------------

 # Serosurvey
 install_github_if_missing("serosurvey", "avallecam/serosurvey")
@@ -74,6 +75,49 @@ library(serosurvey)
 install_github_if_missing("ggthemr", "Mikata-Project/ggthemr")
 library(ggthemr)

+# =============================================================================
+# Configuration
+# =============================================================================
+
+# ---- Output format ----------------------------------------------------------
+# It is important to set the correct pandoc/quarto output format, as knitr
+# tables don't work in docx.
+# Possible formats:
+# - docx
+# - pdf/tex
+
+# Read OUTPUT_FORMAT from env; default to "pdf/tex"
+output_format <- Sys.getenv("OUTPUT_FORMAT", "pdf/tex")
+# Background: some tables need special treatment for docx rendering.
+
+# ---- Debug Mode -------------------------------------------------------------
+# When TRUE, chunks record per-label rendering information in `debug_info`.
+debug_mode <- TRUE
+if (isTRUE(debug_mode)) debug_info <- list()
+
+# ---- Theme & Colors ---------------------------------------------------------
+# Theme
+ggthemr('fresh')
+
+# Consistent colors for open science practices across plots
+# (keyed by display name).
+osp_cols <- c(
+  "Preregistration" = "#ee5927",
+  "Open Data" = "#321c3d",
+  "Open Materials" = "#005c5c",
+  "Open Access" = "#bf1869",
+  "Statistical Inference" = "#f2a900"
+)
+
+# Display name -> internal variable name.
+# NOTE(review): analysis code elsewhere refers to the statistical-inference
+# column as `is_statistical`; confirm that "is_statistical_inference" is the
+# key expected by the plots that consume `osp_cols2`.
+name_mapping <- c(
+  "Preregistration" = "is_prereg",
+  "Open Data" = "is_open_data", 
+  "Open Materials" = "is_open_materials",
+  "Open Access" = "is_open_access",
+  "Statistical Inference" = "is_statistical_inference"
+)
+
+# Same palette as `osp_cols`, but keyed by internal variable name.
+osp_cols2 <- osp_cols
+names(osp_cols2) <- name_mapping[names(osp_cols)]
+
 # =============================================================================
 # DIRECTORY PATHS
 # =============================================================================
@@ -18,43 +18,6 @@ execute:
 #| label: setup
 #| include: false
 source("deps.R")
-
-# Output format:
-# it is important to set the correct pandoc/quarto output format as knitr tables don't work in docx.
-# possible formats:
-# - docx
-# - pdf/tex
-
-# Read OUTPUT_FORMAT from env; default to "pdf/tex"
-output_format <- Sys.getenv("OUTPUT_FORMAT", "pdf/tex")
-# Background: some tables needs special treatment for docx rendering.
-
-# Debug Mode
-debug_mode <- TRUE
-if (isTRUE(debug_mode)) debug_info <- list()
-
-# Theme
-ggthemr('fresh')
-
-# consitent colors for open science practices among plots
-osp_cols <- c(
-  "Preregistration" = "#ee5927",
-  "Open Data" = "#321c3d",
-  "Open Materials" = "#005c5c",
-  "Open Access" = "#bf1869",
-  "Statistical Inference" = "#f2a900"
-)
-
-name_mapping <- c(
-  "Preregistration" = "is_prereg",
-  "Open Data" = "is_open_data", 
-  "Open Materials" = "is_open_materials",
-  "Open Access" = "is_open_access",
-  "Statistical Inference" = "is_statistical_inference"
-)
-
-osp_cols2 <- osp_cols
-names(osp_cols2) <- name_mapping[names(osp_cols)]
 ```

 # Introduction
@@ -177,9 +140,9 @@ The aim of this methodological work is to compile a sample of publications in th

 Full-text data for training the machine learning classification models will be collected with a web application developed specifically for this project. Since software development is not the focus of this work, details of the app's architecture will not be discussed here. A brief description of the application, along with screenshots, is provided in @sec-data-fulltext-collection.

-As a master's thesis, this study is necessarily scoped by time and resources. It shall therefore be treated as a pilot that establishes data, measures and a reproducible, yet improvable pipeline to be extended in to a fully exhaustive study. Where dnecessary, potential improvements that could not be implemented are recommended.
+This work is necessarily scoped by time and resources. It shall therefore be treated as a pilot that establishes data, measures and a reproducible, yet improvable pipeline to be extended into a fully exhaustive study. Where necessary, potential improvements that could not be implemented are recommended.

-All data and code necessary to enable full replication can be retrieved from the osf repositories. A full description of used software and methods is further layed out within the replication files and the accompanying methodological report.
+All data and code necessary to enable full replication can be retrieved from the OSF repositories found in @sec-supplements-downloader. A full description of used software and methods is further laid out within the replication files and the accompanying methodological report.

 ## Population

@@ -195,144 +158,11 @@ In summary, the study population consists of all statistical-inference publicati

 The sampling procedure involved drawing a large enough sample for the training using sequential sampling, in this specific context called active learning [@chickSequentialSamplingEconomics2012]. Faced with expected challenges in full-text acquisition, a rather demanding training pipeline, and unexpected low anticipated OSP prevalence, the sequential sampling approach was abandoned and an alternative approach was established.

-The sample size was determined by a precision-based calculation to ensure a $\pm$ 1.5 percentage point confidence interval for the SI prevalence as a precision-based sample size calculation was deemed more suitable for an exploratory prevalence study [@blandTyrannyPowerThere2009]. The calculations were based on prevalences arbitrarily estimated using the results of the literature review described in @sec-osp-in-crim.
+The sample size was determined by a precision-based calculation to ensure a $\pm$ 1.5 percentage point confidence interval for the SI prevalence as a precision-based sample size calculation was deemed more suitable for an exploratory prevalence study [@blandTyrannyPowerThere2009]. Calculations were based on prevalences arbitrarily estimated using the results of the literature review described in @sec-osp-in-crim, explained further in the provided supplements. The minimum calculated total sample size was $\approx$ 4265 publications to achieve a 95% confidence interval with a half-width of $\pm$ 1.5 pp using the @agrestiApproximateBetterExact1998 method.

-First, Sample A, a random sample of up around 500 publications was manually classified to train the initial SI classifier. This step also helped estimate the effort for subsequent tasks. Next, an independent Sample B was drawn, stratified by year, thereby addressing problems in cross-validation and the non-independence of residuals assumptions of many machine-learning models [@varmaBiasErrorEstimation2006; @kohaviStudyCrossvalidationBootstrap1995; @robertsCrossvalidationStrategiesData2017]. 
+First, Sample A, a random sample of around 500 publications, was manually classified to train the initial SI classifier. This step also helped estimate the effort for subsequent tasks. Next, an independent Sample B was drawn, stratified by year, thereby addressing problems in cross-validation and the non-independence of residuals assumptions of many machine-learning models [@robertsCrossvalidationStrategiesData2017].

-```{r}
-#| echo: false
-#| results: asis
-#| tbl-cap: Estimated Minimum Sample Size
-
-# worst-case prevalence and desired half-width
-p_max <- 0.50     # is_statistical prevalence ~50%
-d      <- 0.015    # +-1.5 percentage points, full CI width = 0.03
-
-# compute required total n for 95% CI at that precision
-result <- prec_prop(
-  p          = p_max,
-  conf.width = 2*d,
-  conf.level = 0.95,
-  method     = "agresti-coull"
-)
-
-n_total <- result$n
-
-table <- result %>% as.tibble() %>%
-  select(-padj) %>%
-  mutate(n = ceiling(n)) %>%
-  rename(
-    `Minimum Sample Size` = n,
-    `Confidence Interval Width` = conf.width,
-    `Confidence Level` = conf.level,
-  ) %>% 
-  mutate(
-    `Expected Prevalence` = paste0(p, " (", lwr ,", " , upr , ")")
-  ) %>%
-  select(-lwr,-upr,-p) %>%
-  t()
-
-if(output_format == "pdf/tex") {
-  table %>% kable()
-} else {
-  print("Table: Estimated Minimum Sample Size")
-}
-
-if (isTRUE(debug_mode)) {
-  debug_info[[knitr::opts_current$get("label")]] <- 
-    if (knitr::is_html_output()) "HTML" else "LaTeX"
-}
-```
-
-The SI classifier, trained on Sample A, was then used to analyze and classify all publications in Sample B. From the identified SI papers in Sample B, a balanced dataset was randomly sampled to create a training set for the OSP classifiers. Finally, these trained OSP classifiers were applied to the entire analytical Sample B. While a publisher or journal-based stratification for the full sample would have been ideal, it was not feasible due to the limited number of available full texts.
-
-```{r}
-#| echo: false
-#| results: asis
-#| label: tbl-cap-estimated-sample-sizes-osp
-#| tbl-cap: Estimated Minimum Sample Sizes - Open Science Practices
-
-expected_prev <- c(
-  `Open Access`    = 0.25,
-  `Open Data`      = 0.15,
-  `Open Materials` = 0.05,
-  `Preregistration`         = 0.05
-)
-
-required_ns <- sapply(expected_prev, function(p) {
-  res <- prec_prop(
-    p = p,
-    conf.width = 2*d,
-    conf.level = 0.95,
-    method = "agresti-coull"
-  )
-  res$n
-})
-
-summary_tbl <- tibble(
-  Category = names(required_ns),
-  `Required Sample Size` = required_ns
-)
-
-if(output_format == "pdf/tex") {
-  summary_tbl %>% kable(digits = 0)
-} else {
-  print("Table: Estimated Minimum Sample Sizes - Open Science Practices")
-}
-if (isTRUE(debug_mode)) {
-  debug_info[[knitr::opts_current$get("label")]] <- 
-    if (knitr::is_html_output()) "HTML" else "LaTeX"
-}
-```
-
-The minimum calculated total sample size equals 4265 (rounded) publications to achieve a 95% confidence interval with a half-width of $\pm$ 1.5 pp using the @agrestiApproximateBetterExact1998 method. When applying the assumed prevalence values for each OSP, the required sample sizes to achieve a 95% confidence interval with a half-width of $\pm$ 1.5 pp vary substantially. As shown in @tbl-cap-estimated-sample-sizes-osp, approximately 3,200 publications are needed to estimate OA at 25%, about 2,180 publications for OD at 15%, and only about 840 publications for OM or Preregistration at 5%. 
-
-These values are all below the worst-case requirement of 4,264, reflecting the lower variance at prevalences farther from 50%. At the assumed prevalences, 2,182 SI papers would be required to estimate OD at 15% with +- 1.5 percentage-points precision. This equals the OD requirement but is below the OA requirement, which on the other hand can be measured for the whole population, not just SI publications. Thus, while the sample is sufficiently large for OD, OM, and Preregistration, it falls slightly short of the target precision for OA, which could be measured on a larger scale.
-
-```{r}
-#| echo: false
-#| results: asis
-#| label: tbl-cap-estimated-min-sample-sizes-osp
-#| tbl-cap: Expected 95% CI for Open Access
-
-n_total <- 2182
-p_exp   <- 0.25
-
-# CI estimation with Agresti-Coull, given n and p
-result <- prec_prop(
-  p          = p_exp,
-  n          = n_total,
-  conf.width = NULL, # ask for CI width
-  conf.level = 0.95,
-  method     = "agresti-coull"
-)
-
-table_sampl_est <- result %>% as.tibble() %>%
-  select(-padj) %>%
-  rename(
-    `Sample Size` = n,
-    `Confidence Interval Width` = conf.width,
-    `Confidence Level` = conf.level,
-  ) %>% 
-  mutate(
-    `Confidence Interval Width` = percent(`Confidence Interval Width`, accuracy = 0.01),
-    `Expected Prevalence` = paste0(p, " (", round(lwr,2) ,", " , round(upr,2) , ")")
-  ) %>%
-  select(-lwr,-upr,-p) %>%
-  t()
-
-if(output_format == "pdf/tex") {
-  table_sampl_est %>% kable(digits = 2)
-} else {
-  print("Table: Expected 95% CI for Open Access")
-}
-if (isTRUE(debug_mode)) {
-  debug_info[[knitr::opts_current$get("label")]] <- 
-    if (knitr::is_html_output()) "HTML" else "LaTeX"
-}
-```
-
-An overestimation the prevalence of each OSP in the population can lead to potential problems with all following steps. The true prevalences and confidence intervals along with performance diagnostics of trained models were assessed after all classification tasks were processed. An estimation of the prevalences per year was not suitable as no detailed information about those proportions was available. Instead, the established approach to stratify the sample proportionally to the population was used [@larsenProportionalAllocationStrata2008]. 
+The SI classifier was then used to analyze and classify all publications in Sample B. From the identified SI papers in Sample B, a balanced dataset was randomly sampled to create a training set for the OSP classifiers. Finally, these trained OSP classifiers were applied to the entire analytical Sample B. While a publisher or journal-based stratification for the full sample would have been ideal, it was not feasible due to the limited number of available full texts.

 ## Data Collection {#sec-data-fulltext-collection}

@@ -382,7 +212,7 @@ if(output_format == "pdf/tex") {
    format   = "latex", # force LaTeX output (not markdown)
    booktabs = TRUE,
    longtable = FALSE, # avoid longtable entirely
-    col.names = c("Step #", "Step", "Before", "After", "Dropped"))
+    col.names = c("Step #", "Description", "Before", "After", "Dropped"))
 } else {
  print("Table: Cases Dropped from all Publications Obtained")
 }
@@ -516,7 +346,7 @@ if (isTRUE(debug_mode)) {

 The initial approach to gathering full texts, which used Zotero to translate DOIs as per Scoggins and Robertson, was unreliable across multiple attempts and versions. Due to the unsuitability of existing software tools-either for technical or legal reasons-a custom web application was developed.

-Legal aspects were carefully considered throughout the development. Within the EU, scraping is legal for scientific purposes [@urhg-60d-tdm], but institutional contracts can override this. Scraping was therefore limited to the university network and only to publishers that permit it while other publishers were scraped outside of the network. Technical details are available in the documents provided alongside the scraper that is available in the OSF repository.
+Legal aspects were carefully considered throughout the development. Within the EU, scraping is legal for scientific purposes [@urhg-60d-tdm], but institutional contracts can override this. Scraping was therefore limited to the university network and only to publishers that permit it while other publishers were scraped outside of the network. Technical details are available in the documents provided while the scraper might be made publicly available in the future.

 Downloading the analytical sample was mostly successful, though some publisher protections caused dropouts. Due to time constraints, additional runs were not feasible. Documents under 1,000 words were considered non-full-text papers. However, shorter HTML texts were retained for potential keyword matching. Text quality assessment (Flesch-Index) and word count identified missing full texts [@benoitQuantedaPackageQuantitative2018], with further analysis available in the methodological report. Full texts were downloaded for Independent Sample A and the Analytical Sample from which Sample B was drawn. The resulting dropouts should have been implicitly handled by post-stratification. Publisher-level weighting was considered but infeasible due to sparse cells that would have produced unstable weights. Post-stratification was conducted by year only, which does not correct publisher- or journal-specific dropouts. Future iterations should add publisher-level adjustment.

@@ -562,13 +392,13 @@ if (isTRUE(debug_mode)) {

 ## Classification Methods

-This section will present a summary of all methods used to classify the variables of interest. A thorough discussion of the decisions taken, the full descriptions and specifications of the models used as well as the preprocessing steps can be found in the methodology report.
+This section will present a summary of all methods used to classify the variables of interest. A thorough discussion of the decisions taken, the full descriptions and specifications of the models used as well as the preprocessing steps can be found in the methodology report. A brief description of the results can be found in the supplementary material.

 Since most existing classification approaches considered were deemed unsuitable for this scope (e.g., @kimResearchPaperClassification2019; @sanguansatFeatureMatricizationDocument2012; @jandotInteractiveSemanticFeaturing2016), this work instead relies on machine learning models trained on a manually and LLM coded subset of publications as LLMs have shown good performance on similar classification tasks [@buntValidatingUseLarge2025; @zhaoAdvancingSingleMultitask2024]. The classification of SI papers followed a staged approach. First a strict operationalization of "SI" (1) versus "not SI" (0), as well as of the OSPs with the same levels was created which was documented in a short coding manual. The process involved in the following steps:

 1. A small subset of papers from Sample A was hand-coded by the author according to the operationalization.
 2. ChatGPT classified both the hand-coded as well as the not coded publications in Sample A.
-3. A random subsample of 50 papers was coded both manually and with ChatGPT. Disagreements were carefully reviewed and manual coding was reassessed. Agreement after correction was very high ( $\kappa$ = 83,2%), with ChatGPT outperforming the author's initial coding consistency (see @fig-cfm-osp and materials for a more thorough discussion).
+3. A random subsample of 50 papers was coded both manually and with ChatGPT. Disagreements were carefully reviewed and manual coding was reassessed. Agreement after correction was very high ($\kappa$ = 83.2%), with ChatGPT outperforming the author's initial coding consistency.
 4. Due to good performance, ChatGPT was used to classify the rest of Sample A, and the combined manual/LLM labels formed the training and test data for subsequent ML models.
 5. ML Classifiers were trained on the produced classified subsample.

@@ -576,86 +406,6 @@ Classification of the training Sample B followed the same approach. For classifi

 The approach might seem overly complicated but was intitially designed to be used on a much larger corpus of publications. As time progressed during the project multiple reasons recommend a simpler approach that will be discussed later. 

-```{r}
-#| fig-cap: Confusion Matrices - Manual vs ChatGPT Labels for Open Science Practices and Statistical Inference (design-weighted)
-#| label: fig-cfm-osp
-#| fig-height: 12
-#| fig-width: 11
-cfm_gpt_open_material_corrected <- readRDS("figures/cfm_gpt_open_material_corrected.rds")
-cfm_gpt_pre_registration_corrected <- readRDS("figures/cfm_gpt_pre_registration_corrected.rds")
-cfm_gpt_open_data_corrected <- readRDS("figures/cfm_gpt_open_data_corrected.rds")
-cfm_gpt_is_statistical_corrected <- readRDS("figures/cfm_gpt_is_statistical_corrected.rds") + labs(caption = paste0("n = 225"))
-
-plots <- c("cfm_gpt_open_material_corrected", 
-           "cfm_gpt_pre_registration_corrected", 
-           "cfm_gpt_open_data_corrected", 
-           "cfm_gpt_is_statistical_corrected")
-titles <- c("Open Materials", "Preregistration", "Open Data", "Statistical Inference")
-
-plotlist <- list()
-for (i in seq_along(plots)) {
-  plot <- get(plots[i]) +
-    labs(
-      title = titles[i],
-      ) +
-      ylab("ChatGPT") +
-      xlab("Manual")
-
-  plot <- plot + scale_fill_gradient(high = "white", low = osp_cols[titles[i]])
-  plotlist[[plots[i]]] <- plot
-}
-
-# combine plots using patchwork
-combined_plot <- wrap_plots(plotlist, ncol = 2) + # remove legend
-  plot_layout(guides = "collect") & theme(legend.position = "none")
-print(combined_plot)
-if (isTRUE(debug_mode)) {
-  debug_info[[knitr::opts_current$get("label")]] <- 
-    if (knitr::is_html_output()) "HTML" else "LaTeX"
-}
-```
-
-For hyperparameter tuning and training of the ML models, the coded datasets were split into an training sample of 80% and a validation sample of 20%, stratified by the target variable as this improves training in scenarios with high class imbalance [@hilbertModelle2025]. K-Fold cross-validation was used during hyperparameter tuning to further iomprove model performance and reduce overfitting. 
-
-![Evaluation Metrics: Statistical Inference Classification](figures/combined_plot_is_statistical.pdf){#fig-evaluation-stat}
-
-The features differed in the feature construction: "TF" feature sets contained simple term frequencies of the keywords in each category whereas "n-gram" feature sets were constructed containing term frequencies of multi-word-phrases. Using ngrams has proven to enhance results in comparison to simple term frequencies in other contexts [e.g. @jandotInteractiveSemanticFeaturing2016; @ahmedDetectionOnlineFake2017], which is why I chose to include multi-gram (2 or 3 word phrases) feature sets as well as term-frequency and ngram combined feature sets in the evaluations. Multiple machine learning models were trained on those feature sets, resulting in multiple model-featureset combinations for each OSP assessed. An example of those combinations and the evaluation can be seen in @fig-jobs-osp.
-
-```{r}
-#| fig-height: 5
-#| fig-width: 10
-#| label: fig-jobs-osp
-#| fig-cap: Model, Feature and Variable Combinations
-#| fig-pos: h
-
-axis_mapping <- c(
-  "is_prereg" = "Preregistration",
-  "is_open_data" = "Open Data", 
-  "is_open_materials" = "Open Materials",
-  "is_open_access" = "Open Access"
-)
-
-jobsplot <- readRDS("figures/jobs_osp.rds") +
-  labs(
-    title = "",
-    subtitle = "",
-    x = "",
-    y = ""
-  )  + 
-  scale_fill_manual(
-    values = osp_cols2,
-    labels = axis_mapping
-  )
-
-print(jobsplot)
-if (isTRUE(debug_mode)) {
-  debug_info[[knitr::opts_current$get("label")]] <- 
-    if (knitr::is_html_output()) "HTML" else "LaTeX"
-}
-```
-
-The two top-left graphs in @fig-evaluation-stat show the performance of different feature set and model combinations measured by ROC-AUC [@fawcettIntroductionROCAnalysis2006]. The top graph identifies the XGBoost classifier combined with a simple term frequencies dataset as the top-performing model. The top-right graph shows the most important terms for the XGBoost classifier, which are primarily statistical. The confusion matrix shows that the model is quite precise, with a 91.7% accuracy and a Cohen's Kappa of 0.832. This performance is good compared to hand-coded cases. Model calibration was not highly successful as the model's probabilities were already well-calibrated, mostly at the extremes of 0 and 1. A probability threshold of 0.25 was chosen based on three different metrics. This threshold is used for the final classification, where any case with a predicted probability greater than 0.25 is classified as 1. It's also important to note that the OSP classifiers performed much worse, as detailed in @sec-evaluation-metrics.
-
 Due to time constraints and the study's pilot nature, classification evaluation and data preprocessing were only optimized for the OSP classifier, not for the SI classifier. The more thorough approach used for OSP, which addressed challenges like high computational demands and class imbalance, would have improved the SI classifier but was not feasible. Despite this, the SI classifier still performed satisfactorily, and the optimal methods are reflected in the OSP training process. Furthermore, journal-level adoption of OSPs was originally intended to be assessed using the Transparency and Openness Promotion Factor [@nosekPromotingOpenResearch2015]. However, as the available sample sizes were insufficient for journal-level analyses, these were not carried out.

 ## Analysis
@@ -668,21 +418,13 @@ Data is reported per year. As per year data given the very low prevalences is ex

 # Results & Discussion

-The research design was deliberately designed to study open-science practices via supervised classifiers rather than relying exclusively on metadata. This choice prioritized scalability and the potential to capture practice signals that metadata may miss, at the cost of managing model error and class imbalance. Given the exploratory character of the work, the analyses were not pre-defined, only data collection, sampling, and the model-training strategy were specified in advance. Concerns about classifier interpretability informed the evaluation strategy [@gilpinExplainingExplanationsOverview2018]. Two research questions were formulated: $RQ_1$ on the prevalence of OD and OM among statistical-inference (SI) publications, and $RQ_2$ on the prevalence of preregistration. After extensive model development, validation, calibration, thresholding, and misclassification adjustment, prevalences for OD, OM, and Preregistration were too low for the ML classifiers to yield interpretable, adjusted estimates. In contrast, a question that was not originally foregrounded proved answerable: the prevalence and trajectory of OA among SI publications, measured from metadata with high reliability, show clear increases over time.
+The research design was deliberately built to study open-science practices via supervised classifiers rather than relying exclusively on metadata. This choice prioritized scalability and the potential to capture practice signals that metadata may miss, at the cost of managing model error and class imbalance. Given the exploratory character of the work, the analyses were not pre-defined; only data collection, sampling, and the model-training strategy were specified in advance. Concerns about classifier interpretability informed the evaluation strategy [@gilpinExplainingExplanationsOverview2018]. Two research questions were formulated: $RQ_1$ on the prevalence of OD and OM among statistical-inference (SI) publications, and $RQ_2$ on the prevalence of preregistration. After extensive model development, validation, calibration, thresholding, and misclassification adjustment, prevalences for OD, OM, and Preregistration were too low for the ML classifiers to yield interpretable, adjusted estimates.

-$$
-\text{Accuracy} = \frac{TP + TN}{N} \quad \text{and} \quad
-\kappa = \frac{p_o - p_e}{1 - p_e}
-$$
+The ML classifiers trained on GPT labels inherit GPT's strengths and the data's sparsity. For the relatively small 20% validation set coded by GPT, the open-science practice classifiers are less precise and less reliable than the Statistical-Inference classifier.

-As expected, $\kappa$ is typically lower than Accuracy due to chance-agreement correction [@naiduReviewEvaluationMetrics2023].
-
-Category-specific results highlight class-imbalance constraints. Preregistration has only two positives in the validation sample, which makes any estimate imprecise, also resulting in a very undesirably large p-value of the accuracy-no-information-rate assumption[^1]. OM shows one false negative among six positives, and OD shows one false negative among eight positives. The SI classifier shows five false positives alongside one hundred twelve true positives and no false negatives, with all metrics indicating excellent performance.
-
-[^1]: The accuracy-no-information-rate p-value tests the null hypothesis that the accuracy is equal to the no-information rate or the accuracy when always predicting the most frequent class [@kuhnBuildingPredictiveModels2008].
-
-The ML classifiers trained on GPT labels inherit GPT's strengths and the data's sparsity.For the relatively small 20% validation set coded by GPT, the open-science practice classifiers are less precise and less reliable than the Statistical-Inference classifier. Preregistration (@fig-plt-eval-pr) appears strongest (balanced accuracy $= 99.2\%$, $F_1 = 88.9\%$, $\kappa = 88.1\%$), but the counts are sparse (four true positives, one false negative, no false positives), and the p-value versus the no-information rate ($p = 0.0853$) is not conventionally significant-an expected consequence of the very low base rate rather than a systematic error.
+In contrast, a question that was not originally foregrounded proved answerable: the prevalence and trajectory of OA among SI publications, measured from metadata with high reliability, show clear increases over time.

+Before misclassification adjustment, design-based prevalences were estimated among SI papers with 95% CIs. For outcomes identified by the ML classifiers (OD, OM, Preregistration), these reflect survey-design uncertainty only. @fig-osp-adoption shows a steady rise in OA from \~20% in 2013 to \~50% in 2023, while the other practices suffer from extremely low counts; for some years (e.g., 2013 OD; 2016 Preregistration) estimates were not possible. @tbl-osp-prev-overall confirms low prevalences across the full period: OA $40.9\%$ (38.8-43.1), OM $4.3\%$ (3.4-5.3), Preregistration $3.6\%$ (2.8-4.5), and OD $2.2\%$ (1.6-2.9).

 ```{r}
 #| tbl-cap: Sample Characteristics by Statistical Inference Status
@@ -758,14 +500,6 @@ if (isTRUE(debug_mode)) {
 }
 ```

-OM (@fig-plt-eval-om) tells a different story: despite nominal Accuracy of $94.3\%$, balanced accuracy drops to $60.0\%$ and $\kappa$ to $31.7\%$. Sensitivity is $20.0\%$ while specificity is $100.0\%$, yielding $F_1 = 33.3\%$. High nominal accuracy with a large miss rate indicates accuracy inflation under imbalance, and the p-value of $0.434$ confirms that accuracy does not exceed the no-information rate meaningfully.
-
-OD (@fig-plt-eval-od) sits between these extremes: accuracy $= 88.6\%$, balanced accuracy $= 93.7\%$, sensitivity $= 100.0\%$, specificity $= 87.3\%$. The classifier captures all positives but at the cost of eight false positives against seven true positives and 55 true negatives, which depresses precision and yields $F_1 = 63.6\%$. $\kappa = 57.9\%$ indicates moderate agreement beyond chance, and $p = 0.736$ again signals that nominal accuracy is uninformative under imbalance.
-
-In short, Preregistration appears comparatively reliable, OM is recall-limited, and OD is precision-limited. These profiles motivate reporting metrics suited to extreme class imbalance-Precision $P = \frac{TP}{TP+FP}$, Recall $R = \frac{TP}{TP+FN}$, balanced accuracy $BA = \frac{P+R}{2}$ - and anticipating how errors propagate into downstream estimates [@murphyMachineLearningProbabilistic2012; @fawcettIntroductionROCAnalysis2006].
-
-Before misclassification adjustment, design-based prevalences were estimated among SI papers with 95% CIs. For outcomes identified by the ML classifiers (OD, OM, Preregistration), these reflect survey-design uncertainty only. @fig-osp-adoption shows a steady rise in OA from \~20% in 2013 to \~50% in 2023, while the other practices suffer from extremely low counts; for some years (e.g., 2013 OD; 2016 Preregistration) estimates were not possible. @tbl-osp-prev-overall confirms low prevalences across the full period: OA $40.9\%$ (38.8-43.1), OM $4.3\%$ (3.4-5.3), Preregistration $3.6\%$ (2.8-4.5), and OD $2.2\%$ (1.6-2.9).
-
 ```{r}
 #| fig-cap: OSP Adoption Over Time, among statistical inference papers (design-weighted)
 #| label: fig-osp-adoption
@@ -1298,16 +1032,10 @@ To make sure, that our results are robust, reliable and credible, this work shal

 Materials, Data and Code are made available at a public OSF-repository that can be accessed here:

- https://osf.io/c82au/?view_only=2c3a6a46a7274a25bc7c21120b29936d. 
+- <https://osf.io/rvpc3/overview?view_only=0307dc0d99f74b50a738720a4a757aa0>

 Further instructions can be found in the README file. Full-text data and the downloader can't be made available to the public due to copyright concerns. An encrypted, password-protected file for each containing the full-texts is available in the repository. 

-::: callout-important
-
-Full reproducibility can't be guaranteed due to the dependency on data that is available online and thereby prone to constant changes (the unpaywall DB as well as the crossref data is updated constantly).
-
-:::
-
 ```{=latex}
 \newpage
 ```
@@ -1328,172 +1056,6 @@ Full reproducibility can't be guaranteed due to the dependency on data that is a
 \renewcommand\thesection{\Alph{section}}
 ```

-# SciPaperLoader {#sec-supplements-downloader}
-
-```{=latex}
-\begin{center}
-````
-
-![](img/app_screenshots/2025-08-04-174744_hyprshot.png){width=70%}
-\captionsetup{type=figure}\captionof{figure}{WebApp: Control Panel}
-
-```{=latex}
-\end{center}
-\clearpage
-\newpage
-```
-
-# Tables: OSP Adoption Over Time Among Statistical Inference Papers
-
-## OSP Adoption Over Time Among Statistical Inference Papers {#sec-osp-adoption-tables}
-
-```{r}
-#| tbl-caption: Preregistration
-# rename published_year prop prop_low prop_upp variable
-tbl_osp_prev_overall_dsadj_a <- tbl_osp_prev_overall_dsadj %>% 
-  select(published_year, prop, prop_low, prop_upp, variable) %>%
-  filter(variable == "Preregistration") %>%
-  select(-variable) %>%
-  mutate(
-    prop = percent(prop, accuracy = 0.1),
-    prop_low = percent(prop_low, accuracy = 0.1),
-    prop_upp = percent(prop_upp, accuracy = 0.1),
-  ) %>%
-  rename(
-    `Year` = published_year,
-    `Proportion` = prop,
-    `.95 CI (Lower)` = prop_low,
-    `.95 CI (Upper)` = prop_upp
-  ) %>%
-  kable(digits = 2, row.names = FALSE, booktabs = TRUE, caption = "Preregistration") %>%
-  kable_styling(position = "center", full_width = FALSE)
-
-if(output_format == "pdf/tex") {
-  tbl_osp_prev_overall_dsadj_a
-} else {
-  print("Table: Preregistration")
-}
-
-if (isTRUE(debug_mode)) {
-  debug_info[[knitr::opts_current$get("label")]] <- 
-    if (knitr::is_html_output()) "HTML" else "LaTeX"
-}
-```
-
-```{r}
-#| tbl-caption: Open Data
-# rename published_year prop prop_low prop_upp variable
-tbl_osp_prev_overall_dsadj_b <- tbl_osp_prev_overall_dsadj %>% 
-  select(published_year, prop, prop_low, prop_upp, variable) %>%
-  filter(variable == "Open Data") %>%
-  select(-variable) %>%
-  mutate(
-    prop = percent(prop, accuracy = 0.1),
-    prop_low = percent(prop_low, accuracy = 0.1),
-    prop_upp = percent(prop_upp, accuracy = 0.1),
-  ) %>%
-  rename(
-    `Year` = published_year,
-    `Proportion` = prop,
-    `.95 CI (Lower)` = prop_low,
-    `.95 CI (Upper)` = prop_upp
-  ) %>%
-  kable(digits = 2, row.names = FALSE, booktabs = TRUE, caption = "Open Data") %>%
-  kable_styling(position = "center", full_width = FALSE)
-
-if(output_format == "pdf/tex") {
-  tbl_osp_prev_overall_dsadj_b
-} else {
-  print("Table: Open Data")
-}
-
-if (isTRUE(debug_mode)) {
-  debug_info[[knitr::opts_current$get("label")]] <- 
-    if (knitr::is_html_output()) "HTML" else "LaTeX"
-}
-```
-
-```{=latex}
-\clearpage
-\newpage
-```
-
-```{r}
-# rename published_year prop prop_low prop_upp variable
-tbl_osp_prev_overall_dsadj_c <- tbl_osp_prev_overall_dsadj %>% 
-  select(published_year, prop, prop_low, prop_upp, variable) %>%
-  filter(variable == "Open Materials") %>%
-  select(-variable) %>%
-  mutate(
-    prop = percent(prop, accuracy = 0.1),
-    prop_low = percent(prop_low, accuracy = 0.1),
-    prop_upp = percent(prop_upp, accuracy = 0.1),
-  ) %>%
-  rename(
-    `Year` = published_year,
-    `Proportion` = prop,
-    `.95 CI (Lower)` = prop_low,
-    `.95 CI (Upper)` = prop_upp
-  ) %>%
-  kable(digits = 2, row.names = FALSE, booktabs = TRUE, caption = "Open Materials") %>%
-  kable_styling(position = "center", full_width = FALSE)
-
-if(output_format == "pdf/tex") {
-  tbl_osp_prev_overall_dsadj_c
-} else {
-  print("Table: Open Materials")
-}
-
-if (isTRUE(debug_mode)) {
-  debug_info[[knitr::opts_current$get("label")]] <- 
-    if (knitr::is_html_output()) "HTML" else "LaTeX"
-}
-```
-
-```{r}
-# rename published_year prop prop_low prop_upp variable
-tbl_osp_prev_overall_dsadj_d <- tbl_osp_prev_overall_dsadj %>% 
-  select(published_year, prop, prop_low, prop_upp, variable) %>%
-  filter(variable == "Open Access") %>%
-  select(-variable) %>%
-  mutate(
-    prop = percent(prop, accuracy = 0.1),
-    prop_low = percent(prop_low, accuracy = 0.1),
-    prop_upp = percent(prop_upp, accuracy = 0.1),
-  ) %>%
-  rename(
-    `Year` = published_year,
-    `Proportion` = prop,
-    `.95 CI (Lower)` = prop_low,
-    `.95 CI (Upper)` = prop_upp
-  ) %>%
-  kable(digits = 2, row.names = FALSE, booktabs = TRUE, caption = "Open Access") %>%
-  kable_styling(position = "center", full_width = FALSE)
-
-if(output_format == "pdf/tex") {
-  tbl_osp_prev_overall_dsadj_d
-} else {
-  print("Table: Open Access")
-}
-
-if (isTRUE(debug_mode)) {
-  debug_info[[knitr::opts_current$get("label")]] <- 
-    if (knitr::is_html_output()) "HTML" else "LaTeX"
-}
-```
-
-```{=latex}
-\clearpage
-```
-
-# Evaluation Metrics {#sec-evaluation-metrics}
-
-![Evaluation Metrics: Open Data](figures/combined_plot_is_open_data.pdf){#fig-plt-eval-od fig-pos=H}
-
-![Evaluation Metrics: Open Materials](figures/combined_plot_is_open_materials.pdf){#fig-plt-eval-om fig-pos=H}
-
-![Evaluation Metrics: Preregistration](figures/combined_plot_is_prereg.pdf){#fig-plt-eval-pr fig-pos=H}
-
 ```{r}
 #| results: asis