MiningTransparencyManuscript/literature/Thesis.bib

@incollection{4ff8afa9-5c92-3c50-b832-a1756ccbeedc,
  title = {The Fourth Rule: {{Replicate Where Possible}}},
  booktitle = {Seven Rules for Social Research},
  author = {Firebaugh, Glenn},
  year = {2008},
  eprint = {j.ctv39x5wn.7},
  eprinttype = {jstor},
  pages = {90--119},
  publisher = {Princeton University Press},
  urldate = {2025-08-23},
  abstract = {Rule 4 is the replication rule. The replication rule is a natural follow-up to rule 3, ``Build reality checks into your research.'' Rule 3 advises you to look for ways to cross-check your results both internally---using other information in your data set---and externally---using different methods and data sets. In multiple-method research, as described in the previous chapter, your aim is to see if different methods and different sorts of data lead to the same conclusions. Rule 4 advises replication---the identical analysis (same measures, models, and estimation methods) of parallel data sets (different samples of the same}
}

@article{sang-woonResearchPaperClassification2019,
  title = {Research Paper Classification Systems Based on {{TF-IDF}} and {{LDA}} Schemes},
  author = {Kim, Sang-Woon and Gil, Joon-Min},
  year = {2019},
  month = aug,
  journal = {Human-centric Computing and Information Sciences},
  volume = {9},
  number = {1},
  pages = {30},
  issn = {2192-1962},
  doi = {10.1186/s13673-019-0192-7},
  urldate = {2024-12-16},
  abstract = {With the increasing advance of computer and information technologies, numerous research papers have been published online as well as offline, and as new research fields have been continuingly created, users have a lot of trouble in finding and categorizing their interesting research papers. In order to overcome the limitations, this paper proposes a research paper classification system that can cluster research papers into the meaningful class in which papers are very likely to have similar subjects. The proposed system extracts representative keywords from the abstracts of each paper and topics by Latent Dirichlet allocation (LDA) scheme. Then, the K-means clustering algorithm is applied to classify the whole papers into research papers with similar subjects, based on the Term frequency-inverse document frequency (TF-IDF) values of each paper.},
  langid = {english},
  keywords = {Artificial Intelligence,K-means clustering,LDA,Paper classification,TF-IDF},
  file = {/home/michaelb/Zotero/storage/23YFBPYR/Kim and Gil - 2019 - Research paper classification systems based on TF-IDF and LDA schemes.pdf}
}

@inproceedings{ramosUsingTFIDFDetermine2003,
  title = {Using {{TF-IDF}} to {{Determine Word Relevance}} in {{Document Queries}}},
  booktitle = {Proceedings of the {{First Instructional Conference}} on {{Machine Learning}}},
  author = {Ramos, J. E.},
  year = {2003},
  urldate = {2026-05-18},
  abstract = {In this paper, we examine the results of applying Term Frequency Inverse Document Frequency (TF-IDF) to determine what words in a corpus of documents might be more favorable to use in a query. As the term implies, TF-IDF calculates values for each word in a document through an inverse proportion of the frequency of the word in a particular document to the percentage of documents the word appears in. Words with high TF-IDF numbers imply a strong relationship with the document they appear in, suggesting that if that word were to appear in a query, the document could be of interest to the user. We provide evidence that this simple algorithm efficiently categorizes relevant words that can enhance query retrieval.}
}

@article{gonzalez-salaCaracterizacionPsicologiaJuridica2017,
  title = {Characterization of {{Legal Psychology}} through Psychology Journals Included in {{Criminology}} \& {{Penology}} and {{Law}} Categories of {{Web}} of {{Science}}},
  author = {{Gonz{\'a}lez-Sala}, Francisco and {Osca-Lluch}, Julia and Tortosa Gil, Francisco and Pe{\~n}aranda Ortega, Mar{\'i}a},
  year = {2017},
  month = mar,
  journal = {Anales de Psicolog{\'i}a},
  volume = {33},
  number = {2},
  pages = {411},
  issn = {1695-2294, 0212-9728},
  doi = {10.6018/analesps.33.2.262591},
  urldate = {2026-05-18},
  abstract = {The objective of this work is to learn about the most relevant aspects that characterize contemporary Legal Psychology throughout the study of journals included in the WoS between the years 2009 and 2014 related with the area of Psychology. The number of selected publications is 16, mainly from the USA and Great Britain. The results show an increase in the number of works and authors, a greater collaboration and a growth in medium productors. It exists a major presence of men in editorial boards and as authors, outstanding the figures of T. Ward in 2009 and A. Vrij in 2014. According to the analysis of key words the most relevant themes during these years have been Crime, Conduct, Woman and Meta-analysis, being sexual violence towards children and women and gender violence the criminal typology most studied.},
  copyright = {http://revistas.um.es/analesps/about/submissions\#copyrightNotice},
  file = {/home/michaelb/Zotero/storage/KC3L68AL/González-Sala et al. - 2017 - Characterization of Legal Psychology through psychology journals included in Criminology & Penology.pdf}
}

@inproceedings{abdennourEnsembleLearningModel2023,
  title = {Ensemble {{Learning Model}} for {{Medical Text Classification}}},
  booktitle = {Web {{Information Systems Engineering}} -- {{WISE}} 2023},
  author = {Ben Abdennour, Ghada and Gasmi, Karim and Ejbali, Ridha},
  editor = {Zhang, Feng and Wang, Hua and Barhamgi, Mahmoud and Chen, Lu and Zhou, Rui},
  year = {2023},
  pages = {3--12},
  publisher = {Springer Nature},
  address = {Singapore},
  doi = {10.1007/978-981-99-7254-8_1},
  abstract = {Automatic text classification, in which textual data is categorized into specified categories based on its content, is a classic issue in the science of Natural Language Processing (NLP). These models have proven useful when applied to data with several dimensions, including sparse features. It would appear that machine learning and other statistical approaches, like those employed in medical text classification, are highly effective for these jobs. Yet a lot of manual labor is still needed to classify the massive dataset used for training. Pretrained language models, such as machine learning models, have been proven effective in recent studies, demonstrating their capacity to reduce the time and effort spent on feature engineering. Yet, there is no statistically significant improvement in performance when applying the machine learning model directly to the classification job. We present a RFSVM algorithm-based hybrid machine learning model to boost the accuracy of the machine learning prediction. The model has three steps: (1) medical text processing; (2) medical text feature extraction; and (3) ensemble learning model for text classification. Using the PubMed dataset, we conducted experiments demonstrating that the proposed strategy greatly improves the precision of the results.},
  isbn = {978-981-99-7254-8},
  langid = {english},
  keywords = {Classification,Ensemble learning,Hybridisation,Medical Text},
  file = {/home/michaelb/Zotero/storage/2IBX8VGK/Abdennour et al. - 2023 - Ensemble Learning Model for Medical Text Classification.pdf}
}

@inproceedings{abdollahiOntologybasedTwoStageApproach2019,
  title = {An {{Ontology-based Two-Stage Approach}} to {{Medical Text Classification}} with {{Feature Selection}} by {{Particle Swarm Optimisation}}},
  booktitle = {2019 {{IEEE Congress}} on {{Evolutionary Computation}} ({{CEC}})},
  author = {Abdollahi, Mahdi and Gao, Xiaoying and Mei, Yi and Ghosh, Shameek and Li, Jinyan},
  year = {2019},
  month = jun,
  pages = {119--126},
  publisher = {IEEE},
  doi = {10.1109/CEC.2019.8790259},
  urldate = {2024-12-16},
  abstract = {Document classification (DC) is the task of assigning pre-defined labels to unseen documents by utilizing a model trained on the available labeled documents. DC has attracted much attention in medical fields recently because many issues can be formulated as a classification problem. It can assist doctors in decision making and correct decisions can reduce the medical expenses. Medical documents have special attributes that distinguish them from other texts and make them difficult to analyze. For example, many acronyms and abbreviations, and short expressions make it more challenging to extract information. The classification accuracy of the current medical DC methods is not satisfactory. The goal of this work is to enhance the input feature sets of the DC method to improve the accuracy. To approach this goal, a novel two-stage approach is proposed. In the first stage, a domain-specific dictionary, namely the Unified Medical Language System (UMLS), is employed to extract the key features belonging to the most relevant concepts such as diseases or symptoms. In the second stage, PSO is applied to select more related features from the extracted features in the first stage. The performance of the proposed approach is evaluated on the 2010 Informatics for Integrating Biology and the Bedside (i2b2) data set which is a widely used medical text dataset. The experimental results show substantial improvement by the proposed method on the accuracy of classification.},
  keywords = {Conceptualization,Diseases,Feature extraction,Feature Selection,Medical Text Classification,Ontology,Particle swarm optimization,Task analysis,Text mining,Unified modeling language},
  file = {/home/michaelb/Zotero/storage/IG9J8G67/Abdollahi et al. - 2019 - An Ontology-based Two-Stage Approach to Medical Text Classification with Feature Selection by Partic.pdf;/home/michaelb/Zotero/storage/MLFVZT8V/8790259.html}
}

@article{abtRegisteredReportsJournal2021,
  title = {Registered {{Reports}} in the {{Journal}} of {{Sports Sciences}}},
  author = {Abt, Grant and Boreham, Colin and Davison, Gareth and Jackson, Robin and Wallace, Eric and Williams, A. Mark},
  year = {2021},
  month = aug,
  journal = {Journal of Sports Sciences},
  volume = {39},
  number = {16},
  pages = {1789--1790},
  publisher = {Routledge},
  issn = {0264-0414},
  doi = {10.1080/02640414.2021.1950974},
  urldate = {2024-11-06},
  pmid = {34379576},
  file = {/home/michaelb/Zotero/storage/RKLIRD6R/Abt et al. - 2021 - Registered Reports in the Journal of Sports Sciences.pdf}
}

@article{acciaiEstimatingSocialBias2023,
  author     = {Acciai, Claudia and Schneider, Jesper W. and Nielsen, Mathias W.},
  title      = {Estimating Social Bias in Data Sharing Behaviours: An Open Science Experiment},
  shorttitle = {Estimating Social Bias in Data Sharing Behaviours},
  journal    = {Scientific Data},
  year       = {2023},
  month      = apr,
  volume     = {10},
  number     = {1},
  pages      = {233},
  publisher  = {Nature Publishing Group},
  issn       = {2052-4463},
  doi        = {10.1038/s41597-023-02129-8},
  urldate    = {2025-08-26},
  langid     = {english},
  keywords   = {Scientific community,Social sciences},
  copyright  = {2023 The Author(s)},
  abstract   = {Open data sharing is critical for scientific progress. Yet, many authors refrain from sharing scientific data, even when they have promised to do so. Through a preregistered, randomized audit experiment (N\,=\,1,634), we tested possible ethnic, gender and status-related bias in scientists' data-sharing willingness. 814 (54\%) authors of papers where data were indicated to be `available upon request' responded to our data requests, and 226 (14\%) either shared or indicated willingness to share all or some data. While our preregistered hypotheses regarding bias in data-sharing willingness were not confirmed, we observed systematically lower response rates for data requests made by putatively Chinese treatments compared to putatively Anglo-Saxon treatments. Further analysis indicated a theoretically plausible heterogeneity in the causal effect of ethnicity on data-sharing. In interaction analyses, we found indications of lower responsiveness and data-sharing willingness towards male but not female data requestors with Chinese names. These disparities, which likely arise from stereotypic beliefs about male Chinese requestors' trustworthiness and deservingness, impede scientific progress by preventing the free circulation of knowledge.},
  file       = {/home/michaelb/Zotero/storage/TYECU32E/Acciai et al. - 2023 - Estimating social bias in data sharing behaviours an open science experiment.pdf}
}

@article{agExpandingEarlyLate2018,
  title = {Expanding the Early and Late Starter Model of Criminal Justice Involvement for Forensic Mental Health Clients},
  author = {Crocker, Anne G. and Martin, Michael S. and Leclair, Marichelle C. and Nicholls, Tonia L. and Seto, Michael C.},
  year = {2018},
  month = feb,
  journal = {Law and Human Behavior},
  volume = {42},
  number = {1},
  pages = {83--93},
  publisher = {American Psychological Association},
  issn = {1573-661X},
  doi = {10.1037/lhb0000269},
  urldate = {2025-07-26},
  abstract = {The early and late starter model provides one of the most enduring frameworks for understanding the developmental course and severity of violence and criminality among individuals with severe mental illness. We expanded the model to account for differences in the age of onset of criminal behavior an {\dots}},
  langid = {english},
  pmid = {29172557},
  file = {/home/michaelb/Zotero/storage/FMMUUNFP/29172557.html}
}

@article{agrestiApproximateBetterExact1998,
  title = {Approximate {{Is Better}} than ``{{Exact}}'' for {{Interval Estimation}} of {{Binomial Proportions}}},
  author = {Agresti, Alan and Coull, Brent A.},
  year = {1998},
  journal = {The American Statistician},
  volume = {52},
  number = {2},
  eprint = {2685469},
  eprinttype = {jstor},
  pages = {119--126},
  publisher = {American Statistical Association, Taylor \& Francis, Ltd.},
  issn = {0003-1305},
  doi = {10.2307/2685469},
  urldate = {2025-08-03},
  abstract = {For interval estimation of a proportion, coverage probabilities tend to be too large for ``exact'' confidence intervals based on inverting the binomial test and too small for the interval based on inverting the Wald large-sample normal test (i.e., sample proportion {$\pm$} z-score {$\times$} estimated standard error). Wilson's suggestion of inverting the related score test with null rather than estimated standard error yields coverage probabilities close to nominal confidence levels, even for very small sample sizes. The 95\% score interval has similar behavior as the adjusted Wald interval obtained after adding two ``successes'' and two ``failures'' to the sample. In elementary courses, with the score and adjusted Wald methods it is unnecessary to provide students with awkward sample size guidelines.},
  file = {/home/michaelb/Zotero/storage/E4H5LRDH/Agresti and Coull - 1998 - Approximate Is Better than Exact for Interval Estimation of Binomial Proportions.pdf}
}

@book{agrestiIntroductionCategoricalData2007,
  title = {An Introduction to Categorical Data Analysis},
  author = {Agresti, Alan},
  year = {2007},
  series = {Wiley Series in Probability and Statistics},
  edition = {2},
  publisher = {Wiley-Interscience},
  address = {Hoboken, NJ},
  isbn = {978-0-471-22618-5},
  langid = {english},
  lccn = {519.535}
}

@article{ahmadisharafRepresentativeSampleSize2024,
  author     = {Ahmadisharaf, Amin and Nematirad, Reza and Sabouri, Sadra and Pachepsky, Yakov and Ghanbarian, Behzad},
  title      = {Representative {{Sample Size}} for {{Estimating Saturated Hydraulic Conductivity}} via {{Machine Learning}}: {{A Proof-Of-Concept Study}}},
  shorttitle = {Representative {{Sample Size}} for {{Estimating Saturated Hydraulic Conductivity}} via {{Machine Learning}}},
  journal    = {Water Resources Research},
  year       = {2024},
  volume     = {60},
  number     = {8},
  pages      = {e2023WR036783},
  issn       = {1944-7973},
  doi        = {10.1029/2023WR036783},
  urldate    = {2025-03-28},
  langid     = {english},
  keywords   = {data,heterogeneity,machine learning,representative sample size,saturated hydraulic conductivity},
  copyright  = {{\copyright} 2024. The Author(s).},
  abstract   = {Machine learning (ML) has been extensively applied in various disciplines. However, not much attention has been paid to data heterogeneity in databases and number of samples used to train ML models in hydrology. In this study, we addressed these issues and their impacts on the accuracy and reliability of ML models in the estimation of saturated hydraulic conductivity, Ks. We selected 17,990 soil samples from the USKSAT database and created random subsets N = 2,000, 4,000, 6,000, 8,000, 10,000, 12,000, 14,000, 16,000, and 17,990, 80\% of which were used for training. The random subset selection was repeated 50 times. The extreme gradient boosting (XGBoost) algorithm was used to estimate Ks from other soil properties, such as bulk density, soil depth, texture, and organic content. For each subset, we conducted the learning curve analysis on the training and cross-validation data sets. Results showed that for all training sample sizes the number of samples was not enough for the training and cross-validation curves to reach a plateau. We also applied the concept of representative elementary volume by plotting the average coefficient of determination, R2, and root mean square log-transformed error, RMSLE, against the training sample size. For the testing data set, as the number of training sample size increased from 1,600 to 14,392 the average R2 value increased from 0.74 to 0.90, while the average RMSLE value decreased from 1.08 to 0.69. Either the learning curve or representative sample size analysis is required to investigate whether the number of samples is enough or not.},
  file       = {/home/michaelb/Zotero/storage/NMUU5EZH/Ahmadisharaf et al. - 2024 - Representative Sample Size for Estimating Saturated Hydraulic Conductivity via Machine Learning A P.pdf}
}

@inproceedings{ahmedDetectionOnlineFake2017,
  author    = {Ahmed, Hadeer and Traore, Issa and Saad, Sherif},
  title     = {Detection of {{Online Fake News Using N-Gram Analysis}} and {{Machine Learning Techniques}}},
  booktitle = {Intelligent, {{Secure}}, and {{Dependable Systems}} in {{Distributed}} and {{Cloud Environments}}},
  editor    = {Traore, Issa and Woungang, Isaac and Awad, Ahmed},
  year      = {2017},
  pages     = {127--138},
  publisher = {Springer International Publishing},
  address   = {Cham},
  isbn      = {978-3-319-69155-8},
  doi       = {10.1007/978-3-319-69155-8_9},
  langid    = {english},
  keywords  = {Fake news detection,N-gram analysis,Online fake news,Online social network security,Text classification},
  abstract  = {Fake news is a phenomenon which is having a significant impact on our social life, in particular in the political world. Fake news detection is an emerging research area which is gaining interest but involved some challenges due to the limited amount of resources (i.e., datasets, published literature) available. We propose in this paper, a fake news detection model that use n-gram analysis and machine learning techniques. We investigate and compare two different features extraction techniques and six different machine classification techniques. Experimental evaluation yields the best performance using Term Frequency-Inverted Document Frequency (TF-IDF) as feature extraction technique, and Linear Support Vector Machine (LSVM) as a classifier, with an accuracy of 92\%.},
  file      = {/home/michaelb/Zotero/storage/FSN9J57K/Ahmed et al. - 2017 - Detection of Online Fake News Using N-Gram Analysis and Machine Learning Techniques.pdf}
}

@article{aissaouiferhiEnhancingDiagnosticAccuracy2024,
  title = {Enhancing Diagnostic Accuracy in Symptom-Based Health Checkers: A Comprehensive Machine Learning Approach with Clinical Vignettes and Benchmarking},
  shorttitle = {Enhancing Diagnostic Accuracy in Symptom-Based Health Checkers},
  author = {Aissaoui Ferhi, Leila and Ben Amar, Manel and Choubani, Fethi and Bouallegue, Ridha},
  year = {2024},
  month = oct,
  journal = {Frontiers in Artificial Intelligence},
  volume = {7},
  pages = {1397388},
  publisher = {Frontiers},
  issn = {2624-8212},
  doi = {10.3389/frai.2024.1397388},
  urldate = {2025-01-13},
  langid = {english},
  keywords = {benchmarking,clinical vignettes,confusion matrix,health checker,machine learning,precision-recall curve,ROC/AUC curves,symptoms},
  file = {/home/michaelb/Zotero/storage/QWEBC9KJ/Aissaoui Ferhi et al. - 2024 - Enhancing diagnostic accuracy in symptom-based health checkers a comprehensive machine learning app.pdf}
}

@article{akbaritabarGenderPatternsPublication2021,
  author    = {Akbaritabar, Aliakbar and Squazzoni, Flaminio},
  title     = {Gender {{Patterns}} of {{Publication}} in {{Top Sociological Journals}}},
  journal   = {Science, Technology, \& Human Values},
  year      = {2021},
  month     = may,
  volume    = {46},
  number    = {3},
  pages     = {555--576},
  publisher = {SAGE Publications Inc},
  issn      = {0162-2439},
  doi       = {10.1177/0162243920941588},
  urldate   = {2024-12-15},
  langid    = {english},
  abstract  = {This article examines publication patterns over the last seventy years from the American Sociological Review and American Journal of Sociology, the two most prominent journals in sociology. We reconstructed the gender of all published authors and each author's academic pedigree. Results would suggest that these journals published disproportionally more articles by male authors and their coauthors. These gender inequalities persisted even when considering citations and after controlling for the influence of academic affiliation. It would seem that the potentially positive advantage of working in a prestigious, elite sociology department, in terms of better learning environment and reputational signal, for higher publication opportunities only significantly benefits male authors. While our findings do not mean that these journals have biased internal policies or implicit practices, this publication pattern needs to be considered especially regarding the possibility of their ``social closure'' and isomorphism.},
  file      = {/home/michaelb/Zotero/storage/Z2P2N3KM/Akbaritabar and Squazzoni - 2021 - Gender Patterns of Publication in Top Sociological Journals.pdf}
}

@article{akkerPreregistrationSecondaryData2021,
  title = {Preregistration of Secondary Data Analysis: {{A}} Template and Tutorial},
  shorttitle = {Preregistration of Secondary Data Analysis},
  author = {van den Akker, Olmo R. and Weston, Sara and Campbell, Lorne and Chopik, Bill and Damian, Rodica and {Davis-Kean}, Pamela and Hall, Andrew and Kosie, Jessica and Kruse, Elliott and Olsen, Jerome and Ritchie, Stuart and Valentine, K. D. and van 't Veer, Anna and Bakker, Marjan},
  year = {2021},
  month = nov,
  journal = {Meta-Psychology},
  volume = {5},
  issn = {2003-2714},
  doi = {10.15626/MP.2020.2625},
  urldate = {2024-11-06},
  abstract = {Preregistration has been lauded as one of the solutions to the so-called `crisis of confidence' in the social sciences and has therefore gained popularity in recent years. However, the current guidelines for preregistration have been developed primarily for studies where new data will be collected. Yet, preregistering secondary data analyses---where new analyses are proposed for existing data---is just as important, given that researchers' hypotheses and analyses may be biased by their prior knowledge of the data. The need for proper guidance in this area is especially desirable now that data is increasingly shared publicly. In this tutorial, we present a template specifically designed for the preregistration of secondary data analyses and provide comments and a worked example that may help with using the template effectively. Through this illustration, we show that completing such a template is feasible, helps limit researcher degrees of freedom, and may make researchers more deliberate in their data selection and analysis efforts.},
  copyright = {Copyright (c) 2021 Olmo van den Akker, Sara Weston, Lorne Campbell, Bill Chopik, Rodica Damian, Pamela Davis-Kean, Andrew Hall, Jessica Kosie, Elliott Kruse, Jerome Olsen, Stuart Ritchie, KD Valentine, Anna van 't Veer, Marjan Bakker},
  langid = {english},
  keywords = {preregistration,secondary data analysis},
  file = {/home/michaelb/Zotero/storage/YH9JQF8M/Akker et al. - 2021 - Preregistration of secondary data analysis A template and tutorial.pdf}
}

@article{akninEmotionalRewardsProsocial2022,
  author    = {Aknin, Lara B. and Dunn, Elizabeth W. and Whillans, Ashley V.},
  title     = {The {{Emotional Rewards}} of {{Prosocial Spending Are Robust}} and {{Replicable}} in {{Large Samples}}},
  journal   = {Current Directions in Psychological Science},
  year      = {2022},
  month     = dec,
  volume    = {31},
  number    = {6},
  pages     = {536--545},
  publisher = {SAGE Publications Inc},
  issn      = {0963-7214},
  doi       = {10.1177/09637214221121100},
  urldate   = {2025-07-25},
  langid    = {english},
  abstract  = {Past studies show that spending money on other people---prosocial spending---increases a person's happiness. However, foundational research on this topic was conducted prior to psychology's credibility revolution (or ``replication crisis''), so it is essential to ask whether the evidence supporting this claim is robust and replicable. Here, we consider all 15 published preregistered experiments on prosocial spending to evaluate whether there is causal evidence for the idea that spending money on other people promotes happiness. Although the evidence appears somewhat mixed, we argue that the emotional benefits of prosocial spending are robust and replicable in large samples. These benefits are particularly likely when people have some choice about whether or how to give and when they understand how their generosity makes a difference. This review provides renewed support for the idea that prosocial spending promotes happiness and offers a template for revisiting phenomena that were established prior to the credibility revolution.},
  file      = {/home/michaelb/Zotero/storage/G5NI6VC4/Aknin et al. - 2022 - The Emotional Rewards of Prosocial Spending Are Robust and Replicable in Large Samples.pdf}
}

@article{aliMALGRAMachineLearning2020,
  author     = {Ali, Muhammad and Shiaeles, Stavros and Bendiab, Gueltoum and Ghita, Bogdan},
  title      = {{{MALGRA}}: {{Machine Learning}} and {{N-Gram Malware Feature Extraction}} and {{Detection System}}},
  shorttitle = {{{MALGRA}}},
  journal    = {Electronics},
  year       = {2020},
  month      = nov,
  volume     = {9},
  number     = {11},
  pages      = {1777},
  publisher  = {Multidisciplinary Digital Publishing Institute},
  issn       = {2079-9292},
  doi        = {10.3390/electronics9111777},
  urldate    = {2025-07-25},
  langid     = {english},
  keywords   = {API call,Decision Tree,dynamic analysis,Logistic Regression,machine learning,malware,N-grams,Naive Bayes,Random Forests,sandbox,SNDBOX},
  copyright  = {http://creativecommons.org/licenses/by/3.0/},
  abstract   = {Detection and mitigation of modern malware are critical for the normal operation of an organisation. Traditional defence mechanisms are becoming increasingly ineffective due to the techniques used by attackers such as code obfuscation, metamorphism, and polymorphism, which strengthen the resilience of malware. In this context, the development of adaptive, more effective malware detection methods has been identified as an urgent requirement for protecting the IT infrastructure against such threats, and for ensuring security. In this paper, we investigate an alternative method for malware detection that is based on N-grams and machine learning. We use a dynamic analysis technique to extract an Indicator of Compromise (IOC) for malicious files, which are represented using N-grams. The paper also proposes TF-IDF as a novel alternative used to identify the most significant N-grams features for training a machine learning algorithm. Finally, the paper evaluates the proposed technique using various supervised machine-learning algorithms. The results show that Logistic Regression, with a score of 98.4\%, provides the best classification accuracy when compared to the other classifiers used.},
  file       = {/home/michaelb/Zotero/storage/4U5WZZJP/Ali et al. - 2020 - MALGRA Machine Learning and N-Gram Malware Feature Extraction and Detection System.pdf}
}

@article{alinMulticollinearity2010,
  author    = {Alin, Aylin},
  title     = {Multicollinearity},
  journal   = {WIREs Computational Statistics},
  year      = {2010},
  volume    = {2},
  number    = {3},
  pages     = {370--374},
  issn      = {1939-0068},
  doi       = {10.1002/wics.84},
  urldate   = {2025-07-28},
  langid    = {english},
  keywords  = {collinearity,correlation,ill-conditioned data,linear regression,multicollinearity},
  copyright = {Copyright {\copyright} 2010 John Wiley \& Sons, Inc.},
  abstract  = {Multicollinearity refers to the linear relation among two or more variables. It is a data problem which may cause serious difficulty with the reliability of the estimates of the model parameters. In this article, multicollinearity among the explanatory variables in the multiple linear regression model is considered. Its effects on the linear regression model and some multicollinearity diagnostics for this model are presented. Copyright {\copyright} 2010 John Wiley \& Sons, Inc. This article is categorized under: Statistical Models {$>$} Linear Models Statistical Models {$>$} Multivariate Models},
  file      = {/home/michaelb/Zotero/storage/38FGNNPQ/Alin - 2010 - Multicollinearity.pdf;/home/michaelb/Zotero/storage/JV93WAIL/wics.html}
}

@article{allemandPersonalityChangeDigitalCoaching2022,
  author    = {Allemand, Mathias and Fl{\"u}ckiger, Christoph},
  title     = {Personality {{Change Through Digital-Coaching Interventions}}},
  journal   = {Current Directions in Psychological Science},
  year      = {2022},
  month     = feb,
  volume    = {31},
  number    = {1},
  pages     = {41--48},
  publisher = {SAGE Publications Inc},
  issn      = {0963-7214},
  doi       = {10.1177/09637214211067782},
  urldate   = {2025-07-24},
  langid    = {english},
  abstract  = {A highly relevant but provocative research question is whether and how one can intentionally change personality traits through psychological interventions, given that traits are relatively stable by definition. Recently, research has begun to investigate personality change through intervention in nonclinical populations. One attractive and innovative interventional avenue may lie in using digital applications to guide and support people in their desire to change their personality and trigger change processes. This article provides a rationale for nonclinical personality-change interventions and discusses motivations to change, the potential of using digital applications for intervention efforts, key studies that illustrate this emerging field of research, and future directions.},
  file      = {/home/michaelb/Zotero/storage/N4TBTJL7/Allemand and Flückiger - 2022 - Personality Change Through Digital-Coaching Interventions.pdf}
}

@misc{americanpsychologicalassociationOpenScienceBadges,
  title = {Open {{Science Badges}}},
  author = {{American Psychological Association}},
  url = {https://www.apa.org/pubs/journals/resources/open-science-badges},
  urldate = {2025-08-04},
  abstract = {Open science badges are an incentive developed by the Center for Open Science for authors who share data or materials and who preregister studies and/or analysis plans.},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/E9T64QNZ/open-science-badges.html}
}

@article{andersonDesignBasedResearchDecade2012,
  author     = {Anderson, Terry and Shattuck, Julie},
  title      = {Design-{{Based Research}}: {{A Decade}} of {{Progress}} in {{Education Research}}?},
  shorttitle = {Design-{{Based Research}}},
  journal    = {Educational Researcher},
  year       = {2012},
  month      = jan,
  volume     = {41},
  number     = {1},
  pages      = {16--25},
  publisher  = {American Educational Research Association},
  issn       = {0013-189X},
  doi        = {10.3102/0013189X11428813},
  urldate    = {2025-07-26},
  langid     = {english},
  abstract   = {Design-based research (DBR) evolved near the beginning of the 21st century and was heralded as a practical research methodology that could effectively bridge the chasm between research and practice in formal education. In this article, the authors review the characteristics of DBR and analyze the five most cited DBR articles from each year of this past decade. They illustrate the context, publications, and most popular interventions utilized. They conclude that interest in DBR is increasing and that results provide limited evidence for guarded optimism that the methodology is meeting its promised benefits.},
  file       = {/home/michaelb/Zotero/storage/6YHSNWWS/Anderson and Shattuck - 2012 - Design-Based Research A Decade of Progress in Education Research.pdf}
}

@article{ashbyOpenAccessAvailabilityCriminological2020,
  title = {The {{Open-Access Availability}} of {{Criminological Research}} to {{Practitioners}} and {{Policy Makers}}},
  author = {Ashby, Matthew P. J.},
  year = {2020},
  month = oct,
  journal = {Journal of Criminal Justice Education},
  volume = {32},
  number = {1},
  pages = {1--21},
  publisher = {Routledge},
  issn = {1051-1253},
  doi = {10.1080/10511253.2020.1838588},
  urldate = {2025-03-28},
  abstract = {Criminology produces policy-relevant research and criminologists often seek to influence practice, but most criminological research is confined to expensive subscription journals. This disadvantages researchers in the global south, policy makers and practitioners who have the skills to use research findings but do not have journal subscriptions. Open access seeks to increase availability of research, but take-up among criminologists has been low. This study used a sample of 12,541 articles published in criminology journals between 2017 and 2019 to estimate the proportion of articles available via different types of open access. Overall 22\% of research was available to non-subscribers, about half that found in other disciplines, even though authors had the right to make articles open without payment in at least 95\% of cases. Open access was even less common in many leading journals and among researchers in the United States. Open access has the potential to increase access to research for those outside academia, but few scholars exercise their existing rights to distribute freely the submitted or accepted versions of their articles online. Policies to incentivise authors to make research open access where possible are needed unlock the benefits of greater access to criminological research.},
  keywords = {Evidence-based policy,open access,translational criminology},
  file = {/home/michaelb/Zotero/storage/NTYTE8TP/Ashby - 2020 - The Open-Access Availability of Criminological Research to Practitioners and Policy Makers.pdf}
}

@article{auspurgAusmassUndRisikofaktoren2014,
  title = {{Ausma{\ss} und Risikofaktoren des Publication Bias in der deutschen Soziologie}},
  author = {Auspurg, Katrin and Hinz, Thomas and Schneck, Andreas},
  year = {2014},
  month = dec,
  journal = {KZfSS K{\"o}lner Zeitschrift f{\"u}r Soziologie und Sozialpsychologie},
  volume = {66},
  number = {4},
  pages = {549--573},
  issn = {1861-891X},
  doi = {10.1007/s11577-014-0284-3},
  urldate = {2024-12-15},
  abstract = {Die statistische Signifikanz von Forschungsergebnissen wird oft f{\"a}lschlicherweise als ein Indikator f{\"u}r deren Relevanz und Aussagekraft gehalten. Signifikante Ergebnisse werden eher ver{\"o}ffentlicht, obwohl nicht-signifikante Ergebnisse gleicherma{\ss}en f{\"u}r den Erkenntnisfortschritt bedeutsam sind. Die Folgen sind eine {\"U}bersch{\"a}tzung von Effektst{\"a}rken und eine zu optimistische Beurteilung von Theorien. Im vorliegenden Beitrag wird dem Problem des Publication Bias (PB) in der deutschen Soziologie anhand von elf Jahrg{\"a}ngen der zwei wichtigsten deutschsprachigen Soziologie-Zeitschriften (K{\"o}lner Zeitschrift f{\"u}r Soziologie und Sozialpsychologie, Zeitschrift f{\"u}r Soziologie) mithilfe des Caliper-Tests nachgegangen. Lassen sich ebenso wie in US-amerikanischen Soziologie-Zeitschriften Hinweise auf einen PB finden, und wenn ja, unter welchen Bedingungen ist dieser besonders stark ausgepr{\"a}gt? Im Mittelpunkt der Ursachenanalyse stehen M{\"o}glichkeiten der Datenmanipulation sowie der sozialen Kontrolle durch Forschende. Im Ergebnis finden sich auch f{\"u}r die deutsche Soziologie Hinweise auf einen PB, wenngleich in schw{\"a}cherem Umfang als in US-amerikanischen Zeitschriften. Einfache Ma{\ss}nahmen wie Herausgebervorgaben, wonach Daten f{\"u}r Replikationen zur Verf{\"u}gung zu stellen sind, zeigen keine durchschlagende Wirkung. Es l{\"a}sst sich lediglich eine leichte Tendenz feststellen, dass komplexe Arbeiten mit mehreren parallel zu testenden Hypothesen das PB-Risiko abmildern.},
  langid = {ngerman},
  keywords = {Caliper test,Caliper-Test,Publication bias,Publication Bias,Rational-choice,Rational-Choice,Significance testing,Signifikanztest,Sociology of science,Wissenschaftssoziologie},
  file = {/home/michaelb/Zotero/storage/BZEYCCXC/Auspurg et al. - 2014 - Ausmaß und Risikofaktoren des Publication Bias in der deutschen Soziologie.pdf;/home/michaelb/Zotero/storage/JLSSQFR2/s11577-014-0284-3.pdf}
}

@article{auspurgWhatFuelsPublication2011,
  title = {{What Fuels Publication Bias?: Theoretical and Empirical Analyses of Risk Factors Using the Caliper Test}},
  shorttitle = {{What Fuels Publication Bias?}},
  author = {Auspurg, Katrin and Hinz, Thomas},
  year = {2011},
  month = oct,
  journal = {Jahrb{\"u}cher f{\"u}r National{\"o}konomie und Statistik},
  volume = {231},
  number = {5--6},
  pages = {636--660},
  publisher = {De Gruyter Oldenbourg},
  issn = {2366-049X},
  doi = {10.1515/jbnst-2011-5-607},
  urldate = {2025-08-26},
  abstract = {Significance tests were originally developed to enable more objective evaluations of research results. Yet the strong orientation towards statistical significance encourages biased results, a phenomenon termed ``publication bias''. Publication bias occurs whenever the likelihood or time-lag of publication, or the prominence, language, impact factor of journal space or the citation rate of studies depend on the direction and significance of research findings. Although there is much evidence concerning the existence of publication bias in all scientific disciplines and although its detrimental consequences for the progress of the sciences have been known for a long time, all attempts to eliminate the bias have failed. The present article reviews the history and logic of significance testing, the state of research on publication bias, and existing practical recommendations. After demonstrating that more systematical research on the risk factors of publication bias is needed, the paper suggests two new directions for publication bias research. First, a more comprehensive theoretical model based on theories of rational choice and economics as well as on the sociology of science is sketched out. Publication bias is recognized as the outcome of a social dilemma that cannot be overcome by moral pleas alone. Second, detection methods for publication bias going beyond meta-analysis, ones that are more suitable for testing causal hypotheses, are discussed. In particular, the ``caliper test'' seems well-suited for conducting theoretically motivated comparisons across heterogeneous research fields like sociology. Its potential is demonstrated by testing hypotheses on (a) the relevance of explicitly vs. implicitly stated research propositions and on (b) the relevance of the number of authors on incidence rates of publication bias in 50 papers published in leading German sociology journals.},
  copyright = {De Gruyter expressly reserves the right to use all content for commercial text and data mining within the meaning of Section 44b of the German Copyright Act.},
  langid = {english},
  keywords = {caliper test,publication bias,rational choice,Significance testing,sociology of science},
  file = {/home/michaelb/Zotero/storage/K8ME5QKF/Auspurg and Hinz - 2011 - What Fuels Publication Bias Theoretical and Empirical Analyses of Risk Factors Using the Caliper T.pdf}
}

@article{banksAnswers18Questions2019,
  title = {Answers to 18 {{Questions About Open Science Practices}}},
  author = {Banks, George C. and Field, James G. and Oswald, Frederick L. and O'Boyle, Ernest H. and Landis, Ronald S. and Rupp, Deborah E. and Rogelberg, Steven G.},
  year = {2019},
  month = jun,
  journal = {Journal of Business and Psychology},
  volume = {34},
  number = {3},
  pages = {257--270},
  issn = {1573-353X},
  doi = {10.1007/s10869-018-9547-8},
  urldate = {2024-12-16},
  abstract = {Open science refers to an array of practices that promote openness, integrity, and reproducibility in research; the merits of which are being vigorously debated and developed across academic journals, listservs, conference sessions, and professional associations. The current paper identifies and clarifies major issues related to the use of open science practices (e.g., data sharing, study pre-registration, open access journals). We begin with a useful general description of what open science in organizational research represents and adopt a question-and-answer format. Through this format, we then focus on the application of specific open science practices and explore future directions of open science. All of this builds up to a series of specific actionable recommendations provided in conclusion, to help individual researchers, reviewers, journal editors, and other stakeholders develop a more open research environment and culture.},
  langid = {english},
  keywords = {Open science,Philosophy of science,Questionable research practices,Research ethics},
  file = {/home/michaelb/Zotero/storage/C7RSDC77/Banks et al. - 2019 - Answers to 18 Questions About Open Science Practices.pdf}
}

@article{barfarCognitiveAffectiveResponses2019,
  title = {Cognitive and Affective Responses to Political Disinformation in {{Facebook}}},
  author = {Barfar, Arash},
  year = {2019},
  month = dec,
  journal = {Computers in Human Behavior},
  volume = {101},
  pages = {173--179},
  issn = {0747-5632},
  doi = {10.1016/j.chb.2019.07.026},
  urldate = {2025-07-26},
  abstract = {The epidemic of political disinformation in social media has in part triggered the transition to the post-truth era in which emotional and ideological appeals are more influential in shaping public opinion than objective facts. In this study we examined the cognitive and affective responses that political disinformation prompted in Facebook, as the most popular social media platform. Through text analysis of user comments corpora on nearly 2,100 political posts from popular sources in Facebook, we found that compared to true news, political disinformation received significantly less analytic responses from Facebook followers. While the results indicated greater anxiety in responses to true news, responses to political disinformation were filled with greater anger and incivility. We also found similar (low) levels of cognitive thinking in responses to extreme conservative and extreme liberal disinformation. Contrary to prior research findings, our results indicated that responses to extreme liberal disinformation in Facebook were filled with greater anger and incivility. This suggests that the incivility and outrage in online political discourses should not be attributed to a specific political party without considering the concurrent political events.},
  keywords = {Echo chamber,Facebook,Polarization,Political disinformation,Social media,Text analysis},
  file = {/home/michaelb/Zotero/storage/78X9N6M4/S0747563219302699.html}
}

@article{barghAutomaticitySocialBehavior1996,
  title = {Automaticity of Social Behavior: {{Direct}} Effects of Trait Construct and Stereotype Activation on Action},
  shorttitle = {Automaticity of Social Behavior},
  author = {Bargh, John A. and Chen, Mark and Burrows, Lara},
  year = {1996},
  journal = {Journal of Personality and Social Psychology},
  volume = {71},
  number = {2},
  pages = {230--244},
  publisher = {American Psychological Association},
  address = {US},
  issn = {1939-1315},
  doi = {10.1037/0022-3514.71.2.230},
  abstract = {Previous research has shown that trait concepts and stereotypes become active automatically in the presence of relevant behavior or stereotyped-group features. Through the use of the same priming procedures as in previous impression formation research, Experiment 1 showed that participants whose concept of rudeness was primed interrupted the experimenter more quickly and frequently than did participants primed with polite-related stimuli. In Experiment 2, participants for whom an elderly stereotype was primed walked more slowly down the hallway when leaving the experiment than did control participants, consistent with the content of that stereotype. In Experiment 3, participants for whom the African American stereotype was primed subliminally reacted with more hostility to a vexatious request of the experimenter. Implications of this automatic behavior priming effect for self-fulfilling prophecies are discussed, as is whether social behavior is necessarily mediated by conscious choice processes. (PsycINFO Database Record (c) 2016 APA, all rights reserved)},
  keywords = {Aged (Attitudes Toward),Automatism,Personality Traits,Priming,Racial and Ethnic Attitudes,Rudeness,Social Behavior,Stereotyped Attitudes,Stereotyped Behavior},
  file = {/home/michaelb/Zotero/storage/WSZEDCKV/doiLanding.html}
}

@inproceedings{bastBenchmarkEvaluationText2017,
  title = {A {{Benchmark}} and {{Evaluation}} for {{Text Extraction}} from {{PDF}}},
  booktitle = {2017 {{ACM}}/{{IEEE Joint Conference}} on {{Digital Libraries}} ({{JCDL}})},
  author = {Bast, Hannah and Korzen, Claudius},
  year = {2017},
  month = jun,
  pages = {1--10},
  doi = {10.1109/JCDL.2017.7991564},
  urldate = {2025-08-20},
  abstract = {Extracting the body text from a PDF document is an important but surprisingly difficult task. The reason is that PDF is a layout-based format which specifies the fonts and positions of the individual characters rather than the semantic units of the text (e.g., words or paragraphs) and their role in the document (e.g., body text or caption). There is an abundance of extraction tools, but their quality and the range of their functionality are hard to determine. In this paper, we show how to construct a high-quality benchmark of principally arbitrary size from parallel TeX and PDF data. We construct such a benchmark of 12,098 scientific articles from arXiv.org and make it publicly available. We establish a set of criteria for a clean and independent assessment of the semantic abilities of a given extraction tool. We provide an extensive evaluation of 14 state-of-the-art tools for text extraction from PDF on our benchmark according to our criteria. We include our own method, Icecite, which significantly outperforms all other tools, but is still not perfect. We outline the remaining steps necessary to finally make text extraction from PDF a "solved problem".},
  keywords = {Benchmark testing,Data mining,Google,Libraries,Portable document format,Semantics,Tools}
}

@article{begleyRaiseStandardsPreclinical2012,
  title = {Raise Standards for Preclinical Cancer Research},
  author = {Begley, C. Glenn and Ellis, Lee M.},
  year = {2012},
  month = mar,
  journal = {Nature},
  volume = {483},
  number = {7391},
  pages = {531--533},
  publisher = {Nature Publishing Group},
  issn = {1476-4687},
  doi = {10.1038/483531a},
  urldate = {2025-08-23},
  abstract = {C. Glenn Begley and Lee M. Ellis propose how methods, publications and incentives must change if patients are to benefit.},
  copyright = {2012 Springer Nature Limited},
  langid = {english},
  keywords = {Cancer,Drug development},
  file = {/home/michaelb/Zotero/storage/FWWLMTPD/Begley and Ellis - 2012 - Raise standards for preclinical cancer research.pdf}
}

@article{bemFeelingFutureExperimental2011,
  title = {Feeling the Future: {{Experimental}} Evidence for Anomalous Retroactive Influences on Cognition and Affect},
  shorttitle = {Feeling the Future},
  author = {Bem, Daryl J.},
  year = {2011},
  journal = {Journal of Personality and Social Psychology},
  volume = {100},
  number = {3},
  pages = {407--425},
  publisher = {American Psychological Association},
  address = {US},
  issn = {1939-1315},
  doi = {10.1037/a0021524},
  abstract = {The term psi denotes anomalous processes of information or energy transfer that are currently unexplained in terms of known physical or biological mechanisms. Two variants of psi are precognition (conscious cognitive awareness) and premonition (affective apprehension) of a future event that could not otherwise be anticipated through any known inferential process. Precognition and premonition are themselves special cases of a more general phenomenon: the anomalous retroactive influence of some future event on an individual's current responses, whether those responses are conscious or nonconscious, cognitive or affective. This article reports 9 experiments, involving more than 1,000 participants, that test for retroactive influence by ``time-reversing'' well-established psychological effects so that the individual's responses are obtained before the putatively causal stimulus events occur. Data are presented for 4 time-reversed effects: precognitive approach to erotic stimuli and precognitive avoidance of negative stimuli; retroactive priming; retroactive habituation; and retroactive facilitation of recall. The mean effect size (d) in psi performance across all 9 experiments was 0.22, and all but one of the experiments yielded statistically significant results. The individual-difference variable of stimulus seeking, a component of extraversion, was significantly correlated with psi performance in 5 of the experiments, with participants who scored above the midpoint on a scale of stimulus seeking achieving a mean effect size of 0.43. Skepticism about psi, issues of replication, and theories of psi are also discussed. (PsycInfo Database Record (c) 2022 APA, all rights reserved)},
  keywords = {Causality,Cognition,Cognitions,Emotional States,Extrasensory Perception,Parapsychology,Precognition},
  file = {/home/michaelb/Zotero/storage/2HE3SMFK/doiLanding.html}
}

@article{benoitQuantedaPackageQuantitative2018,
  title = {Quanteda: {{An R}} Package for the Quantitative Analysis of Textual Data},
  author = {Benoit, Kenneth and Watanabe, Kohei and Wang, Haiyan and Nulty, Paul and Obeng, Adam and M{\"u}ller, Stefan and Matsuo, Akitaka},
  year = {2018},
  journal = {Journal of Open Source Software},
  volume = {3},
  number = {30},
  pages = {774},
  doi = {10.21105/joss.00774}
}

@manual{benoitStopwordsMultilingualStopword2021,
  type = {Manual},
  title = {Stopwords: {{Multilingual}} Stopword Lists},
  author = {Benoit, Kenneth and Muhr, David and Watanabe, Kohei},
  year = {2021},
  note = {R package},
  url = {https://CRAN.R-project.org/package=stopwords}
}

@article{bentejacComparativeAnalysisGradient2021,
  title = {A Comparative Analysis of Gradient Boosting Algorithms},
  author = {Bent{\'e}jac, Candice and Cs{\"o}rg{\H o}, Anna and {Mart{\'i}nez-Mu{\~n}oz}, Gonzalo},
  year = {2021},
  month = mar,
  journal = {Artificial Intelligence Review},
  volume = {54},
  number = {3},
  pages = {1937--1967},
  issn = {1573-7462},
  doi = {10.1007/s10462-020-09896-5},
  urldate = {2025-08-02},
  abstract = {The family of gradient boosting algorithms has been recently extended with several interesting proposals (i.e. XGBoost, LightGBM and CatBoost) that focus on both speed and accuracy. XGBoost is a scalable ensemble technique that has demonstrated to be a reliable and efficient machine learning challenge solver. LightGBM is an accurate model focused on providing extremely fast training performance using selective sampling of high gradient instances. CatBoost modifies the computation of gradients to avoid the prediction shift in order to improve the accuracy of the model. This work proposes a practical analysis of how these novel variants of gradient boosting work in terms of training speed, generalization performance and hyper-parameter setup. In addition, a comprehensive comparison between XGBoost, LightGBM, CatBoost, random forests and gradient boosting has been performed using carefully tuned models as well as using their default settings. The results of this comparison indicate that CatBoost obtains the best results in generalization accuracy and AUC in the studied datasets although the differences are small. LightGBM is the fastest of all methods but not the most accurate. Finally, XGBoost places second both in accuracy and in training speed. Finally an extensive analysis of the effect of hyper-parameter tuning in XGBoost, LightGBM and CatBoost is carried out using two novel proposed tools.},
  langid = {english},
  keywords = {Algorithms,CatBoost,Continuous Optimization,Ensembles of classifiers,Gradient boosting,Learning algorithms,LightGBM,Machine Learning,Optimization,Random forest,Statistical Learning,XGBoost},
  file = {/home/michaelb/Zotero/storage/UWU2LVUQ/Bentéjac et al. - 2021 - A comparative analysis of gradient boosting algorithms.pdf}
}

@article{bergstraRandomSearchHyperparameter2012,
  title = {Random Search for Hyper-Parameter Optimization},
  author = {Bergstra, James and Bengio, Yoshua},
  year = {2012},
  month = feb,
  journal = {Journal of Machine Learning Research},
  volume = {13},
  pages = {281--305},
  issn = {1532-4435},
  abstract = {Grid search and manual search are the most widely used strategies for hyper-parameter optimization. This paper shows empirically and theoretically that randomly chosen trials are more efficient for hyper-parameter optimization than trials on a grid. Empirical evidence comes from a comparison with a large previous study that used grid search and manual search to configure neural networks and deep belief networks. Compared with neural networks configured by a pure grid search, we find that random search over the same domain is able to find models that are as good or better within a small fraction of the computation time. Granting random search the same computational budget, random search finds better models by effectively searching a larger, less promising configuration space. Compared with deep belief networks configured by a thoughtful combination of manual search and grid search, purely random search over the same 32-dimensional configuration space found statistically equal performance on four of seven data sets, and superior performance on one of seven. A Gaussian process analysis of the function from hyper-parameters to validation set performance reveals that for most data sets only a few of the hyper-parameters really matter, but that different hyper-parameters are important on different data sets. This phenomenon makes grid search a poor choice for configuring algorithms for new data sets. Our analysis casts some light on why recent "High Throughput" methods achieve surprising success--they appear to search through a large number of hyper-parameters because most hyper-parameters do not matter much. We anticipate that growing interest in large hierarchical models will place an increasing burden on techniques for hyper-parameter optimization; this work shows that random search is a natural baseline against which to judge progress in the development of adaptive (sequential) hyper-parameter optimization algorithms.},
  file = {/home/michaelb/Zotero/storage/EQVELQ33/Bergstra and Bengio - 2012 - Random search for hyper-parameter optimization.pdf}
}

@inbook{berners-leeIsntItSemantic2011,
  title = {Isn't It {{Semantic}}?},
  booktitle = {Leaders in {{Computing}}: {{Changing}} the Digital {{World}}},
  author = {{Berners-Lee}, Tim},
  year = {2011},
  month = sep,
  series = {{{EBO Ser}}},
  publisher = {British Computer Society, The Turpin Distribution Services Limited [distributor]},
  address = {Swindon, Biggleswade},
  urldate = {2024-03-11},
  collaborator = {Knuth, Donald and Booch, Grady and Torvalds, Linus and Wozniak, Steve and Cerf, Vint and Sp{\"a}rck Jones, Karen and {Berners-Lee}, Tim and Wales, Jimmy and Shirley, Stephanie},
  isbn = {978-1-78017-099-2},
  langid = {english},
  annotation = {OCLC: 808089194}
}

@inproceedings{bevendorffEmpiricalComparisonWeb2023a,
  title = {An {{Empirical Comparison}} of {{Web Content Extraction Algorithms}}},
  booktitle = {Proceedings of the 46th {{International ACM SIGIR Conference}} on {{Research}} and {{Development}} in {{Information Retrieval}}},
  author = {Bevendorff, Janek and Gupta, Sanket and Kiesel, Johannes and Stein, Benno},
  year = {2023},
  month = jul,
  pages = {2594--2603},
  publisher = {ACM},
  address = {Taipei, Taiwan},
  doi = {10.1145/3539618.3591920},
  urldate = {2025-08-20},
  isbn = {978-1-4503-9408-6},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/M4SLU9C5/Bevendorff et al. - 2023 - An Empirical Comparison of Web Content Extraction Algorithms.pdf}
}

@article{biondiACCESSMEDICALASSISTEDREPRODUCTION2013,
  title = {Access to Medical-Assisted Reproduction and {{PGD}} in {{Italian}} Law: {{A}} Deadly Blow to an Illiberal Statute? {{Commentary}} to the {{European Court}} on {{Human Rights}}'s Decision {{Costa}} and {{Pavan}} v {{Italy}} ({{ECtHR}}, 28 {{August}} 2012, {{App}}. 54270/2010)},
  shorttitle = {Access to Medical-Assisted Reproduction and {{PGD}} in {{Italian}} Law},
  author = {Biondi, Stefano},
  year = {2013},
  month = sep,
  journal = {Medical Law Review},
  volume = {21},
  number = {3},
  pages = {474--486},
  issn = {0967-0742},
  doi = {10.1093/medlaw/fwt010},
  urldate = {2025-07-25},
  abstract = {This article provides an account of the European Court on Human Rights' Second Section decision in the case Costa and Pavan v Italy. The judgment found that the Italian Statute on Assisted Reproduction (Law 40/2004), and particularly its prohibition to use in vitro fertilisation and pre-implantation genetic diagnosis (PGD) to prevent the birth of children affected by genetically transmissible conditions, breached Article 8 of the European Convention on Human Rights (ECHR). In fact, the statute in question permits only infertile people to access medically assisted reproduction techniques and forbids PGD and embryo selection. The Court regarded that the rationale of these prohibitions---identified by the Italian Government with the need to prevent eugenic practices as well as to protect the health of the unborn and of the woman---was at odds with the fact that Italian law allows pre-natal screening and therapeutic abortions in case foetal abnormalities are diagnosed. In order to clarify the decision's significance, the paper goes on to analyse the rationale of Law 40/2004 in the Italian legal and political context. Emphasis is placed on the fact that this statute is extremely controversial at domestic level, because many of its provisions---including those considered by the Strasbourg Court---are inherently contradictory and contrast with the settled constitutional principles on abortion, as many domestic authorities highlighted. In this context, should the commented decision be confirmed by the Grand Chamber, it may provide a basis to bring consistency back to the Italian regulation of assisted reproduction. Finally, the paper considers the appeal lodged by the Italian Government to the Grand Chamber, and in particular the contention that the European Court had failed to respect Italy's margin of appreciation. 
In this regard, it is argued that, under Law 40/2004, individuals face illogical and discriminatory restrictions to their right to private and family life and that therefore, even if an outright violation of Article 8 ECHR could not be found, there appears to be at least a breach of Article 8 in conjunction with Article 14 ECHR.},
  file = {/home/michaelb/Zotero/storage/XYYTKGIA/Biondi - 2013 - ACCESS TO MEDICAL-ASSISTED REPRODUCTION AND PGD IN ITALIAN LAW A DEADLY BLOW TO AN ILLIBERAL STATUT.pdf;/home/michaelb/Zotero/storage/CLV2YU92/fwt010.html}
}

@article{bishopReinFourHorsemen2019,
  title = {Rein in the Four Horsemen of Irreproducibility},
  author = {Bishop, Dorothy},
  year = {2019},
  month = apr,
  journal = {Nature},
  volume = {568},
  number = {7753},
  pages = {435},
  publisher = {Nature Publishing Group},
  doi = {10.1038/d41586-019-01307-2},
  urldate = {2025-08-26},
  abstract = {Dorothy Bishop describes how threats to reproducibility, recognized but unaddressed for decades, might finally be brought under control.},
  copyright = {2019 Springer Nature Limited},
  langid = {english},
  keywords = {Publishing,Research management},
  annotation = {Cg\_type: World View; Subject\_term: Research management, Publishing},
  file = {/home/michaelb/Zotero/storage/PX7QPLWQ/Bishop - 2019 - Rein in the four horsemen of irreproducibility.pdf;/home/michaelb/Zotero/storage/PF8DF4VD/d41586-019-01307-2.html}
}

@article{blandTyrannyPowerThere2009,
  title = {The Tyranny of Power: Is There a Better Way to Calculate Sample Size?},
  shorttitle = {The Tyranny of Power},
  author = {Bland, John Martin},
  year = {2009},
  month = oct,
  journal = {BMJ},
  volume = {339},
  pages = {b3985},
  publisher = {British Medical Journal Publishing Group},
  issn = {0959-8138, 1468-5833},
  doi = {10.1136/bmj.b3985},
  urldate = {2025-08-02},
  abstract = {Martin Bland's extensive experience in reviewing and using power calculations has led him to believe that it is time to replace them},
  chapter = {Research Methods \& Reporting},
  copyright = {{\copyright} BMJ Publishing Group Ltd 2009},
  langid = {english},
  pmid = {19808754},
  file = {/home/michaelb/Zotero/storage/LWJR23G8/bmj.html}
}

@book{bloorKeywordsQualitativeMethods2006a,
  title = {Keywords in Qualitative Methods: A Vocabulary of Research Concepts},
  shorttitle = {Keywords in Qualitative Methods},
  author = {Bloor, Michael and Wood, Fiona},
  year = {2006},
  publisher = {Sage Publications},
  address = {London ; Thousand Oaks, Calif},
  isbn = {978-0-7619-4330-3 978-0-7619-4331-0},
  lccn = {H62 .B5856 2006},
  keywords = {Qualitative research}
}

@misc{BOAI2002,
  title = {Read the Declaration},
  author = {{Budapest Open Access Initiative}},
  year = {2002},
  month = feb,
  address = {Budapest, Hungary},
  url = {https://www.budapestopenaccessinitiative.org/read/}
}

@book{boehmkeHandsonMachineLearning2020,
  title = {Hands-on Machine Learning with {{R}}},
  author = {Boehmke, Bradley C. and Greenwell, Brandon},
  year = {2020},
  series = {Chapman \& {{Hall}}/{{CRC}} the {{R}} Series},
  publisher = {CRC Press, Taylor \& Francis Group},
  address = {Boca Raton, London, New York},
  doi = {10.1201/9780367816377},
  isbn = {978-1-138-49568-5 978-0-367-81637-7},
  langid = {english}
}

@article{borowskiSignificanceEarlyTemperamental2021,
  title = {The Significance of Early Temperamental Reactivity for Children's Social Competence with Peers: {{A}} Meta-Analytic Review and Comparison with the Role of Early Attachment},
  shorttitle = {The Significance of Early Temperamental Reactivity for Children's Social Competence with Peers},
  author = {Borowski, Sarah K. and Groh, Ashley M. and {Bakermans-Kranenburg}, Marian J. and Fearon, Pasco and Roisman, Glenn I. and {van IJzendoorn}, Marinus H. and Vaughn, Brian E.},
  year = {2021},
  month = nov,
  journal = {Psychological Bulletin},
  volume = {147},
  number = {11},
  pages = {1125--1158},
  issn = {1939-1455},
  doi = {10.1037/bul0000346},
  abstract = {Early temperamental reactivity and attachment security are key predictors of children's social competence with peers. Leveraging meta-analytic evaluation of the significance of early attachment for social competence already available (Groh et al., 2014), this quantitative review examined the significance of early temperamental reactivity for social competence with peers and compared the strength of this association with that for attachment. Based on 140 independent samples (u = 382; N = 49,891), the meta-analytic association between early difficult temperament and (lower) social competence was significant (r = 0.13, z = 0.13; 95\% CI [0.11, 0.16]), but decreased as time between assessments increased. Findings were similar for negative and positive emotionality. Greater negative emotionality was associated with lower social competence (r = 0.14, z = 0.14; 95\% CI [0.11, 0.17], k = 93, u = 172), and greater positive emotionality was associated with better social competence (r = 0.18, z = 0.18; 95\% CI [0.12, 0.24], k = 43, u = 54). Meta-analytic associations were reduced when overlapping informants and overlapping items in temperament and social competence assessments were excluded (difficult temperament: r = 0.10, z = 0.10; 95\% CI [0.06, 0.13]; negative emotionality: r = 0.10, z = 0.10; 95\% CI [0.05, 0.15]; positive emotionality: r = 0.10, z = 0.10; 95\% CI [0.06, 0.14]). Meta-analytic associations between these broadband temperament dimensions and social competence were smaller than the meta-analytic association between attachment security and social competence. Discussion focuses on the developmental significance of early temperament for social competence and ways to reconcile literatures on early temperament and attachment in future research on the developmental antecedents of children's social competence. (PsycInfo Database Record (c) 2022 APA, all rights reserved).},
  langid = {english},
  pmid = {35238583},
  keywords = {Child,Humans,Mood Disorders,Peer Group,Personality Disorders,Social Skills,Temperament},
  file = {/home/michaelb/Zotero/storage/P64SKSYR/Borowski et al. - 2021 - The significance of early temperamental reactivity for children's social competence with peers A me.pdf}
}

@article{bourneMappingMilitantDemocracy2017,
  title = {Mapping `{{Militant Democracy}}': {{Variation}} in {{Party Ban Practices}} in {{European Democracies}} (1945--2015)},
  shorttitle = {Mapping `{{Militant Democracy}}'},
  author = {Bourne, Angela K. and Casal B{\'e}rtoa, Fernando},
  year = {2017},
  month = jun,
  journal = {European Constitutional Law Review},
  volume = {13},
  number = {2},
  pages = {221--247},
  issn = {1574-0196, 1744-5515},
  doi = {10.1017/S1574019617000098},
  urldate = {2025-07-26},
  abstract = {Introduction -- Explaining party bans, political and legal contexts -- Banned parties and banning states in Europe, the political context -- Nature of banned parties -- Nature of banning states -- Tolerant and intolerant democracies, the legal context -- Evolving rationales for party bans and procedures for proscription -- Contemporary rationales for banning parties -- Anti-democratic ideology -- Non-democratic internal organisation -- Party names -- Party orientation to violence -- Protecting the present order -- Evolving rationales for party bans -- Weimar and legitimacy paradigms -- Conclusions, directions for future research},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/B985F7B5/Bourne and Bértoa - 2017 - Mapping ‘Militant Democracy’ Variation in Party Ban Practices in European Democracies (1945-2015).pdf}
}

@article{bradleyRoleClimateChange2020,
  title = {The Role of Climate Change Risk Perception, Response Efficacy, and Psychological Adaptation in Pro-Environmental Behavior: {{A}} Two Nation Study},
  shorttitle = {The Role of Climate Change Risk Perception, Response Efficacy, and Psychological Adaptation in Pro-Environmental Behavior},
  author = {Bradley, Graham L. and Babutsidze, Zakaria and Chai, Andreas and Reser, Joseph P.},
  year = {2020},
  month = apr,
  journal = {Journal of Environmental Psychology},
  volume = {68},
  pages = {101410},
  issn = {0272-4944},
  doi = {10.1016/j.jenvp.2020.101410},
  urldate = {2025-08-07},
  abstract = {As the actions of individuals contribute substantially to climate change, identifying factors that underpin environmentally-relevant behaviors represents an important step towards modifying behavior and mitigating climate change impacts. This paper introduces a sequential model in which antecedent psychological and socio-demographic variables predict climate change risk perceptions, which lead to enhanced levels of response efficacy and psychological adaptation in relation to climate change, and ultimately to environmentally-relevant behaviors. The model is tested and refined using data from large national surveys of Australian and French residents. As hypothesized, in both samples, risk perception (indirectly), response efficacy (both indirectly and directly), and psychological adaptation (directly) predicted behavior. However, these effects were stronger in the Australian than in the French sample, and other unexpectedly strong direct effects were also observed. In particular, subscribing to a ``green'' self-identity directly predicted all endogenous variables, especially in the French sample. The study provides valuable insights into the processes underlying environmentally-relevant behaviors, while serving as a reminder that effects on behavior may be nation-specific. Strategies are recommended for promoting pro-environmental behavior through the enhancement of a green identity, response efficacy, and psychological adaptation.},
  keywords = {Climate change,Green self-identity,Pro-environmental behavior,Psychological adaptation,Response efficacy,Risk perception},
  file = {/home/michaelb/Zotero/storage/G29QYK57/S0272494419306607.html}
}

@article{bradyVisualLongTermMemory2013,
  author = {Brady, Timothy F. and Konkle, Talia and Gill, Jonathan and Oliva, Aude and Alvarez, George A.},
  title = {Visual {{Long-Term Memory Has}} the {{Same Limit}} on {{Fidelity}} as {{Visual Working Memory}}},
  journal = {Psychological Science},
  year = {2013},
  month = jun,
  volume = {24},
  number = {6},
  pages = {981--990},
  publisher = {SAGE Publications Inc},
  issn = {0956-7976},
  doi = {10.1177/0956797612465439},
  urldate = {2025-08-07},
  langid = {english},
  abstract = {Visual long-term memory can store thousands of objects with surprising visual detail, but just how detailed are these representations, and how can one quantify this fidelity? Using the property of color as a case study, we estimated the precision of visual information in long-term memory, and compared this with the precision of the same information in working memory. Observers were shown real-world objects in random colors and were asked to recall the colors after a delay. We quantified two parameters of performance: the variability of internal representations of color (fidelity) and the probability of forgetting an object's color altogether. Surprisingly, the fidelity of color information in long-term memory was comparable to the asymptotic precision of working memory. These results suggest that long-term memory and working memory may be constrained by a common limit, such as a bound on the fidelity required to retrieve a memory representation.},
  file = {/home/michaelb/Zotero/storage/K6V3PX4Z/Brady et al. - 2013 - Visual Long-Term Memory Has the Same Limit on Fidelity as Visual Working Memory.pdf}
}

@book{breimanClassificationRegressionTrees2017,
  title = {Classification and {{Regression Trees}}},
  author = {Breiman, Leo and Friedman, Jerome and Olshen, Richard A. and Stone, Charles J.},
  year = {2017},
  month = oct,
  publisher = {{Chapman and Hall/CRC}},
  address = {New York},
  doi = {10.1201/9781315139470},
  abstract = {The methodology used to construct tree structured rules is the focus of this monograph. Unlike many other statistical procedures, which moved from pencil and paper to calculators, this text's use of trees was unthinkable before computers. Both the practical and theoretical sides have been developed in the authors' study of tree methods. Classification and Regression Trees reflects these two sides, covering the use of trees as a data analysis method, and in a more mathematical framework, proving some of their fundamental properties.},
  isbn = {978-1-315-13947-0}
}

@article{breimanRandomForests2001,
  author = {Breiman, Leo},
  title = {Random {{Forests}}},
  journal = {Machine Learning},
  year = {2001},
  month = oct,
  volume = {45},
  number = {1},
  pages = {5--32},
  issn = {1573-0565},
  doi = {10.1023/A:1010933404324},
  urldate = {2025-08-01},
  langid = {english},
  keywords = {Algorithms,Categorization,classification,ensemble,Forest Ecology,Learning algorithms,Machine Learning,regression,Statistical Learning},
  abstract = {Random forests are a combination of tree predictors such that each tree depends on the values of a random vector sampled independently and with the same distribution for all trees in the forest. The generalization error for forests converges a.s. to a limit as the number of trees in the forest becomes large. The generalization error of a forest of tree classifiers depends on the strength of the individual trees in the forest and the correlation between them. Using a random selection of features to split each node yields error rates that compare favorably to Adaboost (Y. Freund \& R. Schapire, Machine Learning: Proceedings of the Thirteenth International conference, ***, 148--156), but are more robust with respect to noise. Internal estimates monitor error, strength, and correlation and these are used to show the response to increasing the number of features used in the splitting. Internal estimates are also used to measure variable importance. These ideas are also applicable to regression.},
  file = {/home/michaelb/Zotero/storage/ZACGYP2E/Breiman - 2001 - Random Forests.pdf}
}

@misc{bremertLegalAspectsText,
  title = {Legal Aspects of Text Mining Publicly Available Data},
  author = {Bremert, Benjamin},
  abstract = {The paper assesses the admissibility of data and text mining under the upcoming GDPR. It will then develop the specific risks and difficulties in compliance with transparency and information obligations under GDPR in a big data context. In the end the paper will present a possible solution for the transparency requirements in the form of an open standard for communicating allowed forms of reuse and notifying the data subject of this processing. The paper will also discuss the issues of using copyright protected material and introduce a possible legal basis for processing.},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/A7SQ8AU7/Bremert - Legal aspects of text mining publicly available data.pdf}
}

@article{breznauDoesSociologyNeed2021,
  author = {Breznau, Nate},
  title = {Does {{Sociology Need Open Science}}?},
  journal = {Societies},
  year = {2021},
  month = mar,
  volume = {11},
  number = {1},
  pages = {9},
  publisher = {Multidisciplinary Digital Publishing Institute},
  issn = {2075-4698},
  doi = {10.3390/soc11010009},
  urldate = {2024-12-15},
  copyright = {http://creativecommons.org/licenses/by/3.0/},
  langid = {english},
  keywords = {crisis of science,Habermas,Merton,open science,p-hacking,publication bias,replication,research ethics,science community,sociology legitimation,transparency,Weber},
  abstract = {Reliability, transparency, and ethical crises pushed many social science disciplines toward dramatic changes, in particular psychology and more recently political science. This paper discusses why sociology should also change. It reviews sociology as a discipline through the lens of current practices, definitions of sociology, positions of sociological associations, and a brief consideration of the arguments of three highly influential yet epistemologically diverse sociologists: Weber, Merton, and Habermas. It is a general overview for students and sociologists to quickly familiarize themselves with the state of sociology or explore the idea of open science and its relevance to their discipline.},
  file = {/home/michaelb/Zotero/storage/26AZJE4S/Breznau - 2021 - Does Sociology Need Open Science.pdf}
}

@article{breznauObservingManyResearchers2022,
  author = {Breznau, Nate and Rinke, Eike Mark and Wuttke, Alexander and Nguyen, Hung H. V. and Adem, Muna and Adriaans, Jule and {Alvarez-Benjumea}, Amalia and Andersen, Henrik K. and Auer, Daniel and Azevedo, Flavio and Bahnsen, Oke and Balzer, Dave and Bauer, Gerrit and Bauer, Paul C. and Baumann, Markus and Baute, Sharon and Benoit, Verena and Bernauer, Julian and Berning, Carl and Berthold, Anna and Bethke, Felix S. and Biegert, Thomas and Blinzler, Katharina and Blumenberg, Johannes N. and Bobzien, Licia and Bohman, Andrea and Bol, Thijs and Bostic, Amie and Brzozowska, Zuzanna and Burgdorf, Katharina and Burger, Kaspar and Busch, Kathrin B. and {Carlos-Castillo}, Juan and Chan, Nathan and Christmann, Pablo and Connelly, Roxanne and Czymara, Christian S. and Damian, Elena and Ecker, Alejandro and Edelmann, Achim and Eger, Maureen A. and Ellerbrock, Simon and Forke, Anna and Forster, Andrea and Gaasendam, Chris and Gavras, Konstantin and Gayle, Vernon and Gessler, Theresa and Gnambs, Timo and Godefroidt, Am{\'e}lie and Gr{\"o}mping, Max and Gro{\ss}, Martin and Gruber, Stefan and Gummer, Tobias and Hadjar, Andreas and Heisig, Jan Paul and Hellmeier, Sebastian and Heyne, Stefanie and Hirsch, Magdalena and Hjerm, Mikael and Hochman, Oshrat and H{\"o}vermann, Andreas and Hunger, Sophia and Hunkler, Christian and Huth, Nora and Ign{\'a}cz, Zs{\'o}fia S. and Jacobs, Laura and Jacobsen, Jannes and Jaeger, Bastian and Jungkunz, Sebastian and Jungmann, Nils and Kauff, Mathias and Kleinert, Manuel and Klinger, Julia and Kolb, Jan-Philipp and Ko{\l}czy{\'n}ska, Marta and Kuk, John and Kuni{\ss}en, Katharina and Kurti Sinatra, Dafina and Langenkamp, Alexander and Lersch, Philipp M. and L{\"o}bel, Lea-Maria and Lutscher, Philipp and Mader, Matthias and Madia, Joan E. and Malancu, Natalia and Maldonado, Luis and Marahrens, Helge and Martin, Nicole and Martinez, Paul and Mayerl, Jochen and Mayorga, Oscar J. and McManus, Patricia and McWagner, Kyle and Meeusen, Cecil and Meierrieks, Daniel and Mellon, Jonathan and Merhout, Friedolin and Merk, Samuel and Meyer, Daniel and Micheli, Leticia and Mijs, Jonathan and Moya, Crist{\'o}bal and Neunhoeffer, Marcel and N{\"u}st, Daniel and Nyg{\aa}rd, Olav and Ochsenfeld, Fabian and Otte, Gunnar and Pechenkina, Anna O. and Prosser, Christopher and Raes, Louis and Ralston, Kevin and Ramos, Miguel R. and Roets, Arne and Rogers, Jonathan and Ropers, Guido and Samuel, Robin and Sand, Gregor and Schachter, Ariela and Schaeffer, Merlin and Schieferdecker, David and Schlueter, Elmar and Schmidt, Regine and Schmidt, Katja M. and {Schmidt-Catran}, Alexander and Schmiedeberg, Claudia and Schneider, J{\"u}rgen and Schoonvelde, Martijn and {Schulte-Cloos}, Julia and Schumann, Sandy and Schunck, Reinhard and Schupp, J{\"u}rgen and Seuring, Julian and Silber, Henning and Sleegers, Willem and Sonntag, Nico and Staudt, Alexander and Steiber, Nadia and Steiner, Nils and Sternberg, Sebastian and Stiers, Dieter and Stojmenovska, Dragana and Storz, Nora and Striessnig, Erich and Stroppe, Anne-Kathrin and Teltemann, Janna and Tibajev, Andrey and Tung, Brian and Vagni, Giacomo and Van Assche, Jasper and {van der Linden}, Meta and {van der Noll}, Jolanda and Van Hootegem, Arno and Vogtenhuber, Stefan and Voicu, Bogdan and Wagemans, Fieke and Wehl, Nadja and Werner, Hannah and Wiernik, Brenton M. and Winter, Fabian and Wolf, Christof and Yamada, Yuki and Zhang, Nan and Ziller, Conrad and Zins, Stefan and {\.Z}{\'o}{\l}tak, Tomasz},
  title = {Observing Many Researchers Using the Same Data and Hypothesis Reveals a Hidden Universe of Uncertainty},
  journal = {Proceedings of the National Academy of Sciences},
  year = {2022},
  month = nov,
  volume = {119},
  number = {44},
  pages = {e2203150119},
  publisher = {Proceedings of the National Academy of Sciences},
  doi = {10.1073/pnas.2203150119},
  urldate = {2024-12-15},
  abstract = {This study explores how researchers' analytical choices affect the reliability of scientific findings. Most discussions of reliability problems in science focus on systematic biases. We broaden the lens to emphasize the idiosyncrasy of conscious and unconscious decisions that researchers make during data analysis. We coordinated 161 researchers in 73 research teams and observed their research decisions as they used the same data to independently test the same prominent social science hypothesis: that greater immigration reduces support for social policies among the public. In this typical case of social science research, research teams reported both widely diverging numerical findings and substantive conclusions despite identical start conditions. Researchers' expertise, prior beliefs, and expectations barely predict the wide variation in research outcomes. More than 95\% of the total variance in numerical results remains unexplained even after qualitative coding of all identifiable decisions in each team's workflow. This reveals a universe of uncertainty that remains hidden when considering a single study in isolation. The idiosyncratic nature of how researchers' results and conclusions varied is a previously underappreciated explanation for why many scientific hypotheses remain contested. These results call for greater epistemic humility and clarity in reporting scientific findings.},
  file = {/home/michaelb/Zotero/storage/5WU4WFFE/Breznau et al. - 2022 - Observing many researchers using the same data and hypothesis reveals a hidden universe of uncertain.pdf}
}

@article{briggsPartialSolutionReplication2023,
  title = {A Partial Solution for the Replication Crisis in Economics},
  author = {Briggs, William M.},
  year = {2023},
  month = jun,
  journal = {Asian Journal of Economics and Banking},
  volume = {7},
  number = {2},
  pages = {180--190},
  issn = {2615-9821},
  doi = {10.1108/AJEB-03-2023-0027},
  urldate = {2025-08-23},
  abstract = {Important research once thought unassailable has failed to replicate. Not just in economics, but in all science. The problem is therefore not in dispute nor are some of the causes, like low power, selective reporting, the file drawer effect, publicly unavailable data and so forth. Some partially worthy solutions have already been offered, like pre-registering hypotheses and data analysis plans. This is a review paper on the replication crisis, which is by now very well known. This study offers another partial solution, which is to remind researchers that correlation does not logically imply causation. The effect of this reminder is to eschew ``significance'' testing, whether in frequentist or Bayesian form (like Bayes factors) and to report models in predictive form, so that anybody can check the veracity of any model. In effect, all papers could undergo replication testing. The author argues that this, or any solution, will never eliminate all errors.},
  file = {/home/michaelb/Zotero/storage/5XYVSZIA/Briggs - 2023 - A partial solution for the replication crisis in economics.pdf;/home/michaelb/Zotero/storage/SXWDZG9E/AJEB-03-2023-0027.html}
}

@misc{britannicaLinusTorvalds2023,
  title = {Linus {{Torvalds}}},
  author = {{The Editors of Encyclopaedia Britannica}},
  year = {2023},
  month = dec,
  journal = {Encyclopedia Britannica},
  urldate = {2024-03-11},
  abstract = {Linus Torvalds, Finnish computer scientist who was the principal force behind the development of the Linux operating system. In 1991 he made the Linux software available for free downloading, and he released the source code, which meant that anyone could modify Linux to suit their own purposes.},
  howpublished = {https://www.britannica.com/biography/Linus-Torvalds},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/LPJ8RFCL/Linus-Torvalds.html}
}

@misc{britannicaLinux2024,
  title = {Linux},
  author = {{The Editors of Encyclopaedia Britannica}},
  year = {2024},
  month = mar,
  journal = {Encyclopedia Britannica},
  urldate = {2024-03-11},
  abstract = {Linux, computer operating system created in the early 1990s by Finnish software engineer Linus Torvalds and the Free Software Foundation. Because it is open-source, and thus modifiable for different uses, Linux is popular for systems as diverse as cellular telephones and supercomputers.},
  howpublished = {https://www.britannica.com/technology/Linux},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/NKKAMTZ5/Linux.html}
}

@incollection{brownAttributedEinstein2019,
  author = {Brown, Rita Mae},
  editor = {Calaprice, Alice},
  title = {Attributed to {{Einstein}}},
  booktitle = {The {{Ultimate Quotable Einstein}}},
  year = {2019},
  month = dec,
  pages = {471--486},
  publisher = {Princeton University Press},
  isbn = {978-0-691-20729-2},
  doi = {10.1515/9780691207292-025},
  urldate = {2024-12-11},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/FYEPELPQ/Einstein - 2019 - Attributed to Einstein.pdf}
}

@article{buntValidatingUseLarge2025,
  title = {Validating the Use of Large Language Models for Psychological Text Classification},
  author = {Bunt, Hannah L. and Goddard, Alex and Reader, Tom W. and Gillespie, Alex},
  year = {2025},
  month = feb,
  journal = {Frontiers in Social Psychology},
  volume = {3},
  pages = {1460277},
  publisher = {Frontiers},
  issn = {2813-7876},
  doi = {10.3389/frsps.2025.1460277},
  urldate = {2025-03-28},
  langid = {english},
  keywords = {artificial intelligence,big qualitative data,GPT,large language models (LLMs),psychology,text classification,validity}
}

@article{caiFeatureSelectionMachine2018,
  author = {Cai, Jie and Luo, Jiawei and Wang, Shulin and Yang, Sheng},
  title = {Feature Selection in Machine Learning: {{A}} New Perspective},
  shorttitle = {Feature Selection in Machine Learning},
  journal = {Neurocomputing},
  year = {2018},
  month = jul,
  volume = {300},
  pages = {70--79},
  issn = {0925-2312},
  doi = {10.1016/j.neucom.2017.11.077},
  urldate = {2025-07-25},
  keywords = {Data mining,Dimensionality reduction,Feature selection,Machine learning},
  abstract = {High-dimensional data analysis is a challenge for researchers and engineers in the fields of machine learning and data mining. Feature selection provides an effective way to solve this problem by removing irrelevant and redundant data, which can reduce computation time, improve learning accuracy, and facilitate a better understanding for the learning model or data. In this study, we discuss several frequently-used evaluation measures for feature selection, and then survey supervised, unsupervised, and semi-supervised feature selection methods, which are widely applied in machine learning problems, such as classification and clustering. Lastly, future challenges about feature selection are discussed.},
  file = {/home/michaelb/Zotero/storage/VXF6ZPDH/S0925231218302911.html}
}

@article{callawayReportFindsMassive2011,
  title = {Report Finds Massive Fraud at {{Dutch}} Universities},
  author = {Callaway, Ewen},
  year = {2011},
  month = nov,
  journal = {Nature},
  volume = {479},
  number = {7371},
  pages = {15},
  publisher = {Nature Publishing Group},
  issn = {1476-4687},
  doi = {10.1038/479015a},
  urldate = {2025-08-23},
  abstract = {Investigation claims dozens of social-psychology papers contain faked data.},
  copyright = {2011 Springer Nature Limited},
  langid = {english},
  keywords = {Journalism,Social sciences},
  file = {/home/michaelb/Zotero/storage/DUXLAMET/Callaway - 2011 - Report finds massive fraud at Dutch universities.pdf;/home/michaelb/Zotero/storage/9ZB29E4V/479015a.html}
}

@misc{cernBirthWebCERN,
  title = {The Birth of the {{Web}}},
  author = {{CERN}},
  journal = {CERN - The birth of the Web},
  urldate = {2024-03-11},
  howpublished = {https://home.cern/science/computing/birth-web},
  file = {/home/michaelb/Zotero/storage/T7US7PIX/birth-web.html}
}

@misc{cernCerninfochTimBernersLees,
  title = {{{info.cern.ch}} -- {{Tim Berners-Lee}}'s Proposal},
  author = {{CERN}},
  urldate = {2024-03-11},
  howpublished = {https://info.cern.ch/Proposal.html},
  file = {/home/michaelb/Zotero/storage/ULVHPAES/Proposal.html}
}

@article{chanMitigatingMulticollinearityProblem2022,
  author = {Chan, Jireh Yi-Le and Leow, Steven Mun Hong and Bea, Khean Thye and Cheng, Wai Khuen and Phoong, Seuk Wai and Hong, Zeng-Wei and Chen, Yen-Lin},
  title = {Mitigating the {{Multicollinearity Problem}} and {{Its Machine Learning Approach}}: {{A Review}}},
  shorttitle = {Mitigating the {{Multicollinearity Problem}} and {{Its Machine Learning Approach}}},
  journal = {Mathematics},
  year = {2022},
  month = jan,
  volume = {10},
  number = {8},
  pages = {1283},
  publisher = {Multidisciplinary Digital Publishing Institute},
  issn = {2227-7390},
  doi = {10.3390/math10081283},
  urldate = {2025-07-28},
  copyright = {http://creativecommons.org/licenses/by/3.0/},
  langid = {english},
  keywords = {machine learning,multicollinearity,neural network,optimization approaches,variable selection methods},
  abstract = {Technologies have driven big data collection across many fields, such as genomics and business intelligence. This results in a significant increase in variables and data points (observations) collected and stored. Although this presents opportunities to better model the relationship between predictors and the response variables, this also causes serious problems during data analysis, one of which is the multicollinearity problem. The two main approaches used to mitigate multicollinearity are variable selection methods and modified estimator methods. However, variable selection methods may negate efforts to collect more data as new data may eventually be dropped from modeling, while recent studies suggest that optimization approaches via machine learning handle data with multicollinearity better than statistical estimators. Therefore, this study details the chronological developments to mitigate the effects of multicollinearity and up-to-date recommendations to better mitigate multicollinearity.},
  file = {/home/michaelb/Zotero/storage/6VE6XGJU/Chan et al. - 2022 - Mitigating the Multicollinearity Problem and Its Machine Learning Approach A Review.pdf}
}

@inproceedings{Chen:2016:XST:2939672.2939785,
  title = {{{XGBoost}}: {{A}} Scalable Tree Boosting System},
  booktitle = {Proceedings of the 22nd {{ACM SIGKDD}} International Conference on Knowledge Discovery and Data Mining},
  author = {Chen, Tianqi and Guestrin, Carlos},
  year = {2016},
  series = {{{KDD}} '16},
  pages = {785--794},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  doi = {10.1145/2939672.2939785},
  acmid = {2939785},
  isbn = {978-1-4503-4232-2},
  keywords = {large-scale machine learning}
}

@inproceedings{chenXGBoostScalableTree2016,
  author = {Chen, Tianqi and Guestrin, Carlos},
  title = {{{XGBoost}}: {{A Scalable Tree Boosting System}}},
  shorttitle = {{{XGBoost}}},
  booktitle = {Proceedings of the 22nd {{ACM SIGKDD International Conference}} on {{Knowledge Discovery}} and {{Data Mining}}},
  series = {{{KDD}} '16},
  year = {2016},
  month = aug,
  pages = {785--794},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  isbn = {978-1-4503-4232-2},
  doi = {10.1145/2939672.2939785},
  urldate = {2025-08-01},
  abstract = {Tree boosting is a highly effective and widely used machine learning method. In this paper, we describe a scalable end-to-end tree boosting system called XGBoost, which is used widely by data scientists to achieve state-of-the-art results on many machine learning challenges. We propose a novel sparsity-aware algorithm for sparse data and weighted quantile sketch for approximate tree learning. More importantly, we provide insights on cache access patterns, data compression and sharding to build a scalable tree boosting system. By combining these insights, XGBoost scales beyond billions of examples using far fewer resources than existing systems.},
  file = {/home/michaelb/Zotero/storage/LV7RCNFL/Chen and Guestrin - 2016 - XGBoost A Scalable Tree Boosting System.pdf}
}

@article{chermakOpenSourceResearchCriminology2025,
  title = {Open-{{Source Research}} in {{Criminology}} and {{Criminal Justice}}},
  author = {Chermak, Steven M. and Freilich, Joshua D. and {Greene-Colozzi}, Emily and Klein, Brent R.},
  year = {2025},
  month = jan,
  journal = {Annual Review of Criminology},
  volume = {8},
  number = {1},
  pages = {141--170},
  publisher = {Annual Reviews},
  issn = {2572-4568},
  doi = {10.1146/annurev-criminol-022422-013842},
  urldate = {2025-03-28},
  abstract = {This review focuses on the use of open-source data in criminology and criminal justice research, highlighting the field's advancements through these data, optimal practices for constructing open-source databases, and key methodological hurdles to confront. As the amount and types of available public information have grown, scholars have capitalized on this access by constructing open-source databases. Our review found extraordinary growth in this research area and that these flexible methods have been used to study a range of important topics, including issues that have been historically challenging to research. These methods have been most impactful in the study of rare events, such as school shootings, terrorism, and mass shootings. Some studies have become core works that significantly impacted criminology and other scientific disciplines, and the limits of the use of sources have yet to be determined. Our review of this literature found variations in the methodological approach to constructing such databases. Many studies did not evaluate the credibility of the open-source information they relied upon and often were not transparent in describing their research process. We identify the different processual elements of systematically developing and using such data. We highlight the strengths and weaknesses of these methods, set forth best practices, and discuss how to improve methodological rigor and oversight in future research.},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/LI7DBAJ3/Chermak et al. - 2025 - Open-Source Research in Criminology and Criminal Justice.pdf}
}

@article{chiccoAdvantagesMatthewsCorrelation2020,
  author = {Chicco, Davide and Jurman, Giuseppe},
  title = {The Advantages of the {{Matthews}} Correlation Coefficient ({{MCC}}) over {{F1}} Score and Accuracy in Binary Classification Evaluation},
  journal = {BMC Genomics},
  year = {2020},
  month = jan,
  volume = {21},
  number = {1},
  pages = {6},
  issn = {1471-2164},
  doi = {10.1186/s12864-019-6413-7},
  urldate = {2025-08-22},
  langid = {english},
  keywords = {Accuracy,Binary classification,Biostatistics,Confusion matrices,Dataset imbalance,F1 score,Genomics,Machine learning,Matthews correlation coefficient},
  abstract = {To evaluate binary classifications and their confusion matrices, scientific researchers can employ several statistical rates, accordingly to the goal of the experiment they are investigating. Despite being a crucial issue in machine learning, no widespread consensus has been reached on a unified elective chosen measure yet. Accuracy and F1 score computed on confusion matrices have been (and still are) among the most popular adopted metrics in binary classification tasks. However, these statistical measures can dangerously show overoptimistic inflated results, especially on imbalanced datasets.},
  file = {/home/michaelb/Zotero/storage/RC2EC7K5/Chicco and Jurman - 2020 - The advantages of the Matthews correlation coefficient (MCC) over F1 score and accuracy in binary cl.pdf}
}

@article{chickSequentialSamplingEconomics2012,
  author = {Chick, Stephen E. and Frazier, Peter},
  title = {Sequential {{Sampling}} with {{Economics}} of {{Selection Procedures}}},
  journal = {Management Science},
  year = {2012},
  month = mar,
  volume = {58},
  number = {3},
  pages = {550--569},
  publisher = {INFORMS},
  issn = {0025-1909},
  doi = {10.1287/mnsc.1110.1425},
  urldate = {2025-08-01},
  keywords = {Bayesian,decision analysis,diffusion,dynamic programming,probability,simulation,statistical analysis},
  abstract = {Sequential sampling problems arise in stochastic simulation and many other applications. Sampling is used to infer the unknown performance of several alternatives before one alternative is selected as best. This paper presents new economically motivated fully sequential sampling procedures to solve such problems, called economics of selection procedures. The optimal procedure is derived for comparing a known standard with one alternative whose unknown reward is inferred with sampling. That result motivates heuristics when multiple alternatives have unknown rewards. The resulting procedures are more effective in numerical experiments than any previously proposed procedure of which we are aware and are easily implemented. The key driver of the improvement is the use of dynamic programming to model sequential sampling as an option to learn before selecting an alternative. It accounts for the expected benefit of adaptive stopping policies for sampling, rather than of one-stage policies, as is common in the literature. This paper was accepted by Assaf Zeevi, stochastic models and simulation.},
  file = {/home/michaelb/Zotero/storage/WTHGYDJV/Chick and Frazier - 2012 - Sequential Sampling with Economics of Selection Procedures.pdf}
}

@manual{chingQs2EfficientSerialization2025,
  type = {Manual},
  title = {{{qs2}}: {{Efficient}} Serialization of {{R}} Objects},
  author = {Ching, Travers},
  year = {2025}
}

@article{chinQuestionableResearchPractices2023,
  author   = {Chin, Jason M. and Pickett, Justin T. and Vazire, Simine and Holcombe, Alex O.},
  title    = {Questionable {{Research Practices}} and {{Open Science}} in {{Quantitative Criminology}}},
  journal  = {Journal of Quantitative Criminology},
  year     = {2023},
  month    = mar,
  volume   = {39},
  number   = {1},
  pages    = {21--51},
  issn     = {1573-7799},
  doi      = {10.1007/s10940-021-09525-6},
  urldate  = {2024-11-06},
  langid   = {english},
  keywords = {Meta-research,Open science,Questionable research practices,Reproducibility},
  abstract = {Questionable research practices (QRPs) lead to incorrect research results and contribute to irreproducibility in science. Researchers and institutions have proposed open science practices (OSPs) to improve the detectability of QRPs and the credibility of science. We examine the prevalence of QRPs and OSPs in criminology, and researchers' opinions of those practices.},
  file     = {/home/michaelb/Zotero/storage/N9HQXU5H/Chin et al. - 2023 - Questionable Research Practices and Open Science in Quantitative Criminology.pdf}
}

% NOTE(review): duplicate of chinQuestionableResearchPractices2023 -- same DOI and
% bibliographic fields; only urldate and the attachment path differ. Citing both keys
% lists this article twice, so consolidate citations on one key before removing either.
@article{chinQuestionableResearchPractices2023b,
  title = {Questionable {{Research Practices}} and {{Open Science}} in {{Quantitative Criminology}}},
  author = {Chin, Jason M. and Pickett, Justin T. and Vazire, Simine and Holcombe, Alex O.},
  year = {2023},
  month = mar,
  journal = {Journal of Quantitative Criminology},
  volume = {39},
  number = {1},
  pages = {21--51},
  issn = {1573-7799},
  doi = {10.1007/s10940-021-09525-6},
  urldate = {2025-08-26},
  abstract = {Questionable research practices (QRPs) lead to incorrect research results and contribute to irreproducibility in science. Researchers and institutions have proposed open science practices (OSPs) to improve the detectability of QRPs and the credibility of science. We examine the prevalence of QRPs and OSPs in criminology, and researchers' opinions of those practices.},
  langid = {english},
  keywords = {Meta-research,Open science,Questionable research practices,Reproducibility},
  file = {/home/michaelb/Zotero/storage/9TGF3SAN/Chin et al. - 2023 - Questionable Research Practices and Open Science in Quantitative Criminology.pdf}
}

@article{claesenComparingDreamReality2021,
  author     = {Claesen, Aline and Gomes, Sara and Tuerlinckx, Francis and Vanpaemel, Wolf},
  title      = {Comparing Dream to Reality: An Assessment of Adherence of the First Generation of Preregistered Studies},
  shorttitle = {Comparing Dream to Reality},
  journal    = {Royal Society Open Science},
  year       = {2021},
  month      = oct,
  volume     = {8},
  number     = {10},
  pages      = {211037},
  publisher  = {Royal Society},
  doi        = {10.1098/rsos.211037},
  urldate    = {2024-11-06},
  keywords   = {open science,preregistration,psychological science,researcher degrees of freedom,transparency},
  abstract   = {Preregistration is a method to increase research transparency by documenting research decisions on a public, third-party repository prior to any influence by data. It is becoming increasingly popular in all subfields of psychology and beyond. Adherence to the preregistration plan may not always be feasible and even is not necessarily desirable, but without disclosure of deviations, readers who do not carefully consult the preregistration plan might get the incorrect impression that the study was exactly conducted and reported as planned. In this paper, we have investigated adherence and disclosure of deviations for all articles published with the Preregistered badge in Psychological Science between February 2015 and November 2017 and shared our findings with the corresponding authors for feedback. Two out of 27 preregistered studies contained no deviations from the preregistration plan. In one study, all deviations were disclosed. Nine studies disclosed none of the deviations. We mainly observed (un)disclosed deviations from the plan regarding the reported sample size, exclusion criteria and statistical analysis. This closer look at preregistrations of the first generation reveals possible hurdles for reporting preregistered studies and provides input for future reporting guidelines. We discuss the results and possible explanations, and provide recommendations for preregistered research.},
  file       = {/home/michaelb/Zotero/storage/V555Q9F6/Claesen et al. - 2021 - Comparing dream to reality an assessment of adherence of the first generation of preregistered stud.pdf}
}

@misc{clarivateJournalImpactFactor2023,
  title = {Journal {{Impact Factor}}},
  author = {{Clarivate}},
  year = {2023},
  publisher = {Clarivate},
  urldate = {2024-12-19},
  howpublished = {https://jcr.clarivate.com/jcr/browse-journals}
}

@misc{cnnCNNcomReclusiveLinux,
  title = {{{CNN.com}} - {{Reclusive Linux}} Founder Opens up - {{May}} 18, 2006},
  author = {{CNN}},
  year = {2006},
  month = may,
  urldate = {2024-03-11},
  howpublished = {https://edition.cnn.com/2006/BUSINESS/05/18/global.office.linustorvalds/}
}

@article{collingStatisticalInferenceReplication2021,
  author   = {Colling, Lincoln J. and Sz{\H u}cs, D{\'e}nes},
  title    = {Statistical {{Inference}} and the {{Replication Crisis}}},
  journal  = {Review of Philosophy and Psychology},
  year     = {2021},
  month    = mar,
  volume   = {12},
  number   = {1},
  pages    = {121--147},
  issn     = {1878-5166},
  doi      = {10.1007/s13164-018-0421-4},
  urldate  = {2025-08-23},
  langid   = {english},
  abstract = {The replication crisis has prompted many to call for statistical reform within the psychological sciences. Here we examine issues within Frequentist statistics that may have led to the replication crisis, and we examine the alternative---Bayesian statistics---that many have suggested as a replacement. The Frequentist approach and the Bayesian approach offer radically different perspectives on evidence and inference with the Frequentist approach prioritising error control and the Bayesian approach offering a formal method for quantifying the relative strength of evidence for hypotheses. We suggest that rather than mere statistical reform, what is needed is a better understanding of the different modes of statistical inference and a better understanding of how statistical inference relates to scientific inference.},
  file     = {/home/michaelb/Zotero/storage/M8KK6BRB/Colling and Szűcs - 2021 - Statistical Inference and the Replication Crisis.pdf}
}

@manual{comtoisSummarytoolsToolsQuickly2025,
  type = {Manual},
  title = {{{summarytools}}: {{Tools}} to Quickly and Neatly Summarize Data},
  author = {Comtois, Dominic},
  year = {2025},
  url = {https://CRAN.R-project.org/package=summarytools}
}

@article{cortesSupportvectorNetworks1995,
  title = {Support-Vector Networks},
  author = {Cortes, Corinna and Vapnik, Vladimir},
  year = {1995},
  month = sep,
  journal = {Machine Learning},
  volume = {20},
  number = {3},
  pages = {273--297},
  issn = {1573-0565},
  doi = {10.1007/BF00994018},
  urldate = {2025-08-01},
  abstract = {The support-vector network is a new learning machine for two-group classification problems. The machine conceptually implements the following idea: input vectors are non-linearly mapped to a very high-dimension feature space. In this feature space a linear decision surface is constructed. Special properties of the decision surface ensures high generalization ability of the learning machine. The idea behind the support-vector network was previously implemented for the restricted case where the training data can be separated without errors. We here extend this result to non-separable training data.},
  langid = {english},
  keywords = {Artificial Intelligence,Bayesian Network,Categorization,Computational Intelligence,efficient learning algorithms,Learning algorithms,Machine Learning,neural networks,pattern recognition,polynomial classifiers,radial basis function classifiers},
  file = {/home/michaelb/Zotero/storage/7QID4ZPN/Cortes and Vapnik - 1995 - Support-vector networks.pdf}
}

@incollection{crespomarquezCurseDimensionality2022,
  title = {The {{Curse}} of {{Dimensionality}}},
  booktitle = {Digital {{Maintenance Management}}: {{Guiding Digital Transformation}} in {{Maintenance}}},
  author = {Crespo M{\'a}rquez, Adolfo},
  editor = {Crespo M{\'a}rquez, Adolfo},
  year = {2022},
  pages = {67--86},
  publisher = {Springer International Publishing},
  address = {Cham},
  doi = {10.1007/978-3-030-97660-6_7},
  urldate = {2025-07-28},
  abstract = {The curse of dimensionality is a phenomenon that appears in Machine Learning models when algorithms must learn from an ample feature volume with abundant values within each one. Reaching samples with each combination of values when training would be very complicated. Thus, it can happen (as it will be appreciated later in this Chapter) that classifier or regressor accuracy first improves including more dimensions but then could even decrease. This Chapter deals mainly with this problem and to that end several feature selection and feature selection ranking (FSR) methods are considered. These methods are basically algorithms which include wrappers and filters, and they can provide a ranking of all the analyzed features.},
  isbn = {978-3-030-97660-6},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/THHKGSEC/Crespo Márquez - 2022 - The Curse of Dimensionality.pdf}
}

@article{crockerExpandingEarlyLate2018,
  author    = {Crocker, Anne G. and Martin, Michael S. and Leclair, Marichelle C. and Nicholls, Tonia L. and Seto, Michael C.},
  title     = {Expanding the Early and Late Starter Model of Criminal Justice Involvement for Forensic Mental Health Clients},
  journal   = {Law and Human Behavior},
  year      = {2018},
  volume    = {42},
  number    = {1},
  pages     = {83--93},
  publisher = {Educational Publishing Foundation},
  address   = {US},
  issn      = {1573-661X},
  doi       = {10.1037/lhb0000269},
  keywords  = {Client Characteristics,Criminal Behavior,Criminal Justice,Criminal Responsibility,Demographic Characteristics,Forensic Psychiatry,Health,Mentally Ill Offenders,Models,Psychodiagnostic Typologies,Recidivism,Social Skills,Violence},
  abstract  = {The early and late starter model provides one of the most enduring frameworks for understanding the developmental course and severity of violence and criminality among individuals with severe mental illness. We expanded the model to account for differences in the age of onset of criminal behavior and added a group with no prior contact with the justice or mental health systems. We sampled 1,800 men and women found Not Criminally Responsible on account of Mental Disorder in 3 Canadian provinces. Using a retrospective file-based study, we explored differences in criminal, health, demographic, and social functioning characteristics, processing through the forensic psychiatric system and recidivism outcomes of 5 groups. We replicated prior research, finding more typical criminogenic needs among those with early onset crime. Those with crime onset after mental illness were more likely to show fewer criminogenic needs and to have better outcomes upon release than those who had crime onset during adulthood, before mental illness. Individuals with no prior contact with mental health or criminal justice had higher functioning prior to their crime and had a lower risk of reoffending. Given little information is needed to identify the groups, computing the distribution of these groups within forensic mental health services or across services can provide estimates of potential intensity or duration of services that might be needed. This study suggests that distinguishing subgroups of forensic clients based on the sequence of onset of mental illness and criminal behavior and on the age of onset of criminal behavior may be useful to identify criminogenic needs and predict outcomes upon release. This updated framework can be useful for planning organization of services, understanding case mix, as well as patient flow in forensic services and flow of mentally disordered offenders in correctional services. (PsycINFO Database Record (c) 2018 APA, all rights reserved)},
  file      = {/home/michaelb/Zotero/storage/S77NZAR5/Crocker et al. - 2018 - Expanding the early and late starter model of criminal justice involvement for forensic mental healt.pdf}
}

@article{crockerRoadFraudStarts2011,
  author    = {Crocker, Jennifer},
  title     = {The Road to Fraud Starts with a Single Step},
  journal   = {Nature},
  year      = {2011},
  month     = nov,
  volume    = {479},
  number    = {7372},
  pages     = {151--151},
  publisher = {Nature Publishing Group},
  issn      = {1476-4687},
  doi       = {10.1038/479151a},
  urldate   = {2025-08-23},
  langid    = {english},
  copyright = {2011 Springer Nature Limited},
  keywords  = {Social sciences},
  abstract  = {The extensive academic fraud of Diederik Stapel has rocked science. Social psychologist Jennifer Crocker traces the destructive path that cheats follow.},
  file      = {/home/michaelb/Zotero/storage/C95VNMKA/Crocker - 2011 - The road to fraud starts with a single step.pdf}
}

% NOTE(review): duplicate of crockerRoadFraudStarts2011 -- every bibliographic field is
% identical; only the attachment path differs. Citing both keys lists this article
% twice, so consolidate citations on one key before removing either.
@article{crockerRoadFraudStarts2011a,
  title = {The Road to Fraud Starts with a Single Step},
  author = {Crocker, Jennifer},
  year = {2011},
  month = nov,
  journal = {Nature},
  volume = {479},
  number = {7372},
  pages = {151--151},
  publisher = {Nature Publishing Group},
  issn = {1476-4687},
  doi = {10.1038/479151a},
  urldate = {2025-08-23},
  abstract = {The extensive academic fraud of Diederik Stapel has rocked science. Social psychologist Jennifer Crocker traces the destructive path that cheats follow.},
  copyright = {2011 Springer Nature Limited},
  langid = {english},
  keywords = {Social sciences},
  file = {/home/michaelb/Zotero/storage/X4ZE77GD/Crocker - 2011 - The road to fraud starts with a single step.pdf}
}

@misc{crossref_metadata,
  author    = {{Crossref}},
  title     = {Crossref Metadata {{API}}},
  year      = {2014},
  publisher = {Crossref}
}

@article{crowWhatsYourFile2023,
  author     = {Crow, Matthew S. and Smykla, John Ortiz and O'Brien, Haydon and Cerna, Tori and Johnson, Alexander and Pisaris, Sarah and Suarez, Mariana and Wilder, Jordyn},
  title      = {What's in {{Your File Drawer}}? {{The Case}} of the {{Missing Null}} in {{Criminology}} and {{Criminal Justice}}},
  shorttitle = {What's in {{Your File Drawer}}?},
  journal    = {Crime \& Delinquency},
  year       = {2023},
  month      = nov,
  volume     = {69},
  number     = {12},
  pages      = {2574--2594},
  publisher  = {SAGE Publications Inc},
  issn       = {0011-1287},
  doi        = {10.1177/00111287221090959},
  urldate    = {2025-08-26},
  langid     = {english},
  abstract   = {Analysis of scholarship in the physical, biological, and social sciences has discovered that peer-reviewed journals publish a much larger proportion of articles with statistically significant findings compared to articles with null results. Publication bias in criminology and criminal justice (CCJ) has received very little attention, however. The current study is an exploratory analysis of research in leading CCJ journals across 2 years to determine the current state of null findings in contemporary CCJ scholarship. Our findings are consistent with studies in other disciplines; null results are rare in leading CCJ journals. We explore the context of our findings, outline the importance of examining publication bias to improve CCJ research and better inform policy, and discuss the limitations of our approach.},
  file       = {/home/michaelb/Zotero/storage/BQEL6D79/Crow et al. - 2023 - What’s in Your File Drawer The Case of the Missing Null in Criminology and Criminal Justice.pdf}
}

@article{cruwellSevenEasySteps2019,
  title = {Seven {{Easy Steps}} to {{Open Science}}: {{An Annotated Reading List}}},
  shorttitle = {Seven {{Easy Steps}} to {{Open Science}}},
  author = {Cr{\"u}well, Sophia and van Doorn, Johnny and Etz, Alexander and Makel, Matthew C. and Moshontz, Hannah and Niebaum, Jesse C. and Orben, Amy and Parsons, Sam and {Schulte-Mecklenbeck}, Michael},
  year = {2019},
  month = dec,
  journal = {Zeitschrift f{\"u}r Psychologie},
  volume = {227},
  number = {4},
  pages = {237--248},
  publisher = {Hogrefe Publishing},
  issn = {2151-2604},
  doi = {10.1027/2151-2604/a000387},
  urldate = {2025-08-28},
  abstract = {Abstract. The open science movement is rapidly changing the scientific landscape. Because exact definitions are often lacking and reforms are constantly evolving, accessible guides to open science ...},
  copyright = {Distributed as a Hogrefe OpenMind article under the license CC BY 4.0 (https://creativecommons.org/licenses/by/4.0)},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/CBBSESPG/a000387.html}
}

@manual{csardiCrayonColoredTerminal2024,
  type = {Manual},
  title = {{{crayon}}: {{Colored}} Terminal Output},
  author = {Cs{\'a}rdi, G{\'a}bor},
  year = {2024},
  url = {https://CRAN.R-project.org/package=crayon}
}

@inproceedings{davisRelationshipPrecisionRecallROC2006a,
  author    = {Davis, Jesse and Goadrich, Mark},
  title     = {The Relationship between {{Precision-Recall}} and {{ROC}} Curves},
  booktitle = {Proceedings of the 23rd International Conference on {{Machine}} Learning  - {{ICML}} '06},
  year      = {2006},
  pages     = {233--240},
  publisher = {ACM Press},
  address   = {Pittsburgh, Pennsylvania},
  isbn      = {978-1-59593-383-6},
  doi       = {10.1145/1143844.1143874},
  urldate   = {2025-08-22},
  langid    = {english},
  copyright = {https://www.acm.org/publications/policies/copyright\_policy\#Background},
  file      = {/home/michaelb/Zotero/storage/M6PLLYIQ/Davis and Goadrich - 2006 - The relationship between Precision-Recall and ROC curves.pdf}
}

@article{delgado-quirosWhyAreThese2024,
  title = {Why Are These Publications Missing? {{Uncovering}} the Reasons behind the Exclusion of Documents in Free-Access Scholarly Databases},
  shorttitle = {Why Are These Publications Missing?},
  author = {{Delgado-Quir{\'o}s}, Lorena and Aguillo, Isidro F. and {Mart{\'i}n-Mart{\'i}n}, Alberto and {Delgado L{\'o}pez-C{\'o}zar}, Emilio and {Ordu{\~n}a-Malea}, Enrique and Ortega, Jos{\'e} Luis},
  year = {2024},
  journal = {Journal of the Association for Information Science and Technology},
  volume = {75},
  number = {1},
  pages = {43--58},
  issn = {2330-1643},
  doi = {10.1002/asi.24839},
  urldate = {2025-07-15},
  abstract = {This study analyses the coverage of seven free-access bibliographic databases (Crossref, Dimensions---non-subscription version, Google Scholar, Lens, Microsoft Academic, Scilit, and Semantic Scholar) to identify the potential reasons that might cause the exclusion of scholarly documents and how they could influence coverage. To do this, 116 k randomly selected bibliographic records from Crossref were used as a baseline. API endpoints and web scraping were used to query each database. The results show that coverage differences are mainly caused by the way each service builds their databases. While classic bibliographic databases ingest almost the exact same content from Crossref (Lens and Scilit miss 0.1\% and 0.2\% of the records, respectively), academic search engines present lower coverage (Google Scholar does not find: 9.8\%, Semantic Scholar: 10\%, and Microsoft Academic: 12\%). Coverage differences are mainly attributed to external factors, such as web accessibility and robot exclusion policies (39.2\%--46\%), and internal requirements that exclude secondary content (6.5\%--11.6\%). In the case of Dimensions, the only classic bibliographic database with the lowest coverage (7.6\%), internal selection criteria such as the indexation of full books instead of book chapters (65\%) and the exclusion of secondary content (15\%) are the main motives of missing publications.},
  copyright = {{\copyright} 2023 The Authors. Journal of the Association for Information Science and Technology published by Wiley Periodicals LLC on behalf of Association for Information Science and Technology.},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/FH9ESWAL/Delgado-Quirós et al. - 2024 - Why are these publications missing Uncovering the reasons behind the exclusion of documents in free.pdf;/home/michaelb/Zotero/storage/98TP82S8/asi.html}
}

@misc{dennisTimBernersLee2023,
  author       = {Dennis, Michael Aaron},
  title        = {Tim {{Berners-Lee}}},
  journal      = {Encyclopedia Britannica},
  year         = {2023},
  month        = dec,
  urldate      = {2024-03-11},
  howpublished = {https://www.britannica.com/biography/Tim-Berners-Lee},
  langid       = {english},
  abstract     = {Tim Berners-Lee, British computer scientist, generally credited as the inventor of the World Wide Web. In 2004 he was knighted by Queen Elizabeth II and received the Millennium Technology Prize from the Finnish Technology Award Foundation. In 2007 he was awarded the Draper Prize by the National Academy of Engineering.},
  file         = {/home/michaelb/Zotero/storage/EACBMKS2/Tim-Berners-Lee.html}
}

@article{dickelDigitaleInklusionZur2015,
  author     = {Dickel, Sascha and Franzen, Martina},
  title      = {{Digitale Inklusion: Zur sozialen {\"O}ffnung des Wissenschaftssystems / Digital Inclusion: The Social Implications of Open Science}},
  shorttitle = {{Digitale Inklusion}},
  journal    = {Zeitschrift f{\"u}r Soziologie},
  year       = {2015},
  month      = oct,
  volume     = {44},
  number     = {5},
  pages      = {330--347},
  publisher  = {De Gruyter Oldenbourg},
  issn       = {2366-0325},
  doi        = {10.1515/zfsoz-2015-0503},
  urldate    = {2024-12-15},
  langid     = {ngerman},
  copyright  = {De Gruyter expressly reserves the right to use all content for commercial text and data mining within the meaning of Section 44b of the German Copyright Act.},
  keywords   = {Inclusion: Citizen Science,Societal Differentiation,Sociology of Science,Web 2.0},
  abstract   = {From the perspective of systems theory, science is a prototype of a self-referential functional system that maintains social distance to the public. In functionally differentiated societies, science maintains a strict regime of inclusion, which is closely tied to the professional role of the scientist as someone who produces and acquires knowledge. We suggest that the digital revolution is generating novel modes of inclusion. These take the form of functionalized subroles in which the professional role of the scientist is disassembled. By proposing a socio-theoretically informed characterization of these new modes of inclusion we aim to meet two different goals: The first is to overcome the theoretical conservatism of differentiation theory, in which diagnoses of the social openness of science are solely interpreted as semantic surface phenomena. The second is to achieve analytical distance to a societal discourse that describes these new modes of inclusion as examples of a successful democratization of science.},
  file       = {/home/michaelb/Zotero/storage/UXJ5I59H/Dickel and Franzen - 2015 - Digitale Inklusion Zur sozialen Öffnung des Wissenschaftssystems  Digital Inclusion The Social Im.pdf}
}

@book{diekmannEmpirischeSozialforschungGrundlagen2022,
  author     = {Diekmann, Andreas},
  title      = {{Empirische Sozialforschung: Grundlagen, Methoden, Anwendungen}},
  shorttitle = {{Empirische Sozialforschung}},
  series     = {{rowohlts enzyklop{\"a}die}},
  edition    = {15. Auflage, vollst{\"a}ndig {\"u}berarbeitete und erweiterte Neuausgabe August 2007, Originalausgabe},
  year       = {2022},
  publisher  = {Rowohlt Taschenbuch Verlag},
  address    = {Reinbek bei Hamburg},
  isbn       = {978-3-499-55678-4},
  langid     = {german}
}

@inbook{diekmannII2Probleme2022,
  author       = {Diekmann, Andreas},
  collaborator = {Diekmann, Andreas},
  title        = {{II. 2. Probleme der Pr{\"u}fung von Hypothesen}},
  booktitle    = {{Empirische Sozialforschung: Grundlagen, Methoden, Anwendungen}},
  series       = {{rowohlts enzyklop{\"a}die}},
  edition      = {15. Auflage, vollst{\"a}ndig {\"u}berarbeitete und erweiterte Neuausgabe August 2007, Originalausgabe},
  year         = {2022},
  publisher    = {Rowohlt Taschenbuch Verlag},
  address      = {Reinbek bei Hamburg},
  isbn         = {978-3-499-55678-4},
  langid       = {german}
}

@article{dienlinAgendaOpenScience2021,
  author   = {Dienlin, Tobias and Johannes, Niklas and Bowman, Nicholas David and Masur, Philipp K and Engesser, Sven and K{\"u}mpel, Anna Sophie and Lukito, Josephine and Bier, Lindsey M and Zhang, Renwen and Johnson, Benjamin K and Huskey, Richard and Schneider, Frank M and Breuer, Johannes and Parry, Douglas A and Vermeulen, Ivar and Fisher, Jacob T and Banks, Jaime and Weber, Ren{\'e} and Ellis, David A and Smits, Tim and Ivory, James D and Trepte, Sabine and McEwan, Bree and Rinke, Eike Mark and Neubaum, German and Winter, Stephan and Carpenter, Christopher J and Kr{\"a}mer, Nicole and Utz, Sonja and Unkel, Julian and Wang, Xiaohui and Davidson, Brittany I and Kim, Nuri and Won, Andrea Stevenson and Domahidi, Emese and Lewis, Neil A and {de Vreese}, Claes},
  title    = {An {{Agenda}} for {{Open Science}} in {{Communication}}},
  journal  = {Journal of Communication},
  year     = {2021},
  month    = feb,
  volume   = {71},
  number   = {1},
  pages    = {1--26},
  issn     = {0021-9916},
  doi      = {10.1093/joc/jqz052},
  urldate  = {2024-12-16},
  abstract = {In the last 10 years, many canonical findings in the social sciences appear unreliable. This so-called ``replication crisis'' has spurred calls for open science practices, which aim to increase the reproducibility, replicability, and generalizability of findings. Communication research is subject to many of the same challenges that have caused low replicability in other fields. As a result, we propose an agenda for adopting open science practices in Communication, which includes the following seven suggestions: (1) publish materials, data, and code; (2) preregister studies and submit registered reports; (3) conduct replications; (4) collaborate; (5) foster open science skills; (6) implement Transparency and Openness Promotion Guidelines; and (7) incentivize open science practices. Although in our agenda we focus mostly on quantitative research, we also reflect on open science practices relevant to qualitative research. We conclude by discussing potential objections and concerns associated with open science practices.},
  file     = {/home/michaelb/Zotero/storage/GH7PZSVG/Dienlin et al. - 2021 - An Agenda for Open Science in Communication.pdf;/home/michaelb/Zotero/storage/FUUT9S83/5803422.html}
}

@article{dollBayesianModelSelection2019,
  author   = {Doll, Jason C. and Jacquemin, Stephen J.},
  title    = {Bayesian {{Model Selection}} in {{Fisheries Management}} and {{Ecology}}},
  journal  = {Journal of Fish and Wildlife Management},
  year     = {2019},
  month    = sep,
  volume   = {10},
  number   = {2},
  pages    = {691--707},
  issn     = {1944-687X},
  doi      = {10.3996/042019-JFWM-024},
  urldate  = {2024-12-13},
  abstract = {Researchers often test ecological hypotheses relating to a myriad of questions ranging from assemblage structure, population dynamics, demography, abundance, growth rate, and more using mathematical models that explain trends in data. To aid in the evaluation process when faced with competing hypotheses, we employ statistical methods to evaluate the validity of these multiple hypotheses with the goal of deriving the most robust conclusions possible. In fisheries management and ecology, frequentist methodologies have largely dominated this approach. However, in recent years, researchers have increasingly used Bayesian inference methods to estimate model parameters. Our aim with this perspective is to provide the practicing fisheries ecologist with an accessible introduction to Bayesian model selection. Here we discuss Bayesian inference methods for model selection in the context of fisheries management and ecology with empirical examples to guide researchers in the use of these methods. In this perspective we discuss three methods for selecting among competing models. For comparing two models we discuss Bayes factor and for more complex models we discuss Watanabe--Akaike information criterion and leave-one-out cross-validation. We also describe what kinds of information to report when conducting Bayesian inference. We conclude this review with a discussion of final thoughts about these model selection techniques.},
  file     = {/home/michaelb/Zotero/storage/Y57Q6CN3/Doll and Jacquemin - 2019 - Bayesian Model Selection in Fisheries Management and Ecology.pdf}
}

@article{domingosFewUsefulThings2012,
  author   = {Domingos, Pedro},
  title    = {A Few Useful Things to Know about Machine Learning},
  journal  = {Communications of the ACM},
  year     = {2012},
  month    = oct,
  volume   = {55},
  number   = {10},
  pages    = {78--87},
  issn     = {0001-0782, 1557-7317},
  doi      = {10.1145/2347736.2347755},
  urldate  = {2025-07-28},
  langid   = {english},
  abstract = {Tapping into the "folk knowledge" needed to advance machine learning applications.},
  file     = {/home/michaelb/Zotero/storage/WPAVWG3G/Domingos - 2012 - A few useful things to know about machine learning.pdf}
}

@article{domingosOptimalitySimpleBayesian1997,
  author   = {Domingos, Pedro and Pazzani, Michael},
  title    = {On the {{Optimality}} of the {{Simple Bayesian Classifier}} under {{Zero-One Loss}}},
  journal  = {Machine Learning},
  year     = {1997},
  month    = nov,
  volume   = {29},
  number   = {2},
  pages    = {103--130},
  issn     = {1573-0565},
  doi      = {10.1023/A:1007413511361},
  urldate  = {2025-08-01},
  langid   = {english},
  keywords = {Bayesian Inference,Bayesian Network,Categorization,induction with attribute dependences,Learning algorithms,Machine Learning,naive Bayesian classifier,optimal classification,Simple Bayesian classifier,Statistical Learning,zero-one loss},
  abstract = {The simple Bayesian classifier is known to be optimal when attributes are independent given the class, but the question of whether other sufficient conditions for its optimality exist has so far not been explored. Empirical results showing that it performs surprisingly well in many domains containing clear attribute dependences suggest that the answer to this question may be positive. This article shows that, although the Bayesian classifier's probability estimates are only optimal under quadratic loss if the independence assumption holds, the classifier itself can be optimal under zero-one loss (misclassification rate) even when this assumption is violated by a wide margin. The region of quadratic-loss optimality of the Bayesian classifier is in fact a second-order infinitesimal fraction of the region of zero-one optimality. This implies that the Bayesian classifier has a much greater range of applicability than previously thought. For example, in this article it is shown to be optimal for learning conjunctions and disjunctions, even though they violate the independence assumption. Further, studies in artificial domains show that it will often outperform more powerful classifiers for common training set sizes and numbers of attributes, even if its bias is a priori much less appropriate to the domain. This article's results also imply that detecting attribute dependence is not necessarily the best way to extend the Bayesian classifier, and this is also verified empirically.},
  file     = {/home/michaelb/Zotero/storage/32SGVIP8/Domingos and Pazzani - 1997 - On the Optimality of the Simple Bayesian Classifier under Zero-One Loss.pdf}
}

@article{doyenBehavioralPrimingIts2012,
  author     = {Doyen, St{\'e}phane and Klein, Olivier and Pichon, Cora-Lise and Cleeremans, Axel},
  title      = {Behavioral {{Priming}}: {{It}}'s {{All}} in the {{Mind}}, but {{Whose Mind}}?},
  shorttitle = {Behavioral {{Priming}}},
  journal    = {PLoS ONE},
  year       = {2012},
  month      = jan,
  volume     = {7},
  number     = {1},
  pages      = {e29081},
  issn       = {1932-6203},
  doi        = {10.1371/journal.pone.0029081},
  pmcid      = {PMC3261136},
  pmid       = {22279526},
  urldate    = {2025-08-23},
  abstract   = {The perspective that behavior is often driven by unconscious determinants has become widespread in social psychology. Bargh, Chen, and Burrows' (1996) famous study, in which participants unwittingly exposed to the stereotype of age walked slower when exiting the laboratory, was instrumental in defining this perspective. Here, we present two experiments aimed at replicating the original study. Despite the use of automated timing methods and a larger sample, our first experiment failed to show priming. Our second experiment was aimed at manipulating the beliefs of the experimenters: Half were led to think that participants would walk slower when primed congruently, and the other half was led to expect the opposite. Strikingly, we obtained a walking speed effect, but only when experimenters believed participants would indeed walk slower. This suggests that both priming and experimenters' expectations are instrumental in explaining the walking speed effect. Further, debriefing was suggestive of awareness of the primes. We conclude that unconscious behavioral priming is real, while real, involves mechanisms different from those typically assumed to cause the effect.},
  file       = {/home/michaelb/Zotero/storage/NIAWLJ4F/Doyen et al. - 2012 - Behavioral Priming It's All in the Mind, but Whose Mind.pdf}
}

@article{dunleavyUseMisuseClassical2021,
  title = {The {{Use}} and {{Misuse}} of {{Classical Statistics}}: {{A Primer}} for {{Social Workers}}},
  author = {Dunleavy, Daniel J. and Lacasse, Jeffrey R.},
  year = {2021},
  journal = {Research on Social Work Practice},
  volume = {31},
  number = {5},
  pages = {438--453},
  issn = {1049-7315},
  doi = {10.1177/10497315211008247},
  urldate = {2024-12-13},
  file = {/home/michaelb/Zotero/storage/NI8KLN2F/10497315211008247.html}
}

@incollection{dunnChapter9Models2018,
  title = {Chapter 9: {{Models}} for {{Proportions}}: {{Binomial GLMs}}},
  shorttitle = {Chapter 9},
  booktitle = {Generalized {{Linear Models With Examples}} in {{R}}},
  author = {Dunn, Peter K. and Smyth, Gordon K.},
  editor = {Dunn, Peter K. and Smyth, Gordon K.},
  year = {2018},
  pages = {333--369},
  publisher = {Springer},
  address = {New York, NY},
  doi = {10.1007/978-1-4419-0118-7_9},
  urldate = {2025-09-03},
  abstract = {Chapters~5--8 develop the theory of GLMs in general. This chapter focuses on one specific GLM: the binomial GLM. The binomial GLM is the most commonly used of all GLMs. It is used to model proportions, where the proportions are obtained as the number of `positive' cases out of a total number of independent cases. We first compile important information about the binomial distribution (Sect.\,9.2), then discuss the common link functions used for binomial GLMs (Sect.\,9.3), and the threshold interpretation of the link function (Sect.\,9.4). We then discuss model interpretation in terms of odds (Sect.\,9.5), and how binomial GLMs can be used to estimate the median effective dose ED50 (Sect.\,9.6).},
  isbn = {978-1-4419-0118-7},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/A3XYBB8I/Dunn and Smyth - 2018 - Chapter 9 Models for Proportions Binomial GLMs.pdf}
}

@manual{eddelbuettelDigestCreateCompact2024,
  type = {Manual},
  title = {Digest: {{Create}} Compact Hash Digests of {{R}} Objects},
  author = {Eddelbuettel, Dirk},
  year = {2024},
  doi = {10.32614/CRAN.package.digest}
}

@article{egbuchulemBASICSSAMPLESIZE2023,
  title = {The {{Basics}} of {{Sample Size Estimation}}: {{An Editor}}'s {{View}}},
  shorttitle = {The {{Basics}} of {{Sample Size Estimation}}},
  author = {Egbuchulem, K. I.},
  year = {2023},
  month = jun,
  journal = {Annals of Ibadan Postgraduate Medicine},
  volume = {21},
  number = {1},
  pages = {5--10},
  issn = {1597-1627},
  urldate = {2025-08-03},
  pmcid = {PMC10388427},
  pmid = {37528816}
}

@article{eisenbergAddressingZerosProblem2015,
  author     = {Eisenberg, Theodore and Eisenberg, Thomas and Wells, Martin T. and Zhang, Min},
  title      = {Addressing the {{Zeros Problem}}: {{Regression Models}} for {{Outcomes}} with a {{Large Proportion}} of {{Zeros}}, with an {{Application}} to {{Trial Outcomes}}},
  shorttitle = {Addressing the {{Zeros Problem}}},
  journal    = {Journal of Empirical Legal Studies},
  year       = {2015},
  volume     = {12},
  number     = {1},
  pages      = {161--186},
  issn       = {1740-1461},
  doi        = {10.1111/jels.12068},
  urldate    = {2025-08-07},
  copyright  = {{\copyright} 2015, Copyright the Authors. Journal compilation {\copyright} 2015, Cornell Law School and Wiley Periodicals, Inc},
  langid     = {english},
  abstract   = {In law-related and other social science contexts, researchers need to account for data with an excess number of zeros. In addition, dollar damages in legal cases also often are skewed. This article reviews various strategies for dealing with this data type. Tobit models are often applied to deal with the excess number of zeros, but these are more appropriate in cases of true censoring (e.g., when all negative values are recorded as zeros) and less appropriate when zeros are in fact often observed as the amount awarded. Heckman selection models are another methodology that is applied in this setting, yet they were developed for potential outcomes rather than actual ones. Two-part models account for actual outcomes and avoid the collinearity problems that often attend selection models. A two-part hierarchical model is developed here that accounts for both the skewed, zero-inflated nature of damages data and the fact that punitive damage awards may be correlated within case type, jurisdiction, or time. Inference is conducted using a Markov chain Monte Carlo sampling scheme. Tobit models, selection models, and two-part models are fit to two punitive damage awards data sets and the results are compared. We illustrate that the nonsignificance of coefficients in a selection model can be a consequence of collinearity, whereas that does not occur with two-part models.},
  file       = {/home/michaelb/Zotero/storage/CIRSHF6E/Eisenberg et al. - 2015 - Addressing the Zeros Problem Regression Models for Outcomes with a Large Proportion of Zeros, with.pdf;/home/michaelb/Zotero/storage/ZTVUI5RR/jels.html}
}

@article{eisendInternetNewMedium2002,
  author     = {Eisend, Martin},
  title      = {The {{Internet}} as a New Medium for the Sciences? {{The}} Effects of {{Internet}} Use on Traditional Scientific Communication Media among Social Scientists in {{Germany}}},
  shorttitle = {The {{Internet}} as a New Medium for the Sciences?},
  journal    = {Online Information Review},
  year       = {2002},
  month      = jan,
  volume     = {26},
  number     = {5},
  pages      = {307--317},
  publisher  = {MCB UP Ltd},
  issn       = {1468-4527},
  doi        = {10.1108/14684520210447877},
  urldate    = {2024-12-13},
  keywords   = {Communications,Electronic publishing,Internet,Publishing},
  abstract   = {Scientific communication takes place within two main fields: research and publication. Whereas twentieth century audio-visual media did not become established in the scientific communication system, the Internet, with its variety of communication options, is able to enter both fields of communication and has even revolutionised this communication system to some extent. The investigation of this relationship is based on data from a study of social scientists taken in Berlin in autumn 1999. The Internet substitutes written communication media and complements forms of spoken communication in the field of research. It also complements traditional publisher-oriented forms of publication and is even a substitute for works that have previously avoided publication. Therefore, the Internet should not be regarded as a new alternative to traditional and institutionalised structures of communication of scientific publications, as it has already become institutionalised in the field of research as a medium of interpersonal communication.},
  file       = {/home/michaelb/Zotero/storage/BQ76I2XY/Eisend - 2002 - The Internet as a new medium for the sciences The effects of Internet use on traditional scientific.pdf}
}

@article{evansImprovingEvidencebasedPractice2023,
  author     = {Evans, Thomas Rhys and Branney, Peter and Clements, Andrew and Hatton, Ella},
  title      = {Improving Evidence-Based Practice through Preregistration of Applied Research: {{Barriers}} and Recommendations},
  shorttitle = {Improving Evidence-Based Practice through Preregistration of Applied Research},
  journal    = {Accountability in Research},
  year       = {2023},
  month      = feb,
  volume     = {30},
  number     = {2},
  pages      = {88--108},
  publisher  = {Taylor \& Francis},
  issn       = {0898-9621},
  doi        = {10.1080/08989621.2021.1969233},
  pmid       = {34396837},
  urldate    = {2024-11-06},
  keywords   = {accountability,Applied research,gray literature,open science,preregistration,transparency},
  abstract   = {Preregistration is the practice of publicly publishing plans on central components of the research process before access to, or collection, of data. Within the context of the replication crisis, open science practices like preregistration have been pivotal in facilitating greater transparency in research. However, such practices have been applied nearly exclusively to basic academic research, with rare consideration of the relevance to applied and consultancy-based research. This is particularly problematic as such research is typically reported with very low levels of transparency and accountability despite being disseminated as influential gray literature to inform practice. Evidence-based practice is best served by an appreciation of multiple sources of quality evidence, thus the current review considers the potential of preregistration to improve both the accessibility and credibility of applied research toward more rigorous evidence-based practice. The current three-part review outlines, first, the opportunities of preregistration for applied research, and second, three barriers -- practical challenges, stakeholder roles, and the suitability of preregistration. Last, this review makes four recommendations to overcome these barriers and maximize the opportunities of preregistration for academics, industry, and the structures they are held within -- changes to preregistration templates, new types of templates, education and training, and recognition and structural changes.},
  file       = {/home/michaelb/Zotero/storage/CYN3BKSJ/Evans et al. - 2023 - Improving evidence-based practice through preregistration of applied research Barriers and recommen.pdf}
}

@manual{falbelBonsaiModelWrappers2025,
  author = {Falbel, Daniel and Damiani, Athos and Hogervorst, Roel M. and Kuhn, Max and Couch, Simon and Hvitfeldt, Emil},
  title  = {Bonsai: {{Model}} Wrappers for Tree-Based Models},
  type   = {Manual},
  year   = {2025},
  doi    = {10.32614/CRAN.package.bonsai}
}

@article{FalsePositivePsychologyUndisclosed,
  title = {False-{{Positive Psychology}}: {{Undisclosed Flexibility}} in {{Data Collection}} and {{Analysis Allows Presenting Anything}} as {{Significant}}},
  shorttitle = {False-{{Positive Psychology}}},
  author = {Simmons, Joseph P. and Nelson, Leif D. and Simonsohn, Uri},
  year = {2011},
  month = nov,
  journal = {Psychological Science},
  volume = {22},
  number = {11},
  pages = {1359--1366},
  issn = {0956-7976},
  doi = {10.1177/0956797611417632},
  urldate = {2024-12-15},
  file = {/home/michaelb/Zotero/storage/JQDNBLUB/0956797611417632.html}
}

@article{fanelliNegativeResultsAre2012,
  author   = {Fanelli, Daniele},
  title    = {Negative Results Are Disappearing from Most Disciplines and Countries},
  journal  = {Scientometrics},
  year     = {2012},
  month    = mar,
  volume   = {90},
  number   = {3},
  pages    = {891--904},
  issn     = {1588-2861},
  doi      = {10.1007/s11192-011-0494-7},
  urldate  = {2024-12-15},
  langid   = {english},
  keywords = {Bias,Competition,Misconduct,Publication,Publish or perish,Research evaluation},
  abstract = {Concerns that the growing competition for funding and citations might distort science are frequently discussed, but have not been verified directly. Of the hypothesized problems, perhaps the most worrying is a worsening of positive-outcome bias. A system that disfavours negative results not only distorts the scientific literature directly, but might also discourage high-risk projects and pressure scientists to fabricate and falsify their data. This study analysed over 4,600 papers published in all disciplines between 1990 and 2007, measuring the frequency of papers that, having declared to have ``tested'' a hypothesis, reported a positive support for it. The overall frequency of positive supports has grown by over 22\% between 1990 and 2007, with significant differences between disciplines and countries. The increase was stronger in the social and some biomedical disciplines. The United States had published, over the years, significantly fewer positive results than Asian countries (and particularly Japan) but more than European countries (and in particular the United Kingdom). Methodological artefacts cannot explain away these patterns, which support the hypotheses that research is becoming less pioneering and/or that the objectivity with which results are produced and published is decreasing.},
  file     = {/home/michaelb/Zotero/storage/LLSK77JK/Fanelli - 2012 - Negative results are disappearing from most disciplines and countries.pdf}
}

@article{fawcettIntroductionROCAnalysis2006,
  author   = {Fawcett, Tom},
  title    = {An Introduction to {{ROC}} Analysis},
  journal  = {Pattern Recognition Letters},
  series   = {{{ROC Analysis}} in {{Pattern Recognition}}},
  year     = {2006},
  month    = jun,
  volume   = {27},
  number   = {8},
  pages    = {861--874},
  issn     = {0167-8655},
  doi      = {10.1016/j.patrec.2005.10.010},
  urldate  = {2025-08-22},
  keywords = {Classifier evaluation,Evaluation metrics,ROC analysis},
  abstract = {Receiver operating characteristics (ROC) graphs are useful for organizing classifiers and visualizing their performance. ROC graphs are commonly used in medical decision making, and in recent years have been used increasingly in machine learning and data mining research. Although ROC graphs are apparently simple, there are some common misconceptions and pitfalls when using them in practice. The purpose of this article is to serve as an introduction to ROC graphs and as a guide for using them in research.},
  file     = {/home/michaelb/Zotero/storage/TI9NNRAY/Fawcett - 2006 - An introduction to ROC analysis.pdf}
}

@article{fergusonSurveyOpenScience2023,
  author    = {Ferguson, Joel and Littman, Rebecca and Christensen, Garret and Paluck, Elizabeth Levy and Swanson, Nicholas and Wang, Zenan and Miguel, Edward and Birke, David and Pezzuto, John-Henry},
  title     = {Survey of Open Science Practices and Attitudes in the Social Sciences},
  journal   = {Nature Communications},
  year      = {2023},
  month     = sep,
  volume    = {14},
  number    = {1},
  pages     = {5401},
  publisher = {Nature Publishing Group},
  issn      = {2041-1723},
  doi       = {10.1038/s41467-023-41111-1},
  urldate   = {2024-12-15},
  copyright = {2023 The Author(s)},
  langid    = {english},
  keywords  = {Economics,Human behaviour,Interdisciplinary studies,Psychology,Sociology},
  abstract  = {Open science practices such as posting data or code and pre-registering analyses are increasingly prescribed and debated in the applied sciences, but the actual popularity and lifetime usage of these practices remain unknown. This study provides an assessment of attitudes toward, use of, and perceived norms regarding open science practices from a sample of authors published in top-10 (most-cited) journals and PhD students in top-20 ranked North American departments from four major social science disciplines: economics, political science, psychology, and sociology. We observe largely favorable private attitudes toward widespread lifetime usage (meaning that a researcher has used a particular practice at least once) of open science practices. As of 2020, nearly 90\% of scholars had ever used at least one such practice. Support for posting data or code online is higher (88\% overall support and nearly at the ceiling in some fields) than support for pre-registration (58\% overall). With respect to norms, there is evidence that the scholars in our sample appear to underestimate the use of open science practices in their field. We also document that the reported lifetime prevalence of open science practices increased from 49\% in 2010 to 87\% a decade later.},
  file      = {/home/michaelb/Zotero/storage/NYJJF8KD/Ferguson et al. - 2023 - Survey of open science practices and attitudes in the social sciences.pdf}
}

% NOTE(review): duplicate record. This entry has the same DOI
% (10.1038/s41467-023-41111-1) and the same metadata as the entry keyed
% fergusonSurveyOpenScience2023; only urldate and the attached file differ.
% If both keys are cited, the work will appear twice in the bibliography.
% Merge the two items in the reference manager and cite a single key.
@article{fergusonSurveyOpenScience2023a,
  title = {Survey of Open Science Practices and Attitudes in the Social Sciences},
  author = {Ferguson, Joel and Littman, Rebecca and Christensen, Garret and Paluck, Elizabeth Levy and Swanson, Nicholas and Wang, Zenan and Miguel, Edward and Birke, David and Pezzuto, John-Henry},
  year = {2023},
  month = sep,
  journal = {Nature Communications},
  volume = {14},
  number = {1},
  pages = {5401},
  publisher = {Nature Publishing Group},
  issn = {2041-1723},
  doi = {10.1038/s41467-023-41111-1},
  urldate = {2025-08-01},
  abstract = {Open science practices such as posting data or code and pre-registering analyses are increasingly prescribed and debated in the applied sciences, but the actual popularity and lifetime usage of these practices remain unknown. This study provides an assessment of attitudes toward, use of, and perceived norms regarding open science practices from a sample of authors published in top-10 (most-cited) journals and PhD students in top-20 ranked North American departments from four major social science disciplines: economics, political science, psychology, and sociology. We observe largely favorable private attitudes toward widespread lifetime usage (meaning that a researcher has used a particular practice at least once) of open science practices. As of 2020, nearly 90\% of scholars had ever used at least one such practice. Support for posting data or code online is higher (88\% overall support and nearly at the ceiling in some fields) than support for pre-registration (58\% overall). With respect to norms, there is evidence that the scholars in our sample appear to underestimate the use of open science practices in their field. We also document that the reported lifetime prevalence of open science practices increased from 49\% in 2010 to 87\% a decade later.},
  copyright = {2023 The Author(s)},
  langid = {english},
  keywords = {Economics,Human behaviour,Interdisciplinary studies,Psychology,Sociology},
  file = {/home/michaelb/Zotero/storage/KJ9C8N8L/Ferguson et al. - 2023 - Survey of open science practices and attitudes in the social sciences.pdf}
}

@article{fessingerStateOpenScience2025,
  title = {The State of Open Science in the Field of Psychology and Law},
  author = {Fessinger, Melanie B. and McAuliff, Bradley D. and Perillo, Anthony D.},
  year = {2025},
  journal = {Law and Human Behavior},
  volume = {49},
  number = {1},
  pages = {54--70},
  publisher = {Educational Publishing Foundation},
  address = {US},
  issn = {1573-661X},
  doi = {10.1037/lhb0000592},
  abstract = {Objective: We conducted a survey to catalog the state of open science in the field of psychology and law. We addressed four major questions: (a) How do psycholegal researchers define open science? (b) How do psycholegal researchers perceive open science? (c) How often do psycholegal researchers use various open science practices? and (d) What barriers, if any, do psycholegal researchers face or expect to face when implementing open science practices? Hypotheses: We did not make specific hypotheses given the exploratory and descriptive nature of the study. Method: We surveyed 740 psychology and law researchers (45\% faculty, 64\% doctoral degree, 66\% women, and 85\% White/non-Hispanic) about their perceptions of and experiences with open science using a mixed-methods design. They defined open science in their own words, described their opinion of the movement, indicated their experiences with any open science practices in their own work (i.e., preregistration, registered reports, open materials, open data, preprints, open access, and open peer review), and identified any barriers or concerns they faced in implementing open science practices. Results: A majority of respondents had wholly positive (60\%) or mostly positive (28\%) perceptions of open science. Most respondents (58\%) had participated in at least one open science practice; however, fewer than half (44\%) had an account on the Open Science Framework or similar repository. The most common barriers mentioned about implementing open science practices were concerns about specific practices (42\%), lacking knowledge (24\%), and requiring more time, effort, or resources (16\%). Conclusions: Like those in other disciplines, psychology and law researchers hold generally positive perceptions of open science that do not completely align with their reported use of specific practices. Overcoming perceived barriers to open science will require education, resources, open discourse, and collaborative problem solving.},
  copyright = {PsycInfo Database Record (c) 2025 APA, all rights reserved},
  keywords = {Collaboration,Forensic Psychology,Laws,Open Science,Problem Solving},
  file = {/home/michaelb/Zotero/storage/2JHRIWN3/Fessinger - 2025 - The State of Open Science in the Field of Psychology and Law.pdf;/home/michaelb/Zotero/storage/8CMYULUX/Fessinger et al. - 2025 - The state of open science in the field of psychology and law.pdf}
}

@article{figueroaPredictingSampleSize2012,
  author    = {Figueroa, Rosa L. and {Zeng-Treitler}, Qing and Kandula, Sasikiran and Ngo, Long H.},
  title     = {Predicting Sample Size Required for Classification Performance},
  journal   = {BMC Medical Informatics and Decision Making},
  year      = {2012},
  month     = dec,
  volume    = {12},
  number    = {1},
  pages     = {1--10},
  publisher = {BioMed Central},
  issn      = {1472-6947},
  doi       = {10.1186/1472-6947-12-8},
  urldate   = {2024-12-16},
  copyright = {2012 Figueroa et al; licensee BioMed Central Ltd.},
  langid    = {english},
  abstract  = {Supervised learning methods need annotated data in order to generate efficient models. Annotated data, however, is a relatively scarce resource and can be expensive to obtain. For both passive and active learning methods, there is a need to estimate the size of the annotated sample required to reach a performance target. We designed and implemented a method that fits an inverse power law model to points of a given learning curve created using a small annotated training set. Fitting is carried out using nonlinear weighted least squares optimization. The fitted model is then used to predict the classifier's performance and confidence interval for larger sample sizes. For evaluation, the nonlinear weighted curve fitting method was applied to a set of learning curves generated using clinical text and waveform classification tasks with active and passive sampling methods, and predictions were validated using standard goodness of fit measures. As control we used an un-weighted fitting method. A total of 568 models were fitted and the model predictions were compared with the observed performances. Depending on the data set and sampling method, it took between 80 to 560 annotated samples to achieve mean average and root mean squared error below 0.01. Results also show that our weighted fitting method outperformed the baseline un-weighted method (p {$<$} 0.05). This paper describes a simple and effective sample size prediction algorithm that conducts weighted fitting of learning curves. The algorithm outperformed an un-weighted algorithm described in previous literature. It can help researchers determine annotation sample size for supervised machine learning.},
  file      = {/home/michaelb/Zotero/storage/P9KWW7SU/Figueroa et al. - 2012 - Predicting sample size required for classification performance.pdf}
}

@article{filomenaEmpiricalCharacterisationAgents2022,
  author   = {Filomena, Gabriele and Kirsch, Lia and Schwering, Angela and Verstegen, Judith A.},
  title    = {Empirical Characterisation of Agents' Spatial Behaviour in Pedestrian Movement Simulation},
  journal  = {Journal of Environmental Psychology},
  year     = {2022},
  month    = aug,
  volume   = {82},
  pages    = {101807},
  issn     = {0272-4944},
  doi      = {10.1016/j.jenvp.2022.101807},
  urldate  = {2025-08-07},
  keywords = {Cluster analysis,Cognitive maps,Empirically based agent-based modelling,Pedestrian movement,Route choice behaviour},
  abstract = {Route choice behaviour is a key factor in determining pedestrian movement flows throughout the urban space. Agent-based modelling, a simulation paradigm that allows modelling individual behaviour mechanisms to observe the emergence of macro-level patterns, has not employed empirical data regarding route choice behaviour in cities or accommodated heterogeneity. The aim of this paper is to present an empirically based Agent-Based Model (ABM) that accounts for behavioural heterogeneity in pedestrian route choice strategies, to simulate the movement of pedestrians in cities. We designed a questionnaire to observe to what degree people employ salient urban elements (local and global landmarks, regions, and barriers) and road costs (road distance, cumulative angular change) and to empirically characterise the agent behaviour in our ABM. We hypothesised that a heterogeneous ABM configuration based on the construction of agent typologies from empirical data would portray a more plausible picture of pedestrian movement flows than a homogeneous configuration, based on the same data, or a random configuration. The city of M{\"u}nster (DE) was used as a case study. From a sample of 301 subjects, we obtained six clusters that differed in relation to the role of global elements (distant landmarks, barriers, and regions) and meaningful local elements along the route. The random configuration directed the agents towards natural elements and the streets of the historical centre. The empirically based configurations resulted in lower pedestrian volumes along roads designed for cars (25\% decrease) but higher concentrations along the city Promenade and the lake (40\% increase); based on our knowledge, we deem these results more plausible. Minor differences were identified between the heterogeneous and homogeneous configurations. These findings indicate that the inclusion of heterogeneity does not make a difference in terms of global patterns. 
Yet, we demonstrated that simulation models of pedestrian movement in cities should be at least based on empirical data at the average sample-level to inform urban planners about areas prone to high volumes of pedestrians.},
  file     = {/home/michaelb/Zotero/storage/EC372GXV/Filomena et al. - 2022 - Empirical characterisation of agents’ spatial behaviour in pedestrian movement simulation.pdf;/home/michaelb/Zotero/storage/3FX2LDDA/S0272494422000524.html}
}

@article{finkReplicationCodeAvailability2024,
  author     = {Fink, Lukas and Marcus, Jan},
  title      = {Replication Code Availability over Time and across Fields: {{Evidence}} from the {{German Socio-Economic Panel}}},
  shorttitle = {Replication Code Availability over Time and across Fields},
  journal    = {Economic Inquiry},
  year       = {2024},
  volume     = {63},
  number     = {1},
  issn       = {1465-7295},
  doi        = {10.1111/ecin.13267},
  urldate    = {2024-12-15},
  copyright  = {{\copyright} 2024 The Author(s). Economic Inquiry published by Wiley Periodicals LLC on behalf of Western Economic Association International.},
  langid     = {english},
  keywords   = {code availability,journal policies,replication code,reproducibility,SOEP},
  abstract   = {Providing replication code is an inexpensive way to facilitate reproducibility. However, little is known about the extent of replication code provision. Therefore, we examine the availability of replication code for over 2500 peer-reviewed articles based on the German Socio-Economic Panel (SOEP), one of the most widely used datasets in economics and other social sciences. We find that only 6\% of SOEP-based studies have code available, but that this proportion has increased sharply over time. We provide evidence that the increase in code provision is driven by technological advances, individual researcher initiatives, and journal policies.},
  file       = {/home/michaelb/Zotero/storage/W33WQ3CA/Fink and Marcus - Replication code availability over time and across fields Evidence from the German Socio-Economic P.pdf;/home/michaelb/Zotero/storage/7ENSBIJZ/ecin.html}
}

@book{firebaughSevenRulesSocial2008a,
  author    = {Firebaugh, Glenn},
  title     = {Seven Rules for Social Research},
  year      = {2008},
  publisher = {Princeton University Press},
  address   = {Princeton},
  isbn      = {978-0-691-13567-0 978-0-691-12546-6 978-0-691-19043-3},
  langid    = {english}
}

@article{fischer-baumStuckNeuralEvents2013,
  author     = {{Fischer-Baum}, Simon and Gonsalves, Brian D.},
  title      = {Stuck in the {{Past}}: {{Neural Events That Predict Intrusions From Prior Trials}}},
  shorttitle = {Stuck in the {{Past}}},
  journal    = {Psychological Science},
  year       = {2013},
  month      = may,
  volume     = {24},
  number     = {5},
  pages      = {742--750},
  publisher  = {SAGE Publications Inc},
  issn       = {0956-7976},
  doi        = {10.1177/0956797612461450},
  urldate    = {2025-07-26},
  langid     = {english},
  abstract   = {Neurologically intact adults perseverate in immediate serial recall, intruding items from a previous trial into the current response. We applied the electroencephalogram/event-related-potential subsequent-memory paradigm to immediate serial recall to investigate the causes of these errors. In line with previous studies using this paradigm, results revealed that words that were correctly recalled elicited a greater frontal positivity during encoding when compared with words that were either perseverated over or not produced for some other reason. More surprisingly, differences were also found at encoding between the words perseverated into the subsequent response and words that were not perseverated. These findings support a theory stating that abnormalities in both how the current target and the previous trial are processed can contribute to perseveration errors. These results inform existing theories of immediate serial recall and theories of the control of irrelevant information.}
}

@article{formanExtensiveEmpiricalStudy2003,
  title = {An Extensive Empirical Study of Feature Selection Metrics for Text Classification},
  author = {Forman, George},
  year = {2003},
  month = mar,
  journal = {Journal of Machine Learning Research},
  volume = {3},
  pages = {1289--1305},
  issn = {1532-4435},
  abstract = {Machine learning for text classification is the cornerstone of document categorization, news filtering, document routing, and personalization. In text domains, effective feature selection is essential to make the learning task efficient and more accurate. This paper presents an empirical comparison of twelve feature selection methods (e.g. Information Gain) evaluated on a benchmark of 229 text classification problem instances that were gathered from Reuters, TREC, OHSUMED, etc. The results are analyzed from multiple goal perspectives---accuracy, F-measure, precision, and recall---since each is appropriate in different situations. The results reveal that a new feature selection metric we call 'Bi-Normal Separation' (BNS), outperformed the others by a substantial margin in most situations. This margin widened in tasks with high class skew, which is rampant in text classification problems and is particularly challenging for induction algorithms. A new evaluation methodology is offered that focuses on the needs of the data mining practitioner faced with a single dataset who seeks to choose one (or a pair of) metrics that are most likely to yield the best performance. From this perspective, BNS was the top single choice for all goals except precision, for which Information Gain yielded the best result most often. This analysis also revealed, for example, that Information Gain and Chi-Squared have correlated failures, and so they work poorly together. When choosing optimal pairs of metrics for each of the four performance goals, BNS is consistently a member of the pair---e.g., for greatest recall, the pair BNS + F1-measure yielded the best performance on the greatest number of tasks by a considerable margin.},
  file = {/home/michaelb/Zotero/storage/Y2TE2VMG/Forman - 2003 - An extensive empirical study of feature selection metrics for text classification.pdf}
}

@article{fox142OpenScience2021,
  author     = {Fox, Nick},
  title      = {142 {{Open Science}}: {{Improving Access}} and {{Reducing Bias}} in {{Science}}},
  shorttitle = {142 {{Open Science}}},
  journal    = {Journal of Animal Science},
  year       = {2021},
  month      = nov,
  volume     = {99},
  number     = {Supplement\_3},
  pages      = {75--76},
  issn       = {1525-3163},
  doi        = {10.1093/jas/skab235.136},
  urldate    = {2024-12-13},
  abstract   = {The promise of science lies in the discovery of basic knowledge, new treatments for disease and possible solutions to the world's problems. Fulfilling this promise requires confidence that the findings of published science are valid---that they represent an unbiased conclusion based on available data. In recent years, however, a ``reproducibility crisis'' has emerged indicating that published findings across research fields may be less credible than they seem, perhaps due to hidden biases in the research process. This talk will provide an overview of the key challenges that reduce the credibility and reproducibility of research and will discuss how open science practices address these challenges. Current practice is sustained by a dysfunctional incentive structure that prioritizes publication over accuracy. Changing the research culture to prioritize ``getting it right'' over ``getting it published'' requires nudges to the incentive landscape, while still fueling the engine of innovation and discovery that drives science into new domains.},
  file       = {/home/michaelb/Zotero/storage/L5KP3EZB/Fox - 2021 - 142 Open Science Improving Access and Reducing Bias in Science.pdf;/home/michaelb/Zotero/storage/73RP5SGM/6383842.html}
}

@book{francoHandbuchKarlPopper2019,
  editor = {Franco, Giuseppe},
  title = {{Handbuch Karl Popper}},
  publisher = {Springer Fachmedien},
  address = {Wiesbaden},
  year = {2019},
  isbn = {978-3-658-16238-2 978-3-658-16239-9},
  doi = {10.1007/978-3-658-16239-9},
  urldate = {2024-12-11},
  langid = {ngerman},
  copyright = {http://www.springer.com/tdm},
  keywords = {Falsifikation,Kritischer Rationalismus,Popper Karl,Positivismusstreit,Wissenschaftstheorie},
  file = {/home/michaelb/Zotero/storage/XGF9DSKG/Franco - 2019 - Handbuch Karl Popper.pdf}
}

@article{francoPublicationBiasSocial2014,
  author = {Franco, Annie and Malhotra, Neil and Simonovits, Gabor},
  title = {Publication Bias in the Social Sciences: {{Unlocking}} the File Drawer},
  shorttitle = {Publication Bias in the Social Sciences},
  journal = {Science},
  year = {2014},
  month = sep,
  volume = {345},
  number = {6203},
  pages = {1502--1505},
  publisher = {American Association for the Advancement of Science},
  doi = {10.1126/science.1255484},
  urldate = {2024-11-06},
  abstract = {We studied publication bias in the social sciences by analyzing a known population of conducted studies---221 in total---in which there is a full accounting of what is published and unpublished. We leveraged Time-sharing Experiments in the Social Sciences (TESS), a National Science Foundation--sponsored program in which researchers propose survey-based experiments to be run on representative samples of American adults. Because TESS proposals undergo rigorous peer review, the studies in the sample all exceed a substantial quality threshold. Strong results are 40 percentage points more likely to be published than are null results and 60 percentage points more likely to be written up. We provide direct evidence of publication bias and identify the stage of research production at which publication bias occurs: Authors do not write up and submit null findings.},
  file = {/home/michaelb/Zotero/storage/3INXI5Z4/Franco et al. - 2014 - Publication bias in the social sciences Unlocking the file drawer.pdf}
}

@article{freeseAdvancesTransparencyReproducibility2022,
  author = {Freese, Jeremy and Rauf, Tamkinat and Voelkel, Jan Gerrit},
  title = {Advances in Transparency and Reproducibility in the Social Sciences},
  journal = {Social Science Research},
  year = {2022},
  month = sep,
  volume = {107},
  pages = {102770},
  issn = {0049-089X},
  doi = {10.1016/j.ssresearch.2022.102770},
  urldate = {2024-12-15},
  keywords = {Open science,Reproducibility,Transparency},
  abstract = {Worries about a ``credibility crisis'' besieging science have ignited interest in research transparency and reproducibility as ways of restoring trust in published research. For quantitative social science, advances in transparency and reproducibility can be seen as a set of developments whose trajectory predates the recent alarm. We discuss several of these developments, including preregistration, data-sharing, formal infrastructure in the form of resources and policies, open access to research, and specificity regarding research contributions. We also discuss the spillovers of this predominantly quantitative effort towards transparency for qualitative research. We conclude by emphasizing the importance of mutual accountability for effective science, the essential role of openness for this accountability, and the importance of scholarly inclusiveness in figuring out the best ways for openness to be accomplished in practice.},
  file = {/home/michaelb/Zotero/storage/UTPDRL49/S0049089X2200076X.html}
}

@article{freeseReplicationSocialScience2017,
  title = {Replication in {{Social Science}}},
  author = {Freese, Jeremy and Peterson, David},
  year = {2017},
  month = jul,
  journal = {Annual Review of Sociology},
  volume = {43},
  pages = {147--165},
  publisher = {Annual Reviews},
  issn = {0360-0572, 1545-2115},
  doi = {10.1146/annurev-soc-060116-053450},
  urldate = {2024-11-06},
  abstract = {Across the medical and social sciences, new discussions about replication have led to transformations in research practice. Sociologists, however, have been largely absent from these discussions. The goals of this review are to introduce sociologists to these developments, synthesize insights from science studies about replication in general, and detail the specific issues regarding replication that occur in sociology. The first half of the article argues that a sociologically sophisticated understanding of replication must address both the ways that replication rules and conventions evolved within an epistemic culture and how those cultures are shaped by specific research challenges. The second half outlines the four main dimensions of replicability in quantitative sociology---verifiability, robustness, repeatability, and generalizability---and discusses the specific ambiguities of interpretation that can arise in each. We conclude by advocating some commonsense changes to promote replication while acknowledging the epistemic diversity of our field.},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/JEUNVQE5/Freese and Peterson - 2017 - Replication in Social Science.pdf;/home/michaelb/Zotero/storage/62Q3HELK/annurev-soc-060116-053450.html}
}

@article{freeseReplicationSocialScience2017a,
  title = {Replication in {{Social Science}}},
  author = {Freese, Jeremy and Peterson, David},
  year = {2017},
  month = jul,
  journal = {Annual Review of Sociology},
  volume = {43},
  pages = {147--165},
  publisher = {Annual Reviews},
  issn = {0360-0572, 1545-2115},
  doi = {10.1146/annurev-soc-060116-053450},
  urldate = {2024-12-15},
  abstract = {Across the medical and social sciences, new discussions about replication have led to transformations in research practice. Sociologists, however, have been largely absent from these discussions. The goals of this review are to introduce sociologists to these developments, synthesize insights from science studies about replication in general, and detail the specific issues regarding replication that occur in sociology. The first half of the article argues that a sociologically sophisticated understanding of replication must address both the ways that replication rules and conventions evolved within an epistemic culture and how those cultures are shaped by specific research challenges. The second half outlines the four main dimensions of replicability in quantitative sociology---verifiability, robustness, repeatability, and generalizability---and discusses the specific ambiguities of interpretation that can arise in each. We conclude by advocating some commonsense changes to promote replication while acknowledging the epistemic diversity of our field.},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/FZNRE6US/Freese and Peterson - 2017 - Replication in Social Science.pdf;/home/michaelb/Zotero/storage/MVZMR367/annurev-soc-060116-053450.html}
}

@article{freeseReplicationStandardsQuantitative2007,
  author = {Freese, Jeremy},
  title = {Replication {{Standards}} for {{Quantitative Social Science}}: {{Why Not Sociology}}?},
  shorttitle = {Replication {{Standards}} for {{Quantitative Social Science}}},
  journal = {Sociological Methods \& Research},
  year = {2007},
  month = nov,
  volume = {36},
  number = {2},
  pages = {153--172},
  publisher = {SAGE Publications Inc},
  issn = {0049-1241},
  doi = {10.1177/0049124107306659},
  urldate = {2024-12-15},
  langid = {english},
  abstract = {The credibility of quantitative social science benefits from policies that increase confidence that results reported by one researcher can be verified by others. Concerns about replicability have increased as the scale and sophistication of analyses increase the possible dependence of results on subtle analytic decisions and decrease the extent to which published articles contain full descriptions of methods. The author argues that sociology should adopt standards regarding replication that minimize its conceptualization as an ethical and individualistic matter and advocates for a policy in which authors use independent online archives to deposit the maximum possible information for replicating published results at the time of publication and are explicit about the conditions of availability for any necessary materials that are not provided. The author responds to several objections that might be raised to increasing the transparency of quantitative sociology in this way and offers a candidate replication policy for sociology.},
  file = {/home/michaelb/Zotero/storage/2HDDFW84/Freese - 2007 - Replication Standards for Quantitative Social Science Why Not Sociology.pdf}
}

@manual{frickRsampleGeneralResampling2025,
  type = {Manual},
  title = {{{rsample}}: {{General}} resampling infrastructure},
  author = {Frick, Hannah and Chow, Fanny and Kuhn, Max and Mahoney, Michael and Silge, Julia and Wickham, Hadley},
  year = {2025},
  url = {https://CRAN.R-project.org/package=rsample}
}

@article{friedmanRegularizationPathsGeneralized2010,
  author = {Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert},
  title = {Regularization Paths for Generalized Linear Models via Coordinate Descent},
  journal = {Journal of Statistical Software},
  year = {2010},
  volume = {33},
  number = {1},
  pages = {1--22},
  doi = {10.18637/jss.v033.i01}
}

@article{fuentesOpenScienceFriend2023,
  title = {Open Science: {{Friend}}, Foe, or Both to an Antiracist Psychology?},
  shorttitle = {Open Science},
  author = {Fuentes, Milton A. and Zelaya, David G. and {Delgado-Romero}, Edward A. and Butt, Mamona},
  year = {2023},
  month = oct,
  journal = {Psychological Review},
  volume = {130},
  number = {5},
  pages = {1351--1359},
  issn = {1939-1471},
  doi = {10.1037/rev0000386},
  abstract = {The open science framework has garnered increased visibility and has been partially implemented in recent years. Open science underscores the importance of transparency and reproducibility to conduct rigorous science. Recently, several journals published by the American Psychological Association have begun adopting the open science framework. At the same time, the field of psychology has been reckoning with the current sociopolitical climate regarding anti-Blackness and White supremacy. As psychology begins to adopt the open science framework into its journals, the authors underscore the importance of embracing and aligning open science with frameworks and theories that have the potential to move the field toward antiracism and away from the embedded White supremacy value systems and ideals. The present article provides an overview of the open science framework; an examination of White supremacy ideology in research and publishing; guidance on how to move away from these pernicious values; and a proposal on alternate value systems to center equity, diversity, and inclusion with the aim of establishing an antiracist open science framework.},
  langid = {english},
  pmid = {35834186},
  keywords = {Humans,Psychology,Racism}
}

@article{gagolewskiStringiFastPortable2022,
  author = {Gagolewski, Marek},
  title = {{{stringi}}: {{Fast}} and Portable Character String Processing in {{R}}},
  journal = {Journal of Statistical Software},
  year = {2022},
  volume = {103},
  number = {2},
  pages = {1--59},
  doi = {10.18637/jss.v103.i02}
}

@manual{gamerIrrVariousCoefficients2019,
  type = {Manual},
  title = {{{irr}}: {{Various}} coefficients of interrater reliability and agreement},
  author = {Gamer, Matthias and Lemon, Jim and Fellows, Ian and Singh, Puspendra},
  year = {2019},
  url = {https://CRAN.R-project.org/package=irr}
}

@article{gelmanInductionDeductionBaysian2011,
  title = {Induction and {{Deduction}} in {{Bayesian Data Analysis}}},
  author = {Gelman, Andrew},
  year = {2011},
  journal = {Rationality, Markets and Morals},
  volume = {2},
  pages = {67--78},
  urldate = {2024-12-13},
  abstract = {The classical or frequentist approach to statistics (in which inference is centered on significance testing), is associated with a philosophy in which science is deductive and follows Popper's doctrine of falsification. In contrast, Bayesian inference is commonly associated with inductive reasoning and the idea that a model can be dethroned by a competing model but can never be directly falsified by a significance test. The purpose of this article is to break these associations, which I think are incorrect and have been detrimental to statistical practice, in that they have steered falsificationists away from the very useful tools of Bayesian inference and have discouraged Bayesians from checking the fit of their models. From my experience using and developing Bayesian methods in social and environmental science, I have found model checking and falsification to be central in the modeling process.},
  file = {/home/michaelb/Zotero/storage/SRF9DCGD/Gelman - 2011 - Induction and Deduction in Baysian Data Analysis.pdf}
}

@article{gemanNeuralNetworksBias1992,
  author = {Geman, Stuart and Bienenstock, Elie and Doursat, Ren{\'e}},
  title = {Neural {{Networks}} and the {{Bias}}/{{Variance Dilemma}}},
  journal = {Neural Computation},
  year = {1992},
  month = jan,
  volume = {4},
  number = {1},
  pages = {1--58},
  issn = {0899-7667},
  doi = {10.1162/neco.1992.4.1.1},
  urldate = {2025-07-28},
  abstract = {Feedforward neural networks trained by error backpropagation are examples of nonparametric regression estimators. We present a tutorial on nonparametric inference and its relation to neural networks, and we use the statistical viewpoint to highlight strengths and weaknesses of neural models. We illustrate the main points with some recognition experiments involving artificial data as well as handwritten numerals. In way of conclusion, we suggest that current-generation feedforward neural networks are largely inadequate for difficult problems in machine perception and machine learning, regardless of parallel-versus-serial hardware or other implementation issues. Furthermore, we suggest that the fundamental challenges in neural modeling are about representation rather than learning per se. This last point is supported by additional experiments with handwritten numerals.}
}

@article{gerasimovComparisonDatasetsCitation2024,
  author = {Gerasimov, Irina and KC, Binita and Mehrabian, Armin and Acker, James and McGuire, Michael P.},
  title = {Comparison of Datasets Citation Coverage in {{Google Scholar}}, {{Web}} of {{Science}}, {{Scopus}}, {{Crossref}}, and {{DataCite}}},
  journal = {Scientometrics},
  year = {2024},
  month = jul,
  volume = {129},
  number = {7},
  pages = {3681--3704},
  issn = {1588-2861},
  doi = {10.1007/s11192-024-05073-5},
  urldate = {2025-07-15},
  langid = {english},
  keywords = {Bibliographic databases,Biological Databases,COCI,Crossref,Data publication and archiving,Data-driven Science Modeling and Theory Building,DataCite,Dataset citation,Digital Humanities,DOI,EOSDIS,Google Scholar,Open science,Research Skills,Scientific impact,Scopus,Survey Methodology,Web of Science},
  abstract = {The rapid increase of Earth science data from remote sensing, models, and ground-based observations highlights an urgent need for effective data management practices. Data repositories track provenance and usage metrics which are crucial for ensuring data integrity and scientific reproducibility. Although the introduction of Digital Object Identifiers (DOIs) for datasets in the late 1990s has significantly aided in crediting creators and enhancing dataset discoverability (akin to traditional research citations), considerable challenges persist in establishing linkage of datasets used with scholarly documents. This study evaluates the citation coverage of datasets from NASA's Earth Observing System Data and Information System (EOSDIS) across several major bibliographic sources - namely Google Scholar (GS), Web of Science (WoS), Scopus, Crossref, and DataCite---which helps data managers in making informed decisions when selecting bibliographic sources. We provide a robust and comprehensive understanding of the citation landscape, crucial for advancing data management practices and advancing open science. Our study searched and analyzed temporal trends across the bibliographic sources for publications that cite approximately 11,000 DOIs associated with EOSDIS datasets, yielding 17,000 unique journal and conference articles, reports, and book records linked to 3,000 dataset DOIs. GS emerged as the most comprehensive source while Crossref lagged significantly behind the other major sources. Crossref's record references revealed that the absence of dataset DOIs and shortcomings in the Crossref Event data interface likely contributed to its underperformance. Scopus initially outperformed WoS until 2020, after which WoS began to show superior performance. Overall, our study underscores the necessity of utilizing multiple bibliographic sources for citation analysis, particularly for exploring dataset-to-document connections.},
  file = {/home/michaelb/Zotero/storage/NZLGRNNG/Gerasimov et al. - 2024 - Comparison of datasets citation coverage in Google Scholar, Web of Science, Scopus, Crossref, and Da.pdf}
}

@article{gerberPublicationBiasEmpirical2008,
  author = {Gerber, Alan S. and Malhotra, Neil},
  title = {Publication {{Bias}} in {{Empirical Sociological Research}}: {{Do Arbitrary Significance Levels Distort Published Results}}?},
  shorttitle = {Publication {{Bias}} in {{Empirical Sociological Research}}},
  journal = {Sociological Methods \& Research},
  year = {2008},
  month = aug,
  volume = {37},
  number = {1},
  pages = {3--30},
  publisher = {SAGE Publications Inc},
  issn = {0049-1241},
  doi = {10.1177/0049124108318973},
  urldate = {2024-12-15},
  langid = {english},
  abstract = {Despite great attention to the quality of research methods in individual studies, if publication decisions of journals are a function of the statistical significance of research findings, the published literature as a whole may not produce accurate measures of true effects. This article examines the two most prominent sociology journals (the American Sociological Review and the American Journal of Sociology) and another important though less influential journal (The Sociological Quarterly) for evidence of publication bias. The effect of the .05 significance level on the pattern of published findings is examined using a ``caliper'' test, and the hypothesis of no publication bias can be rejected at approximately the 1 in 10 million level. Findings suggest that some of the results reported in leading sociology journals may be misleading and inaccurate due to publication bias. Some reasons for publication bias and proposed reforms to reduce its impact on research are also discussed.},
  file = {/home/michaelb/Zotero/storage/S6Y6KRTC/Gerber and Malhotra - 2008 - Publication Bias in Empirical Sociological Research Do Arbitrary Significance Levels Distort Publis.pdf}
}

@inproceedings{gilpinExplainingExplanationsOverview2018,
  title = {Explaining Explanations: {{An}} Overview of Interpretability of Machine Learning},
  shorttitle = {Explaining Explanations},
  booktitle = {2018 {{IEEE}} 5th {{International Conference}} on {{Data Science}} and {{Advanced Analytics}} ({{DSAA}})},
  author = {Gilpin, Leilani H. and Bau, David and Yuan, Ben Z. and Bajwa, Ayesha and Specter, Michael and Kagal, Lalana},
  year = {2018},
  pages = {80--89},
  publisher = {{Institute of Electrical and Electronics Engineers Inc.}},
  doi = {10.1109/DSAA.2018.00018},
  abstract = {There has recently been a surge of work in explanatory artificial intelligence (XAI). This research area tackles the important problem that complex machines and algorithms often cannot provide insights into their behavior and thought processes. XAI allows users and parts of the internal system to be more transparent, providing explanations of their decisions in some level of detail. These explanations are important to ensure algorithmic fairness, identify potential bias/problems in the training data, and to ensure that the algorithms perform as expected. However, explanations produced by these systems is neither standardized nor systematically assessed. In an effort to create best practices and identify open challenges, we describe foundational concepts of explainability and show how they can be used to classify existing literature. We discuss why current approaches to explanatory methods especially for deep neural networks are insufficient. Finally, based on our survey, we conclude with suggested future research directions for explanatory artificial intelligence.},
  isbn = {978-1-5386-5090-5},
  langid = {english},
  keywords = {Deep learning and deep analytics,Fairness and transparency in data science,Machine learning theories,Models and systems},
  file = {/home/michaelb/Zotero/storage/S6R4WTID/Gilpin et al. - 2018 - Explaining explanations An overview of interpretability of machine learning.pdf;/home/michaelb/Zotero/storage/I2QWS7N2/85062824495.html}
}

@article{goodmanTenSimpleRules2014,
  title = {Ten {{Simple Rules}} for the {{Care}} and {{Feeding}} of {{Scientific Data}}},
  author = {Goodman, Alyssa and Pepe, Alberto and Blocker, Alexander W. and Borgman, Christine L. and Cranmer, Kyle and Crosas, Merce and Di Stefano, Rosanne and Gil, Yolanda and Groth, Paul and Hedstrom, Margaret and Hogg, David W. and Kashyap, Vinay and Mahabal, Ashish and Siemiginowska, Aneta and Slavkovic, Aleksandra},
  year = {2014},
  month = apr,
  journal = {PLOS Computational Biology},
  volume = {10},
  number = {4},
  pages = {e1003542},
  publisher = {Public Library of Science},
  issn = {1553-7358},
  doi = {10.1371/journal.pcbi.1003542},
  urldate = {2024-12-13},
  langid = {english},
  keywords = {Archives,Computer software,Data management,Data visualization,Metadata,Open source software,Scientists,Software tools},
  file = {/home/michaelb/Zotero/storage/EKZW5LUC/Goodman et al. - 2014 - Ten Simple Rules for the Care and Feeding of Scientific Data.pdf}
}

@article{goodwinWhatWeKnow2017,
  author = {Goodwin, Laura and Maru, Vivek},
  title = {What {{Do We Know}} about {{Legal Empowerment}}? {{Mapping}} the {{Evidence}}},
  shorttitle = {What {{Do We Know}} about {{Legal Empowerment}}?},
  journal = {Hague Journal on the Rule of Law},
  year = {2017},
  month = apr,
  volume = {9},
  number = {1},
  pages = {157--194},
  issn = {1876-4053},
  doi = {10.1007/s40803-016-0047-5},
  urldate = {2025-07-24},
  langid = {english},
  keywords = {Evidence,Human Rights,Justice,Legal Empowerment,Positive Psychology,Public Law,Social Work,Social Work Research,Socio-Legal Studies,Third Sector Research},
  abstract = {World governments have embraced `legal empowerment' as an end in itself and as an essential element in the fight against poverty. Civil society groups work to advance legal empowerment around the globe but, to date, there is not a comprehensive understanding of the impact of those efforts. This article offers the first review and mapping of existing evidence on legal empowerment. We identified and analyzed 199 studies in total. These studies span every major continent and address a wide range of legal empowerment interventions, such as legal literacy, community-based paralegals, and use of right to information laws. The breadth and richness of this body of work suggest we should revisit previous perceptions that there is little evidence on what legal empowerment can achieve. Stronger agency---both people's willingness to act and actual action---as well as increased legal knowledge are the most common positive impacts reported in this collection of evidence. The evidence also suggests legal empowerment programs can lead to acquisition of legal remedies, effective conflict resolution, and even improvements in health and education outcomes. Nearly ninety studies find positive impacts of legal empowerment programs on institutions---changes in law, policy or practice at various levels of administration. After exploring distributions and trends in the evidence, the article concludes by identifying gaps and questions for further inquiry to guide future research and, ultimately, promote stronger, more evidence-based practice.},
  file = {/home/michaelb/Zotero/storage/YXKDEK63/Goodwin and Maru - 2017 - What Do We Know about Legal Empowerment Mapping the Evidence.pdf}
}

@article{goodwinWhatWeKnow2017a,
  author = {Goodwin, Laura and Maru, Vivek},
  title = {What {{Do We Know}} about {{Legal Empowerment}}? {{Mapping}} the {{Evidence}}},
  shorttitle = {What {{Do We Know}} about {{Legal Empowerment}}?},
  journal = {Hague Journal on the Rule of Law},
  year = {2017},
  month = apr,
  volume = {9},
  number = {1},
  pages = {157--194},
  issn = {1876-4053},
  doi = {10.1007/s40803-016-0047-5},
  urldate = {2025-07-25},
  langid = {english},
  keywords = {Evidence,Human Rights,Justice,Legal Empowerment,Positive Psychology,Public Law,Social Work,Social Work Research,Socio-Legal Studies,Third Sector Research},
  abstract = {World governments have embraced `legal empowerment' as an end in itself and as an essential element in the fight against poverty. Civil society groups work to advance legal empowerment around the globe but, to date, there is not a comprehensive understanding of the impact of those efforts. This article offers the first review and mapping of existing evidence on legal empowerment. We identified and analyzed 199 studies in total. These studies span every major continent and address a wide range of legal empowerment interventions, such as legal literacy, community-based paralegals, and use of right to information laws. The breadth and richness of this body of work suggest we should revisit previous perceptions that there is little evidence on what legal empowerment can achieve. Stronger agency---both people's willingness to act and actual action---as well as increased legal knowledge are the most common positive impacts reported in this collection of evidence. The evidence also suggests legal empowerment programs can lead to acquisition of legal remedies, effective conflict resolution, and even improvements in health and education outcomes. Nearly ninety studies find positive impacts of legal empowerment programs on institutions---changes in law, policy or practice at various levels of administration. After exploring distributions and trends in the evidence, the article concludes by identifying gaps and questions for further inquiry to guide future research and, ultimately, promote stronger, more evidence-based practice.},
  file = {/home/michaelb/Zotero/storage/M5B35QY3/Goodwin and Maru - 2017 - What Do We Know about Legal Empowerment Mapping the Evidence.pdf}
}

@article{greenspanOpenSciencePractices2024,
  title = {Open Science Practices in Criminology and Criminal Justice Journals},
  author = {Greenspan, Rachel Leigh and Baggett, Logan and Boutwell, Brian B.},
  year = {2024},
  month = sep,
  journal = {Journal of Experimental Criminology},
  issn = {1572-8315},
  doi = {10.1007/s11292-024-09640-x},
  urldate = {2024-11-06},
  abstract = {Calls for more transparent and replicable scientific practices have been increasing across scientific disciplines over the last decade, often referred to as the open science movement. Open science practices are arguably particularly important in fields like criminology and criminal justice where empirical findings aim to inform public policy and legal practice. Despite favorable views of these practices by criminal justice scholars, limited research has explored how often researchers actually use these open science practices.},
  langid = {english},
  keywords = {Criminology,Open code,Open data,Open materials,Open science,Pre-registration},
  file = {/home/michaelb/Zotero/storage/I2BVQP5G/Greenspan et al. - 2024 - Open science practices in criminology and criminal justice journals.pdf}
}

@book{grossmannHowSocialScience2021,
  author = {Grossmann, Matt},
  title = {How {{Social Science Got Better}}: {{Overcoming Bias}} with {{More Evidence}}, {{Diversity}}, and {{Self-Reflection}}},
  shorttitle = {How {{Social Science Got Better}}},
  publisher = {Oxford University Press},
  year = {2021},
  month = oct,
  isbn = {978-0-19-751897-7},
  doi = {10.1093/oso/9780197518977.001.0001},
  urldate = {2025-08-26},
  abstract = {Social science research is facing mounting criticism, as canonical studies fail to replicate, questionable research practices abound, and researcher social and political biases come under fire. Far from being in crisis, however, social science is undergoing an unparalleled renaissance of ever-broader and deeper understanding and application---made possible by close attention to criticism of our biases and open public engagement. Wars between scientists and their humanist critics, methodological disputes over statistical practice and qualitative research, and disciplinary battles over grand theories of human nature have all quietly died down as new generations of scholars have integrated the insights of multiple sides. Rather than deny that researcher biases affect results, scholars now closely analyze how our racial, gender, geographic, methodological, political, and ideological differences impact our research questions; how the incentives of academia influence our research practices; and how universal human desires to avoid uncomfortable truths and easily solve problems affect our conclusions. To be sure, misaligned incentive structures remain, but a messy, collective deliberation across the research community is boosting self-knowledge and improving practice. Ours is an unprecedented age of theoretical diversity, open and connected data, and public scholarship. How Social Science Got Better documents and explains recent transformations, crediting both internal and public critics for strengthening social science. Applying insights from the philosophy, history, and sociology of science and providing new data on trends in social science research and scholarly views, it demonstrates that social science has never been more relevant, rigorous, or self-reflective.}
}

@incollection{grossmannOpenScienceReform2021,
  title = {Open {{Science Reform}} and {{Social Science Progress}}},
  booktitle = {How {{Social Science Got Better}}: {{Overcoming Bias}} with {{More Evidence}}, {{Diversity}}, and {{Self-Reflection}}},
  author = {Grossmann, Matt},
  bookauthor = {Grossmann, Matt},
  year = {2021},
  month = oct,
  publisher = {Oxford University Press},
  doi = {10.1093/oso/9780197518977.003.0002},
  urldate = {2025-08-26},
  abstract = {Social science reform focused on research documenting problems of replication and proposed open science practices. The associated debates have drawn attention to the many biases involved in research and to the misaligned professional incentives that perpetuate them. The reform efforts have made considerable progress quickly, in self-understanding and even in changing research practices. Where it has gone too far in emphasizing experimental methodologies for testing of causal hypotheses, reformers and critics alike have promoted procedures that reflect social science diversity and acknowledge the importance of self-conscious exploratory work. In the process, several social science revolutions have made shared progress more likely: middle-range empiricism has risen over grand theory; open and big data has stimulated new work while enabling cross-checking; new causal identification strategies have enabled observational work to speak to experimental concerns; and the rise of team science has forced us to reconcile theoretical perspectives and build on individual strengths.},
  isbn = {978-0-19-751897-7},
  file = {/home/michaelb/Zotero/storage/XJRPC2EQ/Grossmann - 2021 - Open Science Reform and Social Science Progress.pdf}
}

% Chapter of "How Social Science Got Better". The exported pages = {0} was a
% placeholder rather than a real page range, so the field has been removed.
@inbook{grossmannReasonsCautiousOptimism2021,
  title = {Reasons for {{Cautious Optimism}}},
  booktitle = {How {{Social Science Got Better}}: {{Overcoming Bias}} with {{More Evidence}}, {{Diversity}}, and {{Self-Reflection}}},
  author = {Grossmann, Matt},
  year = {2021},
  month = oct,
  publisher = {Oxford University Press},
  doi = {10.1093/oso/9780197518977.003.0011},
  urldate = {2025-08-26},
  abstract = {The explosion of data collection and availability, the expansion of academia and the spread of ideas, and innovations in theory and method all suggest bright days ahead for social science. Addressing human collective challenges such as climate change, poverty, and public health depends on the advance of social science. I revisit the benefits of accounting for human bias in advancing these efforts and for the further understanding of ourselves. I embrace reforms, but as pieces of a pluralist landscape rather than strictures. Descriptive inferences of generalized patterns, causal inference, and qualitative explorations will all remain important to the advance of social knowledge.},
  collaborator = {Grossmann, Matt},
  isbn = {978-0-19-751897-7},
  file = {/home/michaelb/Zotero/storage/7RP3JGCG/Grossmann - 2021 - Reasons for Cautious Optimism.pdf}
}

% Chapter of "How Social Science Got Better". The exported pages = {0} was a
% placeholder rather than a real page range, so the field has been removed.
@incollection{grossmannSocialScienceBiases2021,
  title = {Social {{Science Biases}} and {{Collective Knowledge}}},
  booktitle = {How {{Social Science Got Better}}: {{Overcoming Bias}} with {{More Evidence}}, {{Diversity}}, and {{Self-Reflection}}},
  author = {Grossmann, Matt},
  editor = {Grossmann, Matt},
  year = {2021},
  month = oct,
  publisher = {Oxford University Press},
  doi = {10.1093/oso/9780197518977.003.0001},
  urldate = {2025-08-26},
  abstract = {Understanding, investigating, and adapting to the biases inherent in social science research is the best path toward accumulating and advancing social science knowledge. Social science faces many categories of bias, from those stemming from unrepresentative researcher demographics to those based on research practices and incentives. Each has implications for research practices, but none makes social science impossible. Scholars face inherent challenges larger than those of natural scientists, with more disagreement on the most important biases to address and the kinds of research necessary to do so. But there are important advances in scholars' self-understanding that can serve as the basis for our future progress.},
  isbn = {978-0-19-751897-7},
  file = {/home/michaelb/Zotero/storage/SNFSZX4R/Grossmann - 2021 - Social Science Biases and Collective Knowledge.pdf}
}

% Software paper for the gtsummary R package. The package name is written in
% lower case and brace-protected so that no style re-capitalises it.
@article{gtsummary,
  title = {Reproducible Summary Tables with the {{gtsummary}} Package},
  author = {Sjoberg, Daniel D. and Whiting, Karissa and Curry, Michael and Lavery, Jessica A. and Larmarange, Joseph},
  year = {2021},
  journal = {The R Journal},
  volume = {13},
  number = {1},
  pages = {570--580},
  doi = {10.32614/RJ-2021-053}
}

% Conference paper in the ICCSA 2013 proceedings (Springer).
@inproceedings{guerraQualitativeQuantitativeAnalysis2013,
  author    = {Guerra, Eduardo and Fernandes, Clovis},
  title     = {A {{Qualitative}} and {{Quantitative Analysis}} on {{Metadata-Based Frameworks Usage}}},
  editor    = {Murgante, Beniamino and Misra, Sanjay and Carlini, Maurizio and Torre, Carmelo M. and Nguyen, Hong-Quang and Taniar, David and Apduhan, Bernady O. and Gervasi, Osvaldo},
  booktitle = {Computational {{Science}} and {{Its Applications}} -- {{ICCSA}} 2013},
  publisher = {Springer},
  address   = {Berlin, Heidelberg},
  year      = {2013},
  pages     = {375--390},
  doi       = {10.1007/978-3-642-39643-4_28},
  isbn      = {978-3-642-39643-4},
  langid    = {english},
  abstract  = {The usage of metadata-based frameworks is becoming popular for some kinds of software, such as web and enterprise applications. They use domain-specific metadata, usually defined as annotations or in XML documents, to adapt its behavior to each application class. Despite of their increasingly usage, there are not a study that evaluated the consequences of their usage to the application. The present work presents the result of an experiment that aimed to compare the development of similar applications created: (a) without frameworks; (b) with a traditional framework; (c) with a metadata-based framework. As a result, it uses metrics and a qualitative evaluation to assess the benefits and drawbacks in the use of this kind of framework.}
}

% JMLR article. The literal number = {null} exported by the source database
% has been removed, and the journal name is stored in full so that styles can
% abbreviate it if they need to.
@article{guyonIntroductionVariableFeature2003,
  title = {An Introduction to Variable and Feature Selection},
  author = {Guyon, Isabelle and Elisseeff, Andr{\'e}},
  year = {2003},
  month = mar,
  journal = {Journal of Machine Learning Research},
  volume = {3},
  pages = {1157--1182},
  issn = {1532-4435},
  abstract = {Variable and feature selection have become the focus of much research in areas of application for which datasets with tens or hundreds of thousands of variables are available. These areas include text processing of internet documents, gene expression array analysis, and combinatorial chemistry. The objective of variable selection is three-fold: improving the prediction performance of the predictors, providing faster and more cost-effective predictors, and providing a better understanding of the underlying process that generated the data. The contributions of this special issue cover a wide range of aspects of such problems: providing a better definition of the objective function, feature construction, feature ranking, multivariate feature selection, efficient search methods, and feature validity assessment methods.},
  file = {/home/michaelb/Zotero/storage/3GCBDL2W/Guyon and Elisseeff - 2003 - An introduction to variable and feature selection.pdf}
}

% Journal article in Royal Society Open Science; pages holds the article number.
@article{hardwickeEmpiricalAssessmentTransparency2020,
  author   = {Hardwicke, Tom E. and Wallach, Joshua D. and Kidwell, Mallory C. and Bendixen, Theiss and Cr{\"u}well, Sophia and Ioannidis, John P. A.},
  title    = {An Empirical Assessment of Transparency and Reproducibility-Related Research Practices in the Social Sciences (2014--2017)},
  journal  = {Royal Society Open Science},
  year     = {2020},
  month    = feb,
  volume   = {7},
  number   = {2},
  pages    = {190806},
  issn     = {2054-5703},
  doi      = {10.1098/rsos.190806},
  urldate  = {2025-08-03},
  langid   = {english},
  abstract = {Serious concerns about research quality have catalysed a number of reform initiatives intended to improve transparency and reproducibility and thus facilitate self-correction, increase efficiency and enhance research credibility. Meta-research has evaluated the merits of some individual initiatives; however, this may not capture broader trends reflecting the cumulative contribution of these efforts. In this study, we manually examined a random sample of 250 articles in order to estimate the prevalence of a range of transparency and reproducibility-related indicators in the social sciences literature published between 2014 and 2017. Few articles indicated availability of materials (16/151, 11\% [95\% confidence interval, 7\% to 16\%]), protocols (0/156, 0\% [0\% to 1\%]), raw data (11/156, 7\% [2\% to 13\%]) or analysis scripts (2/156, 1\% [0\% to 3\%]), and no studies were pre-registered (0/156, 0\% [0\% to 1\%]). Some articles explicitly disclosed funding sources (or lack of; 74/236, 31\% [25\% to 37\%]) and some declared no conflicts of interest (36/236, 15\% [11\% to 20\%]). Replication studies were rare (2/156, 1\% [0\% to 3\%]). Few studies were included in evidence synthesis via systematic review (17/151, 11\% [7\% to 16\%]) or meta-analysis (2/151, 1\% [0\% to 3\%]). Less than half the articles were publicly available (101/250, 40\% [34\% to 47\%]). Minimal adoption of transparency and reproducibility-related research practices could be undermining the credibility and efficiency of social science research. The present study establishes a baseline that can be revisited in the future to assess progress.},
  file     = {/home/michaelb/Zotero/storage/JQ5WY7L9/Hardwicke et al. - 2020 - An empirical assessment of transparency and reproducibility-related research practices in the social.pdf}
}

% Journal article in Nature Human Behaviour.
@article{hardwickeReducingBiasIncreasing2023,
  author    = {Hardwicke, Tom E. and Wagenmakers, Eric-Jan},
  title     = {Reducing Bias, Increasing Transparency and Calibrating Confidence with Preregistration},
  journal   = {Nature Human Behaviour},
  year      = {2023},
  month     = jan,
  volume    = {7},
  number    = {1},
  pages     = {15--26},
  publisher = {Nature Publishing Group},
  issn      = {2397-3374},
  doi       = {10.1038/s41562-022-01497-2},
  urldate   = {2024-12-15},
  copyright = {2022 Springer Nature Limited},
  langid    = {english},
  keywords  = {Science,Scientific community,technology and society},
  abstract  = {Flexibility in the design, analysis and interpretation of scientific studies creates a multiplicity of possible research outcomes. Scientists are granted considerable latitude to selectively use and report the hypotheses, variables and analyses that create the most positive, coherent and attractive story while suppressing those that are negative or inconvenient. This creates a risk of bias that can lead to scientists fooling themselves and fooling others. Preregistration involves declaring a research plan (for example, hypotheses, design and statistical analyses) in a public registry before the research outcomes are known. Preregistration (1) reduces the risk of bias by encouraging outcome-independent decision-making and (2) increases transparency, enabling others to assess the risk of bias and calibrate their confidence in research outcomes. In this Perspective, we briefly review the historical evolution of preregistration in medicine, psychology and other domains, clarify its pragmatic functions, discuss relevant meta-research, and provide recommendations for scientists and journal editors.},
  file      = {/home/michaelb/Zotero/storage/W5FLB8LI/Hardwicke and Wagenmakers - 2023 - Reducing bias, increasing transparency and calibrating confidence with preregistration.pdf}
}

% Whole-book entry. The third author is spelled "Friedman, Jerome", as in the
% chapter entry hastieHighDimensionalProblems2009, so that both entries name
% the same person identically.
@book{hastieElementsStatisticalLearning2009,
  title = {The Elements of Statistical Learning: Data Mining, Inference, and Prediction},
  shorttitle = {The Elements of Statistical Learning},
  author = {Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome},
  year = {2009},
  series = {Springer Series in Statistics},
  edition = {2nd ed},
  publisher = {Springer},
  address = {New York, NY},
  isbn = {978-0-387-84857-0 978-0-387-84858-7},
  lccn = {Q325.5 .H39 2009},
  keywords = {Bioinformatics,Computational intelligence,Data mining,Forecasting,Inference,Machine learning,Methodology,Statistics}
}

% Chapter 18 of The Elements of Statistical Learning (per the DOI suffix).
% The "much greater than" sign had been lost from the title on import; it is
% restored as brace-protected inline math.
@incollection{hastieHighDimensionalProblems2009,
  title = {High-{{Dimensional Problems}}: {$p \gg N$}},
  shorttitle = {High-{{Dimensional Problems}}},
  booktitle = {The {{Elements}} of {{Statistical Learning}}: {{Data Mining}}, {{Inference}}, and {{Prediction}}},
  author = {Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome},
  editor = {Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome},
  year = {2009},
  pages = {649--698},
  publisher = {Springer},
  address = {New York, NY},
  doi = {10.1007/978-0-387-84858-7_18},
  urldate = {2025-07-28},
  isbn = {978-0-387-84858-7},
  langid = {english}
}

% arXiv preprint. primaryclass now carries the full subject class (cs.DL,
% Digital Libraries, as stated in the keywords field) instead of only the
% top-level archive.
@misc{hausteinWhenArticleActually2015,
  title = {When Is an Article Actually Published? {{An}} Analysis of Online Availability, Publication, and Indexation Dates},
  shorttitle = {When Is an Article Actually Published?},
  author = {Haustein, Stefanie and Bowman, Timothy D. and Costas, Rodrigo},
  year = {2015},
  month = may,
  number = {arXiv:1505.00796},
  eprint = {1505.00796},
  primaryclass = {cs.DL},
  publisher = {arXiv},
  doi = {10.48550/arXiv.1505.00796},
  urldate = {2025-07-15},
  abstract = {With the acceleration of scholarly communication in the digital era, the publication year is no longer a sufficient level of time aggregation for bibliometric and social media indicators. Papers are increasingly cited before they have been officially published in a journal issue and mentioned on Twitter within days of online availability. In order to find a suitable proxy for the day of online publication allowing for the computation of more accurate benchmarks and fine-grained citation and social media event windows, various dates are compared for a set of 58,896 papers published by Nature Publishing Group, PLOS, Springer and Wiley-Blackwell in 2012. Dates include the online date provided by the publishers, the month of the journal issue, the Web of Science indexing date, the date of the first tweet mentioning the paper as well as the Altmetric.com publication and first-seen dates. Comparing these dates, the analysis reveals that large differences exist between publishers, leading to the conclusion that more transparency and standardization is needed in the reporting of publication dates. The date on which the fixed journal article (Version of Record) is first made available on the publisher's website is proposed as a consistent definition of the online date.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Digital Libraries},
  file = {/home/michaelb/Zotero/storage/FBCCWWB9/Haustein et al. - 2015 - When is an article actually published An analysis of online availability, publication, and indexati.pdf;/home/michaelb/Zotero/storage/M9Q8JDS2/1505.html}
}

% Journal article in Infancy.
@article{havronPreregistrationInfantResearch2020,
  author    = {Havron, Naomi and Bergmann, Christina and Tsuji, Sho},
  title     = {Preregistration in Infant Research---{{A}} Primer},
  journal   = {Infancy},
  year      = {2020},
  volume    = {25},
  number    = {5},
  pages     = {734--754},
  issn      = {1532-7078},
  doi       = {10.1111/infa.12353},
  urldate   = {2024-11-06},
  copyright = {{\copyright} 2020 International Congress of Infant Studies (ICIS)},
  langid    = {english},
  abstract  = {Preregistration, the act of specifying a research plan in advance, is becoming more common in scientific research. Infant researchers contend with unique problems that might make preregistration particularly challenging. Infants are a hard-to-reach population, usually yielding small sample sizes, they can only complete a limited number of trials, and they can be excluded based on hard-to-predict complications (e.g., parental interference, fussiness). In addition, as effects themselves potentially change with age and population, it is hard to calculate an a priori effect size. At the same time, these very factors make preregistration in infant studies a valuable tool. A priori examination of the planned study, including the hypotheses, sample size, and resulting statistical power, increases the credibility of single studies and adds value to the field. Preregistration might also improve explicit decision making to create better studies. We present an in-depth discussion of the issues uniquely relevant to infant researchers, and ways to contend with them in preregistration and study planning. We provide recommendations to researchers interested in following current best practices.},
  file      = {/home/michaelb/Zotero/storage/7MTAJ6I2/Havron et al. - 2020 - Preregistration in infant research—A primer.pdf;/home/michaelb/Zotero/storage/DF3KLSUF/infa.html}
}

% JOSS paper for the presize R package. The Markdown backticks around the
% package name (which LaTeX typesets as two opening quotes) are replaced by
% brace protection. Same work as haynesPresizeRpackagePrecisionbased2021a;
% both keys are kept so that existing citations keep resolving.
@article{haynesPresizeRpackagePrecisionbased2021,
  title = {{{presize}}: {{An R-package}} for Precision-Based Sample Size Calculation in Clinical Research},
  author = {Haynes, Alan G. and Lenz, Armando and Stalder, Odile and Limacher, Andreas},
  year = {2021},
  journal = {Journal of Open Source Software},
  volume = {6},
  number = {60},
  pages = {3118},
  doi = {10.21105/joss.03118}
}

% JOSS paper for the presize R package. The Markdown backticks around the
% package name (which LaTeX typesets as two opening quotes) are replaced by
% brace protection. Same work as haynesPresizeRpackagePrecisionbased2021;
% both keys are kept so that existing citations keep resolving.
@article{haynesPresizeRpackagePrecisionbased2021a,
  title = {{{presize}}: {{An R-package}} for Precision-Based Sample Size Calculation in Clinical Research},
  author = {Haynes, Alan G. and Lenz, Armando and Stalder, Odile and Limacher, Andreas},
  year = {2021},
  journal = {Journal of Open Source Software},
  volume = {6},
  number = {60},
  pages = {3118},
  doi = {10.21105/joss.03118}
}

% R package manual for rlang. The package name is lower-cased and protected,
% the straight quotes are replaced by LaTeX quotes, and the CRAN package DOI
% is added so the entry has a resolvable locator.
% NOTE(review): confirm the DOI resolves before relying on it.
@manual{henryRlangFunctionsBase2025,
  type = {Manual},
  title = {{{rlang}}: {{Functions}} for Base Types and Core {{R}} and `tidyverse' Features},
  author = {Henry, Lionel and Wickham, Hadley},
  year = {2025},
  doi = {10.32614/CRAN.package.rlang}
}

% German-language textbook (langid ngerman); titles stay fully braced so that
% their capitalisation is never altered.
@book{hilbertMachineLearningEinfuehrung2025,
  author     = {Hilbert, Sven and Kraus, Elisabeth and Lindl, Alfred},
  title      = {{Machine Learning: Eine Einf{\"u}hrung f{\"u}r Psychologie, Geistes- und Sozialwissenschaften}},
  shorttitle = {{Machine Learning}},
  series     = {{Quantitative Sozialforschung}},
  publisher  = {Springer Fachmedien},
  address    = {Wiesbaden},
  year       = {2025},
  doi        = {10.1007/978-3-658-43649-0},
  isbn       = {978-3-658-43648-3 978-3-658-43649-0},
  urldate    = {2025-08-01},
  copyright  = {https://www.springernature.com/gp/researchers/text-and-data-mining},
  langid     = {ngerman},
  keywords   = {Machine Learning,Methodenlehre,Quantitative Methoden,Sozialforschung,Statistik},
  file       = {/home/michaelb/Zotero/storage/U8BD86QY/Hilbert et al. - 2025 - Machine Learning Eine Einführung für Psychologie, Geistes- und Sozialwissenschaften.pdf}
}

% Chapter "Modelle" (pp. 53--113) of the German machine-learning textbook.
@incollection{hilbertModelle2025,
  author    = {Hilbert, Sven and Kraus, Elisabeth and Lindl, Alfred},
  title     = {{Modelle}},
  editor    = {Hilbert, Sven and Kraus, Elisabeth and Lindl, Alfred},
  booktitle = {{Machine Learning: Eine Einf{\"u}hrung f{\"u}r Psychologie, Geistes- und Sozialwissenschaften}},
  publisher = {Springer Fachmedien},
  address   = {Wiesbaden},
  year      = {2025},
  pages     = {53--113},
  doi       = {10.1007/978-3-658-43649-0_5},
  isbn      = {978-3-658-43649-0},
  urldate   = {2025-08-01},
  langid    = {ngerman},
  keywords  = {Artifizielle Neuronale Netzwerke,Boosting,Random forest,Regressionsmodelle,Support vector machine},
  abstract  = {Der Einsatz von ML umfasst eine Vielzahl von m{\"o}glichen Lernalgorithmen und Modellen, die f{\"u}r verschiedene Datensituationen unterschiedlich gut geeignet sind. Es gibt keine grunds{\"a}tzlich ,,besseren`` oder ,,schlechteren`` Modelle, sondern lediglich solche, die besser oder schlechter f{\"u}r eine bestimmte Aufgabe geeignet sind. Welches Modell am besten passt, h{\"a}ngt von zahlreichen Faktoren ab, insbesondere von der Struktur der Daten. In diesem Kapitel werden einige der wichtigsten Modelle vorgestellt. Die Auswahl ist keineswegs exhaustiv, sondern an den g{\"a}ngigsten Anwendungsfeldern orientiert und k{\"o}nnte nahezu beliebig erweitert werden. Sie soll als erster {\"U}berblick verschiedener Ans{\"a}tze dienen und einen Einstieg in die Vielfalt der ML-Modelle bieten. Behandelt werden in diesem Kapitel die regularisierten Regressionsmodelle Least Absolute Shrinkage and Selection Operation (LASSO) und Ridge, Random Forests, Boosting, Support Vector Machine sowie artifizielle Neuronale Netzwerke. F{\"u}r diese Modelle werden Grundidee, Modellsch{\"a}tzung, Optimierung, Hyperparameter-Tuning und die Interpretation von Parametern vorgestellt.},
  file      = {/home/michaelb/Zotero/storage/GVC9N8B2/Hilbert et al. - 2025 - Modelle.pdf}
}

% Chapter "Optimierung" (pp. 47--52) of the German machine-learning textbook.
@incollection{hilbertOptimierung2025,
  author    = {Hilbert, Sven and Kraus, Elisabeth and Lindl, Alfred},
  title     = {{Optimierung}},
  editor    = {Hilbert, Sven and Kraus, Elisabeth and Lindl, Alfred},
  booktitle = {{Machine Learning: Eine Einf{\"u}hrung f{\"u}r Psychologie, Geistes- und Sozialwissenschaften}},
  publisher = {Springer Fachmedien},
  address   = {Wiesbaden},
  year      = {2025},
  pages     = {47--52},
  doi       = {10.1007/978-3-658-43649-0_4},
  isbn      = {978-3-658-43649-0},
  urldate   = {2025-08-01},
  langid    = {ngerman},
  keywords  = {Bayesianische Optimierung,Grid search,Hyperparameter-tuning,Loss-Funktion,Modellparameter},
  abstract  = {Im Kontext des ML bezeichnet die Optimierung die iterative Anpassung der Modellparameter. Der Optimierungsprozess zielt darauf ab, die Vorhersagegenauigkeit eines Modells zu erh{\"o}hen, indem die Diskrepanz zwischen den Vorhersagen und den tats{\"a}chlichen Werten -- gemessen durch eine Loss-Funktion -- minimiert wird. Die Anpassung der Parameter erfolgt schrittweise anhand eines gew{\"a}hlten Optimierers, der Richtung und Schrittweite der {\"A}nderungen vorgibt. Zus{\"a}tzlich wird das Hyperparameter-Tuning als wichtiger Aspekt der Modelloptimierung beschrieben. Hyperparameter, die nicht aus den Daten gelernt, sondern durch Trial-and-Error festgelegt werden, beeinflussen den maschinellen Lernprozess ma{\ss}geblich. Methoden zur Hyperparametersuche umfassen Grid Search, Random Search und Bayesianische Optimierung. W{\"a}hrend Grid Search systematisch alle Kombinationen und Random Search zuf{\"a}llige Kombinationen testet, zielt die Bayesianische Optimierung darauf ab, aus fr{\"u}heren Iterationen zu lernen und die Suche zu beschleunigen. Hyperparameter-Tuning ist essenziell zur Optimierung der Modellleistung, allerdings ist es computational aufwendig und erfordert Erfahrung mit ML.},
  file      = {/home/michaelb/Zotero/storage/GLKJKKKW/Hilbert et al. - 2025 - Optimierung.pdf}
}

% Chapter "Preprocessing" (pp. 37--46) of the German machine-learning textbook.
@incollection{hilbertPreprocessing2025,
  author    = {Hilbert, Sven and Kraus, Elisabeth and Lindl, Alfred},
  title     = {{Preprocessing}},
  editor    = {Hilbert, Sven and Kraus, Elisabeth and Lindl, Alfred},
  booktitle = {{Machine Learning: Eine Einf{\"u}hrung f{\"u}r Psychologie, Geistes- und Sozialwissenschaften}},
  publisher = {Springer Fachmedien},
  address   = {Wiesbaden},
  year      = {2025},
  pages     = {37--46},
  doi       = {10.1007/978-3-658-43649-0_3},
  isbn      = {978-3-658-43649-0},
  urldate   = {2025-08-01},
  langid    = {ngerman},
  keywords  = {Analysepipelines,Datenaufbereitung,Feature engineering,Feature extraction,Preprocessing},
  abstract  = {Preprocessing, also die Vorverarbeitung von Rohdaten, ist ein zentraler Bestandteil von ML. Daten m{\"u}ssen so aufbereitet werden, dass Modelle effizient und pr{\"a}zise trainiert werden k{\"o}nnen. Variablen m{\"u}ssen h{\"a}ufig zusammengefasst, umkodiert oder transformiert werden, um f{\"u}r ML-Algorithmen n{\"u}tzlich zu sein. Beispielsweise erfordert die Analyse von Logdaten eine Umrechnung in nutzbare Features wie etwa die Berechnung der Zeitdauer zwischen Ereignissen. Auch psychometrische Daten wie Skalen aus Frageb{\"o}gen m{\"u}ssen aufbereitet werden, um interpretierbare Skalenwerte oder latente Faktoren zu erhalten. Ein wichtiger Schritt im Preprocessing ist das Feature Engineering, das die Erstellung und Transformation von Variablen durch mathematische Funktionen umfasst. Methoden zur Reduktion von Features wie die Hauptkomponentenanalyse oder die konfirmatorische Faktorenanalyse helfen, {\"u}berfl{\"u}ssige Informationen zu entfernen und die Modellleistung zu verbessern. Preprocessing wird h{\"a}ufig in Analysepipelines integriert, um verschiedene Schritte zu automatisieren und datengesteuert zu optimieren.},
  file      = {/home/michaelb/Zotero/storage/ADB3XZRX/Hilbert et al. - 2025 - Preprocessing.pdf}
}

% Journal article in LEUKOS.
@article{houserFairnessBiasPeer2022,
  author     = {Houser, Kevin W.},
  title      = {Fairness and {{Bias}} in {{Peer Review}}: {{Anonymity}}, {{Open Science}}, and {{Preprints}}},
  shorttitle = {Fairness and {{Bias}} in {{Peer Review}}},
  journal    = {LEUKOS},
  year       = {2022},
  month      = oct,
  volume     = {18},
  number     = {4},
  pages      = {415--416},
  publisher  = {Taylor \& Francis},
  issn       = {1550-2724},
  doi        = {10.1080/15502724.2022.2103367},
  urldate    = {2025-08-26},
  file       = {/home/michaelb/Zotero/storage/BKA8K723/Houser - 2022 - Fairness and Bias in Peer Review Anonymity, Open Science, and Preprints.pdf}
}

% Journal article in Educational Sciences: Theory & Practice.
% NOTE(review): volume, number and pages are absent from this record - confirm
% against the DOI landing page and add them.
@article{huangApplicationStatisticalInference2018,
  author    = {Huang, Zhaoxia},
  title     = {Application of {{Statistical Inference}} in {{Education}} and {{Teaching}}},
  journal   = {Educational Sciences: Theory \& Practice},
  year      = {2018},
  publisher = {Egitim Danismanligi ve Arastirmalari (EDAM)},
  issn      = {2148-7561},
  doi       = {10.12738/estp.2018.6.179},
  urldate   = {2025-07-16}
}

% IEEE Transactions on Information Theory article. The abstract had its math
% symbols fused into the neighbouring words on import; n, m and p_c are
% restored as inline math with the missing spaces.
@article{hughesMeanAccuracyStatistical1968,
  title = {On the Mean Accuracy of Statistical Pattern Recognizers},
  author = {Hughes, G.},
  year = {1968},
  month = jan,
  journal = {IEEE Transactions on Information Theory},
  volume = {14},
  number = {1},
  pages = {55--63},
  issn = {1557-9654},
  doi = {10.1109/TIT.1968.1054102},
  urldate = {2025-07-28},
  abstract = {The overall mean recognition probability (mean accuracy) of a pattern classifier is calculated and numerically plotted as a function of the pattern measurement complexity $n$ and design data set size $m$. Utilized is the well-known probabilistic model of a two-class, discrete-measurement pattern environment (no Gaussian or statistical independence assumptions are made). The minimum-error recognition rule (Bayes) is used, with the unknown pattern environment probabilities estimated from the data relative frequencies. In calculating the mean accuracy over all such environments, only three parameters remain in the final equation: $n$, $m$, and the prior probability $p_c$ of either of the pattern classes. With a fixed design pattern sample, recognition accuracy can first increase as the number of measurements made on a pattern increases, but decay with measurement complexity higher than some optimum value. Graphs of the mean accuracy exhibit both an optimal and a maximum acceptable value of $n$ for fixed $m$ and $p_c$. A four-place tabulation of the optimum $n$ and maximum mean accuracy values is given for equally likely classes and $m$ ranging from 2 to 1000. The penalty exacted for the generality of the analysis is the use of the mean accuracy itself as a recognizer optimality criterion. Namely, one necessarily always has some particular recognition problem at hand whose Bayes accuracy will be higher or lower than the mean over all recognition problems having fixed $n$, $m$, and $p_c$.},
  keywords = {Bayes methods,Complexity theory,Numerical models,Pattern recognition,Probability,Standards,Vectors},
  file = {/home/michaelb/Zotero/storage/7CDQDHXX/Hughes - 1968 - On the mean accuracy of statistical pattern recognizers.pdf;/home/michaelb/Zotero/storage/X2YWK2VN/1054102.html}
}

% R package manual for discrim. The package name is lower-cased and protected,
% and the CRAN package DOI is added as a locator.
% NOTE(review): confirm the DOI resolves before relying on it.
% Same work as hvitfeldtDiscrimModelWrappers2023a; both keys are kept so that
% existing citations keep resolving.
@manual{hvitfeldtDiscrimModelWrappers2023,
  type = {Manual},
  title = {{{discrim}}: {{Model}} Wrappers for Discriminant Analysis},
  author = {Hvitfeldt, Emil and Kuhn, Max},
  year = {2023},
  doi = {10.32614/CRAN.package.discrim}
}

% R package manual for discrim. The package name is lower-cased and protected,
% and the CRAN package DOI is added as a locator.
% NOTE(review): confirm the DOI resolves before relying on it.
% Same work as hvitfeldtDiscrimModelWrappers2023; both keys are kept so that
% existing citations keep resolving.
@manual{hvitfeldtDiscrimModelWrappers2023a,
  type = {Manual},
  title = {{{discrim}}: {{Model}} Wrappers for Discriminant Analysis},
  author = {Hvitfeldt, Emil and Kuhn, Max},
  year = {2023},
  doi = {10.32614/CRAN.package.discrim}
}

% R package manual for textrecipes. The package name is lower-cased and
% protected, the straight quotes are replaced by LaTeX quotes, and the CRAN
% package DOI is added as a locator.
% NOTE(review): confirm the DOI resolves before relying on it.
@manual{hvitfeldtTextrecipesExtraRecipes2025,
  type = {Manual},
  title = {{{textrecipes}}: {{Extra}} `recipes' for Text Processing},
  author = {Hvitfeldt, Emil},
  year = {2025},
  doi = {10.32614/CRAN.package.textrecipes}
}

% R package manual for themis. The package name is lower-cased and protected
% so that no style re-capitalises it.
@manual{hvitfeldtThemisExtraRecipes2025,
  type = {Manual},
  title = {{{themis}}: {{Extra}} Recipes Steps for Dealing with Unbalanced Data},
  author = {Hvitfeldt, Emil},
  year = {2025},
  doi = {10.32614/CRAN.package.themis}
}

% PLOS Medicine essay; pages holds the article identifier. The section
% heading "Summary" that had been fused onto the start of the abstract is
% removed.
@article{ioannidisWhyMostPublished2005,
  title = {Why {{Most Published Research Findings Are False}}},
  author = {Ioannidis, John P. A.},
  year = {2005},
  month = aug,
  journal = {PLOS Medicine},
  volume = {2},
  number = {8},
  pages = {e124},
  publisher = {Public Library of Science},
  issn = {1549-1676},
  doi = {10.1371/journal.pmed.0020124},
  urldate = {2025-08-23},
  abstract = {There is increasing concern that most current published research findings are false. The probability that a research claim is true may depend on study power and bias, the number of other studies on the same question, and, importantly, the ratio of true to no relationships among the relationships probed in each scientific field. In this framework, a research finding is less likely to be true when the studies conducted in a field are smaller; when effect sizes are smaller; when there is a greater number and lesser preselection of tested relationships; where there is greater flexibility in designs, definitions, outcomes, and analytical modes; when there is greater financial and other interest and prejudice; and when more teams are involved in a scientific field in chase of statistical significance. Simulations show that for most study designs and settings, it is more likely for a research claim to be false than true. Moreover, for many current scientific fields, claimed research findings may often be simply accurate measures of the prevailing bias. In this essay, I discuss the implications of these problems for the conduct and interpretation of research.},
  langid = {english},
  keywords = {Cancer risk factors,Finance,Genetic epidemiology,Genetics of disease,Metaanalysis,Randomized controlled trials,Research design,Schizophrenia},
  file = {/home/michaelb/Zotero/storage/AZSPC783/Ioannidis - 2005 - Why Most Published Research Findings Are False.pdf}
}

% Iris dataset as distributed by the UCI Machine Learning Repository. The
% dataset DOI from the repository's recommended citation is added as a
% persistent identifier.
% NOTE(review): confirm the DOI against the repository page.
@misc{iris_53,
  title = {Iris},
  author = {Fisher, R. A.},
  year = {1936},
  howpublished = {UCI Machine Learning Repository},
  doi = {10.24432/C56C76}
}

% ISO standard. The issuing body is recorded as a brace-protected corporate
% author instead of a journal field (invalid for this entry type), the year is
% taken from the standard's designation, and the address moves from
% howpublished to url.
@misc{ISO263242025,
  title = {{{ISO}} 26324:2025},
  shorttitle = {{{ISO}} 26324},
  author = {{International Organization for Standardization}},
  year = {2025},
  url = {https://www.iso.org/standard/88862.html},
  urldate = {2025-04-11},
  abstract = {Information and documentation --- Digital object identifier system},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/KJN87XUH/88862.html}
}

% IEEE Access review article. The keyword list contained case-variant
% duplicates (for example "autonomous vehicles" and "Autonomous vehicles");
% each term is now listed once.
@article{jainRecentDevelopmentsGame2024,
  title = {Recent {{Developments}} of {{Game Theory}} and {{Reinforcement Learning Approaches}}: {{A Systematic Review}}},
  shorttitle = {Recent {{Developments}} of {{Game Theory}} and {{Reinforcement Learning Approaches}}},
  author = {Jain, Garima and Kumar, Arun and Bhat, Shahid Ahmad},
  year = {2024},
  journal = {IEEE Access},
  volume = {12},
  pages = {9999--10011},
  issn = {2169-3536},
  doi = {10.1109/ACCESS.2024.3352749},
  urldate = {2025-07-22},
  abstract = {In the ever-changing world of decision-making, when game theory and reinforcement learning(RL) come together, they create a fascinating combination that shows a new way to solve complex problems in many fields. The combination of game theory and RL is a powerful convergence that opens up a hopeful new frontier for dealing with complex decision-making problems in many different fields. Research on the convergence of game theory and RL has shown to be beneficial, providing essential insights into challenging decision-making issues in various disciplines. This study investigates the recent developments of game theory and RL approaches through a systematic review and highlights the significance of game theory in boosting reinforcement algorithms and increasing the interaction of autonomous vehicles, safeguarding edge caching, and more. It offers a thorough account of the developments at the confluence of game theory and RL. The reviewed papers mainly focus on broad themes and address three important research questions: the impact of game theory on multi-agent reinforcement learning (MARL), the significant contributions of game theory to RL, and the significant impact areas. Following the methodology, search outcomes, and study areas is a discussion on game theory-related terminology, followed by study findings. The review's conclusions offer ideas for further study and open research questions. The importance of game theory in advancing MARL, the potential of game theory in promoting RL strategies, and the opportunities for combining game theory and RL in cutting-edge fields like mobile edge caching and cyber-physical systems(CPS) are all emphasized in the conclusion. This review article advances our knowledge of the theoretical underpinnings and real-world applications of game theory and RL, laying the groundwork for future improvements in decision-making techniques and algorithms.},
  keywords = {Autonomous vehicles,Behavioral sciences,Cache storage,cyber-physical systems,Decision making,edge caching,Game theory,multi-agent reinforcement learning,Multi-agent systems,Reinforcement learning,Table lookup,Terminology},
  file = {/home/michaelb/Zotero/storage/J46N8W3W/Jain et al. - 2024 - Recent Developments of Game Theory and Reinforcement Learning Approaches A Systematic Review.pdf}
}

@book{jamesIntroductionStatisticalLearning2021,
  author     = {James, Gareth and Witten, Daniela and Hastie, Trevor and Tibshirani, Robert},
  title      = {An Introduction to Statistical Learning: With Applications in {{R}}},
  shorttitle = {An Introduction to Statistical Learning},
  series     = {Springer Texts in Statistics},
  edition    = {2},
  year       = {2021},
  publisher  = {Springer},
  address    = {New York, NY},
  isbn       = {978-1-0716-1420-4 978-1-0716-1417-4},
  langid     = {english},
  file       = {/home/michaelb/Zotero/storage/K8RHS4AF/ISLRv2_corrected_June_2023.pdf;/home/michaelb/Zotero/storage/TNTP2LJ8/James et al. - 2021 - An introduction to statistical learning with applications in R.pdf},
}

@article{jandotInteractiveSemanticFeaturing2016,
  author   = {Jandot, Camille and Simard, Patrice Y. and Chickering, D. M. and Grangier, David and Suh, Jina},
  title    = {Interactive {{Semantic Featuring}} for {{Text Classification}}},
  journal  = {ArXiv},
  year     = {2016},
  month    = jun,
  urldate  = {2024-12-16},
  abstract = {In text classification, dictionaries can be used to define human-comprehensible features. We propose an improvement to dictionary features called smoothed dictionary features. These features recognize document contexts instead of n-grams. We describe a principled methodology to solicit dictionary features from a teacher, and present results showing that models built using these human-comprehensible features are competitive with models trained with Bag of Words features.},
  file     = {/home/michaelb/Zotero/storage/UVTS96I8/Jandot et al. - 2016 - Interactive Semantic Featuring for Text Classification.pdf},
}

@article{janssenBenefitsAdoptionBarriers2012,
  author    = {Janssen, Marijn and Charalabidis, Yannis and Zuiderwijk, Anneke},
  title     = {Benefits, {{Adoption Barriers}} and {{Myths}} of {{Open Data}} and {{Open Government}}},
  journal   = {Information Systems Management},
  year      = {2012},
  month     = sep,
  volume    = {29},
  number    = {4},
  pages     = {258--268},
  publisher = {Taylor \& Francis},
  issn      = {1058-0530},
  doi       = {10.1080/10580530.2012.716740},
  urldate   = {2025-07-16},
  keywords  = {adoption,diffusion,governance,institutional theory,open data,open government,systems theory,transformation},
  abstract  = {In this article, based on data collected through interviews and a workshop, the benefits and adoption barriers for open data have been derived. The results suggest that a conceptually simplistic view is often adopted with regard to open data, which automatically correlates the publicizing of data with use and benefits. Also, five ``myths'' concerning open data are presented, which place the expectations within a realistic perspective. Further, the recommendation is provided that such projects should take a user's view.},
  file      = {/home/michaelb/Zotero/storage/HE6MN5UY/Janssen et al. - 2012 - Benefits, Adoption Barriers and Myths of Open Data and Open Government.pdf},
}

@article{jarolimkovaDataSharingIntegral2023,
  author     = {Jarolimkova, Adela},
  title      = {Data Sharing: An Integral Part of Research Practice?},
  shorttitle = {Data Sharing},
  journal    = {Qualitative and Quantitative Methods in Libraries},
  year       = {2023},
  month      = dec,
  volume     = {12},
  number     = {4},
  pages      = {609--620},
  issn       = {2241-1925},
  urldate    = {2024-12-15},
  copyright  = {Copyright (c) 2023 Qualitative and Quantitative Methods in Libraries},
  langid     = {english},
  keywords   = {attitudes,barriers,data sharing,motivation},
  abstract   = {Sharing research data is now recognised as an integral part of scientific work and as a service to the public, contributing to the development of knowledge and the transparency of research. However, as many studies have shown, data sharing policies and practices vary widely across disciplines, countries; and funding bodies, and ultimately depend on the motivation and attitudes of individual researchers. The author focuses on researchers' attitudes to data sharing, drawing on an extensive literature review of data sharing studies. The author describes the factors that influence researchers' data sharing at an individual level, and the motivations and barriers that prevent effective access to data.},
  file       = {/home/michaelb/Zotero/storage/XYDAMD7M/Jarolimkova - 2023 - Data sharing an integral part of research practice.pdf},
}

@article{jensenThereReplicationCrisis2023,
  author    = {Jensen, Theis Ingerslev and Kelly, Bryan and Pedersen, Lasse Heje},
  title     = {Is {{There}} a {{Replication Crisis}} in {{Finance}}?},
  journal   = {The Journal of Finance},
  year      = {2023},
  volume    = {78},
  number    = {5},
  pages     = {2465--2518},
  issn      = {1540-6261},
  doi       = {10.1111/jofi.13249},
  urldate   = {2025-08-23},
  copyright = {{\copyright} 2023 The Authors. The Journal of Finance published by Wiley Periodicals LLC on behalf of American Finance Association.},
  langid    = {english},
  abstract  = {Several papers argue that financial economics faces a replication crisis because the majority of studies cannot be replicated or are the result of multiple testing of too many factors. We develop and estimate a Bayesian model of factor replication that leads to different conclusions. The majority of asset pricing factors (i) can be replicated; (ii) can be clustered into 13 themes, the majority of which are significant parts of the tangency portfolio; (iii) work out-of-sample in a new large data set covering 93 countries; and (iv) have evidence that is strengthened (not weakened) by the large number of observed factors.},
  file      = {/home/michaelb/Zotero/storage/8Z8LT8CB/Jensen et al. - 2023 - Is There a Replication Crisis in Finance.pdf},
}

@article{johnMeasuringPrevalenceQuestionable2012a,
  author   = {John, Leslie K. and Loewenstein, George and Prelec, Drazen},
  title    = {Measuring the {{Prevalence}} of {{Questionable Research Practices With Incentives}} for {{Truth Telling}}},
  journal  = {Psychological Science},
  year     = {2012},
  month    = may,
  volume   = {23},
  number   = {5},
  pages    = {524--532},
  issn     = {0956-7976, 1467-9280},
  doi      = {10.1177/0956797611430953},
  urldate  = {2025-08-26},
  langid   = {english},
  abstract = {Cases of clear scientific misconduct have received significant media attention recently, but less flagrantly questionable research practices may be more prevalent and, ultimately, more damaging to the academic enterprise. Using an anonymous elicitation format supplemented by incentives for honest reporting, we surveyed over 2,000 psychologists about their involvement in questionable research practices. The impact of truth-telling incentives on self-admissions of questionable research practices was positive, and this impact was greater for practices that respondents judged to be less defensible. Combining three different estimation methods, we found that the percentage of respondents who have engaged in questionable practices was surprisingly high. This finding suggests that some questionable practices may constitute the prevailing research norm.},
}

@article{johnsonLinearStatisticalInference1966,
  author    = {Johnson, N. L.},
  title     = {Linear {{Statistical Inference}} and {{Its Applications}}},
  journal   = {Technometrics},
  year      = {1966},
  month     = aug,
  volume    = {8},
  number    = {3},
  pages     = {551--553},
  publisher = {Informa UK Limited},
  issn      = {0040-1706, 1537-2723},
  doi       = {10.1080/00401706.1966.10490390},
  urldate   = {2025-07-16},
  langid    = {english},
  keywords  = {definition: statistical inference},
}

@article{johnsonPreregistrationSingleCaseDesign2019,
  author    = {Johnson, Austin H. and Cook, Bryan G.},
  title     = {Preregistration in {{Single-Case Design Research}}},
  journal   = {Exceptional Children},
  year      = {2019},
  month     = oct,
  volume    = {86},
  number    = {1},
  pages     = {95--112},
  publisher = {SAGE Publications Inc},
  issn      = {0014-4029},
  doi       = {10.1177/0014402919868529},
  urldate   = {2024-11-06},
  langid    = {english},
  abstract  = {To draw informed conclusions from research studies, research consumers need full and accurate descriptions of study methods and procedures. Preregistration has been proposed as a means to clarify reporting of research methods and procedures, with the goal of reducing bias in research. However, preregistration has been applied primarily to research studies utilizing group designs. In this article, we discuss general issues in preregistration and consider the use of preregistration in single-case design research, particularly as it relates to differing applications of this methodology. We then provide a rationale and make specific recommendations for preregistering single-case design research, including guidelines for preregistering basic descriptive information, research questions, participant characteristics, baseline conditions, independent and dependent variables, hypotheses, and phase-change decisions.},
  file      = {/home/michaelb/Zotero/storage/Z34LN54E/Johnson and Cook - 2019 - Preregistration in Single-Case Design Research.pdf},
}

@article{karatzoglouKernlabS4Package2004,
  author  = {Karatzoglou, Alexandros and Smola, Alex and Hornik, Kurt and Zeileis, Achim},
  title   = {{{kernlab}} -- {{An S4 Package}} for {{Kernel Methods}} in {{R}}},
  journal = {Journal of Statistical Software},
  year    = {2004},
  volume  = {11},
  number  = {9},
  pages   = {1--20},
  issn    = {1548-7660},
  doi     = {10.18637/jss.v011.i09},
  urldate = {2025-08-01},
  langid  = {english},
  file    = {/home/michaelb/Zotero/storage/QU7B8N2X/Karatzoglou et al. - 2004 - kernlab - An S4 Package for Kernel Methods in R.pdf},
}

@incollection{kayeFrequentistMethodsStatistical2020,
  author    = {Kaye, David H.},
  title     = {Frequentist {{Methods}} for {{Statistical Inference}}},
  booktitle = {Handbook of {{Forensic Statistics}}},
  year      = {2020},
  publisher = {{Chapman and Hall/CRC}},
  isbn      = {978-0-367-52770-9},
  abstract  = {This chapter describes commonly used concepts for assessing statistical error-confidence intervals, p-values, and hypothesis tests. It outlines the logic underlying resampling methods. The chapter identifies common misinterpretations of computed quantities, and discusses some of the comparative advantages and disadvantages of using confidence intervals, p-values, classical hypothesis tests, and likelihood ratios for various purposes in forensic science. Along with idealized, simple examples of probabilistic processes, the chapter uses two principal examples from forensic science to illustrate the classical methods. The first involves an experiment to ascertain the validity and false positive probability of identifications made by latent fingerprint examiners. The second involves measurements of the refractive index of glass fragments. Confidence intervals are useful for presenting an estimate of the parameter value and for conveying a sense of the statistical error that is involved in making that estimate from the sample data.},
}

@article{kempPerceptualGroupingExplains2022,
  author    = {Kemp, Charles and Hamacher, Duane W. and Little, Daniel R. and Cropper, Simon J.},
  title     = {Perceptual {{Grouping Explains Similarities}} in {{Constellations Across Cultures}}},
  journal   = {Psychological Science},
  year      = {2022},
  month     = mar,
  volume    = {33},
  number    = {3},
  pages     = {354--363},
  publisher = {SAGE Publications Inc},
  issn      = {0956-7976},
  doi       = {10.1177/09567976211044157},
  urldate   = {2025-07-26},
  langid    = {english},
  abstract  = {Cultures around the world organize stars into constellations, or asterisms, and these groupings are often considered to be arbitrary and culture specific. Yet there are striking similarities in asterisms across cultures, and groupings such as Orion, the Big Dipper, the Pleiades, and the Southern Cross are widely recognized across many different cultures. Psychologists have informally suggested that these shared patterns are explained by Gestalt laws of grouping, but there have been no systematic attempts to catalog asterisms that recur across cultures or to explain the perceptual basis of these groupings. Here, we compiled data from 27 cultures around the world and found that a simple computational model of perceptual grouping accounts for many of the recurring cross-cultural asterisms. Our results suggest that basic perceptual principles account for more of the structure of asterisms across cultures than previously acknowledged and highlight ways in which specific cultures depart from this shared baseline.},
  file      = {/home/michaelb/Zotero/storage/ZN64SM9I/Kemp et al. - 2022 - Perceptual Grouping Explains Similarities in Constellations Across Cultures.pdf},
}

@inproceedings{khalidSurveyFeatureSelection2014,
  author    = {Khalid, Samina and Khalil, Tehmina and Nasreen, Shamila},
  title     = {A Survey of Feature Selection and Feature Extraction Techniques in Machine Learning},
  booktitle = {2014 {{Science}} and {{Information Conference}}},
  year      = {2014},
  month     = aug,
  pages     = {372--378},
  doi       = {10.1109/SAI.2014.6918213},
  urldate   = {2025-07-25},
  keywords  = {Accuracy,Age Related Macula Degeneration (AMD),Algorithm design and analysis,Correlation,Correlation Based Method,Feature extraction,Feature Extraction/Transformation,Feature Selection,Feature Subset Selection,FSA's,ICA,Noise,PCA,Principal component analysis,Redundancy,RELIEF},
  abstract  = {Dimensionality reduction as a preprocessing step to machine learning is effective in removing irrelevant and redundant data, increasing learning accuracy, and improving result comprehensibility. However, the recent increase of dimensionality of data poses a severe challenge to many existing feature selection and feature extraction methods with respect to efficiency and effectiveness. In the field of machine learning and pattern recognition, dimensionality reduction is important area, where many approaches have been proposed. In this paper, some widely used feature selection and feature extraction techniques have analyzed with the purpose of how effectively these techniques can be used to achieve high performance of learning algorithms that ultimately improves predictive accuracy of classifier. An endeavor to analyze dimensionality reduction techniques briefly with the purpose to investigate strengths and weaknesses of some widely used dimensionality reduction methods is presented.},
  file      = {/home/michaelb/Zotero/storage/GAAT6ZNB/Khalid et al. - 2014 - A survey of feature selection and feature extraction techniques in machine learning.pdf},
}

@article{khanSocialOrganizationSexual2020,
  author    = {Khan, Shamus and Greene, Joss and Mellins, Claude Ann and Hirsch, Jennifer S.},
  title     = {The {{Social Organization}} of {{Sexual Assault}}},
  journal   = {Annual Review of Criminology},
  year      = {2020},
  month     = jan,
  volume    = {3},
  pages     = {139--163},
  publisher = {Annual Reviews Inc.},
  doi       = {10.1146/annurev-criminol-011518-024456},
  urldate   = {2025-07-25},
  langid    = {american},
  file      = {/home/michaelb/Zotero/storage/SWR38X8J/Khan et al. - 2020 - The Social Organization of Sexual Assault.pdf},
}

% NOTE(review): same work (identical DOI) as the entry keyed
% sang-woonResearchPaperClassification2019 earlier in this file; consider merging the two keys.
@article{kimResearchPaperClassification2019,
  author   = {Kim, Sang-Woon and Gil, Joon-Min},
  title    = {Research Paper Classification Systems Based on {{TF-IDF}} and {{LDA}} Schemes},
  journal  = {Human-centric Computing and Information Sciences},
  year     = {2019},
  month    = aug,
  volume   = {9},
  number   = {1},
  pages    = {30},
  issn     = {2192-1962},
  doi      = {10.1186/s13673-019-0192-7},
  urldate  = {2024-12-16},
  langid   = {english},
  keywords = {Artificial Intelligence,K-means clustering,LDA,Paper classification,TF-IDF},
  abstract = {With the increasing advance of computer and information technologies, numerous research papers have been published online as well as offline, and as new research fields have been continuingly created, users have a lot of trouble in finding and categorizing their interesting research papers. In order to overcome the limitations, this paper proposes a research paper classification system that can cluster research papers into the meaningful class in which papers are very likely to have similar subjects. The proposed system extracts representative keywords from the abstracts of each paper and topics by Latent Dirichlet allocation (LDA) scheme. Then, the K-means clustering algorithm is applied to classify the whole papers into research papers with similar subjects, based on the Term frequency-inverse document frequency (TF-IDF) values of each paper.},
  file     = {/home/michaelb/Zotero/storage/23YFBPYR/Kim and Gil - 2019 - Research paper classification systems based on TF-IDF and LDA schemes.pdf},
}

@inproceedings{kohaviStudyCrossvalidationBootstrap1995,
  author    = {Kohavi, Ron},
  title     = {A Study of Cross-Validation and {{Bootstrap}} for Accuracy Estimation and Model Selection},
  booktitle = {Proceedings of the {{International Joint Conference}} on {{Artificial Intelligence}} ({{IJCAI}})},
  year      = {1995},
  pages     = {1137--1143},
  publisher = {Morgan Kaufmann},
  keywords  = {imported},
  file      = {/home/michaelb/Zotero/storage/HVPRBMY9/Kohavi - 1995 - A study of cross-validation and Bootstrap for accuracy estimation and model selection.pdf},
}

@article{korbmacherReplicationCrisisHas2023,
  author    = {Korbmacher, Max and Azevedo, Flavio and Pennington, Charlotte R. and Hartmann, Helena and Pownall, Madeleine and Schmidt, Kathleen and Elsherif, Mahmoud and Breznau, Nate and Robertson, Olly and Kalandadze, Tamara and Yu, Shijun and Baker, Bradley J. and O'Mahony, Aoife and Olsnes, J{\o}rgen {\O}-S. and Shaw, John J. and Gjoneska, Biljana and Yamada, Yuki and R{\"o}er, Jan P. and Murphy, Jennifer and Alzahawi, Shilaan and Grinschgl, Sandra and Oliveira, Catia M. and Wingen, Tobias and Yeung, Siu Kit and Liu, Meng and K{\"o}nig, Laura M. and {Albayrak-Aydemir}, Nihan and Lecuona, Oscar and Micheli, Leticia and Evans, Thomas},
  title     = {The Replication Crisis Has Led to Positive Structural, Procedural, and Community Changes},
  journal   = {Communications Psychology},
  year      = {2023},
  month     = jul,
  volume    = {1},
  number    = {1},
  pages     = {3},
  publisher = {Nature Publishing Group},
  issn      = {2731-9121},
  doi       = {10.1038/s44271-023-00003-2},
  urldate   = {2025-08-23},
  copyright = {2023 The Author(s)},
  langid    = {english},
  keywords  = {Psychology,Publication characteristics,Scientific community},
  abstract  = {The emergence of large-scale replication projects yielding successful rates substantially lower than expected caused the behavioural, cognitive, and social sciences to experience a so-called `replication crisis'. In this Perspective, we reframe this `crisis' through the lens of a credibility revolution, focusing on positive structural, procedural and community-driven changes. Second, we outline a path to expand ongoing advances and improvements. The credibility revolution has been an impetus to several substantive changes which will have a positive, long-term impact on our research environment.},
  file      = {/home/michaelb/Zotero/storage/SG2KTZFV/Korbmacher et al. - 2023 - The replication crisis has led to positive structural, procedural, and community changes.pdf},
}

@article{krahmerCareShareExperimental2023,
  author     = {Kr{\"a}hmer, Daniel and Sch{\"a}chtele, Laura and Schneck, Andreas},
  title      = {Care to Share? {{Experimental}} Evidence on Code Sharing Behavior in the Social Sciences},
  shorttitle = {Care to Share?},
  journal    = {PLOS ONE},
  year       = {2023},
  month      = aug,
  volume     = {18},
  number     = {8},
  pages      = {e0289380},
  publisher  = {Public Library of Science},
  issn       = {1932-6203},
  doi        = {10.1371/journal.pone.0289380},
  urldate    = {2025-08-03},
  langid     = {english},
  keywords   = {Altruistic behavior,Computer software,Decision making,Experimental design,Open data,Research design,Social research,Social sciences},
  abstract   = {Transparency and peer control are cornerstones of good scientific practice and entail the replication and reproduction of findings. The feasibility of replications, however, hinges on the premise that original researchers make their data and research code publicly available. This applies in particular to large-N observational studies, where analysis code is complex and may involve several ambiguous analytical decisions. To investigate which specific factors influence researchers' code sharing behavior upon request, we emailed code requests to 1,206 authors who published research articles based on data from the European Social Survey between 2015 and 2020. In this preregistered multifactorial field experiment, we randomly varied three aspects of our code request's wording in a 2x4x2 factorial design: the overall framing of our request (enhancement of social science research, response to replication crisis), the appeal why researchers should share their code (FAIR principles, academic altruism, prospect of citation, no information), and the perceived effort associated with code sharing (no code cleaning required, no information). Overall, 37.5\% of successfully contacted authors supplied their analysis code. Of our experimental treatments, only framing affected researchers' code sharing behavior, though in the opposite direction we expected: Scientists who received the negative wording alluding to the replication crisis were more likely to share their research code. Taken together, our results highlight that the availability of research code will hardly be enhanced by small-scale individual interventions but instead requires large-scale institutional norms.},
  file       = {/home/michaelb/Zotero/storage/VKEIQPCQ/Krähmer et al. - 2023 - Care to share Experimental evidence on code sharing behavior in the social sciences.pdf},
}

@article{kraussDebunkingRevolutionaryParadigm2024,
  author     = {Krauss, Alexander},
  title      = {Debunking Revolutionary Paradigm Shifts: Evidence of Cumulative Scientific Progress across Science},
  shorttitle = {Debunking Revolutionary Paradigm Shifts},
  journal    = {Proceedings of the Royal Society A: Mathematical, Physical and Engineering Sciences},
  year       = {2024},
  month      = nov,
  volume     = {480},
  number     = {2302},
  pages      = {20240141},
  publisher  = {Royal Society},
  doi        = {10.1098/rspa.2024.0141},
  urldate    = {2024-12-13},
  keywords   = {discovery,paradigm change,paradigm shift,scientific discovery,scientific progress,structure of scientific revolutions},
  abstract   = {How can scientific progress be conceived best? Does science mainly undergo revolutionary paradigm shifts? Or is the evolution of science mainly cumulative? Understanding whether science advances through cumulative evolution or through paradigm shifts can influence how we approach scientific research, education and policy. The most influential and cited account of science was put forth in Thomas Kuhn's seminal book The structure of scientific revolutions. Kuhn argues that science does not advance cumulatively but goes through fundamental paradigm changes in the theories of a scientific field. There is no consensus yet on this core question of the nature and advancement of science that has since been debated across science. Examining over 750 major scientific discoveries (all Nobel Prize and major non-Nobel Prize discoveries), we systematically test this fundamental question about scientific progress here. We find that three key measures of scientific progress---major discoveries, methods and fields---each demonstrate that science evolves cumulatively. First, we show that no major scientific methods or instruments used across fields (such as statistical methods, X-ray methods or chromatography) have been completely abandoned, i.e. subject to paradigm shifts. Second, no major scientific fields (such as biomedicine, chemistry or computer science) have been completely abandoned. Rather, they have all continuously expanded over time, often over centuries, accumulating extensive bodies of knowledge. Third, scientific discoveries including theoretical discoveries are also predominately cumulative, with only 1\% of over 750 major discoveries having been abandoned. The continuity of science is most compellingly evidenced by our methods and instruments, which enable the creation of discoveries and fields. 
We thus offer here a new perspective and answer to this classic question in science and the philosophy and history of science by utilizing methods from statistics and empirical sciences.},
  file       = {/home/michaelb/Zotero/storage/DQLA2ER2/Krauss - 2024 - Debunking revolutionary paradigm shifts evidence of cumulative scientific progress across science.pdf},
}

@article{kuhbergerPublicationBiasPsychology2014,
  author     = {K{\"u}hberger, Anton and Fritz, Astrid and Scherndl, Thomas},
  title      = {Publication {{Bias}} in {{Psychology}}: {{A Diagnosis Based}} on the {{Correlation}} between {{Effect Size}} and {{Sample Size}}},
  shorttitle = {Publication {{Bias}} in {{Psychology}}},
  journal    = {PLOS ONE},
  year       = {2014},
  month      = sep,
  volume     = {9},
  number     = {9},
  pages      = {e105825},
  publisher  = {Public Library of Science},
  issn       = {1932-6203},
  doi        = {10.1371/journal.pone.0105825},
  urldate    = {2024-11-06},
  langid     = {english},
  keywords   = {Clinical psychology,Decision trees,Psychology,Publication ethics,Scientific publishing,Social psychology,Statistical data,Test statistics},
  abstract   = {Background The p value obtained from a significance test provides no information about the magnitude or importance of the underlying phenomenon. Therefore, additional reporting of effect size is often recommended. Effect sizes are theoretically independent from sample size. Yet this may not hold true empirically: non-independence could indicate publication bias. Methods We investigate whether effect size is independent from sample size in psychological research. We randomly sampled 1,000 psychological articles from all areas of psychological research. We extracted p values, effect sizes, and sample sizes of all empirical papers, and calculated the correlation between effect size and sample size, and investigated the distribution of p values. Results We found a negative correlation of r = -.45 [95\% CI: -.53; -.35] between effect size and sample size. In addition, we found an inordinately high number of p values just passing the boundary of significance. Additional data showed that neither implicit nor explicit power analysis could account for this pattern of findings. Conclusion The negative correlation between effect size and samples size, and the biased distribution of p values indicate pervasive publication bias in the entire field of psychology.},
  file       = {/home/michaelb/Zotero/storage/SHQZWBDE/Kühberger et al. - 2014 - Publication Bias in Psychology A Diagnosis Based on the Correlation between Effect Size and Sample.pdf},
}

@inbook{kuhn8FeatureEngineering2022,
  author     = {Kuhn, Max and Silge, Julia},
  title      = {Feature {{Engineering}} with {{recipes}}},
  chapter    = {8},
  bookauthor = {Kuhn, Max and Silge, Julia},
  booktitle  = {Tidy {{Modeling}} with {{R}}: {{A Framework}} for {{Modeling}} in the {{Tidyverse}}},
  year       = {2022},
  month      = aug,
  publisher  = {O'Reilly Media},
  address    = {Sebastopol},
  isbn       = {978-1-4920-9648-1},
  urldate    = {2025-07-23},
  langid     = {english},
}

@article{kuhnBuildingPredictiveModels2008,
  author  = {Kuhn, Max},
  title   = {Building Predictive Models in {{R}} Using the Caret Package},
  journal = {Journal of Statistical Software},
  year    = {2008},
  volume  = {28},
  number  = {5},
  pages   = {1--26},
  doi     = {10.18637/jss.v028.i05},
}

@manual{kuhnFinetuneAdditionalFunctions2025,
  author = {Kuhn, Max},
  title  = {Finetune: {{Additional}} Functions for Model Tuning},
  type   = {Manual},
  year   = {2025},
}

@manual{kuhnParsnipCommonAPI2025,
  author = {Kuhn, Max and Vaughan, Davis},
  title  = {Parsnip: A Common {{API}} to Modeling and Analysis Functions},
  type   = {Manual},
  year   = {2025},
}

@manual{kuhnRecipesPreprocessingFeature2025,
  author = {Kuhn, Max and Wickham, Hadley and Hvitfeldt, Emil},
  title  = {Recipes: {{Preprocessing}} and Feature Engineering Steps for Modeling},
  type   = {Manual},
  year   = {2025},
}

@incollection{kuhnReflectionsMyCritics1970,
  author    = {Kuhn, Thomas S.},
  title     = {Reflections on My {{Critics}}},
  editor    = {Lakatos, Imre and Musgrave, Alan},
  booktitle = {Criticism and the {{Growth}} of {{Knowledge}}: {{Proceedings}} of the {{International Colloquium}} in the {{Philosophy}} of {{Science}}, {{London}}, 1965},
  year      = {1970},
  volume    = {4},
  pages     = {231--278},
  publisher = {Cambridge University Press},
  address   = {Cambridge},
  isbn      = {978-0-521-09623-2},
  doi       = {10.1017/CBO9781139171434.011},
  urldate   = {2024-12-13},
  abstract  = {1. Introduction.2. Methodology: the role of history and sociology.3. Normal Science: its nature and functions.4. Normal Science: its retrieval from history.5. Irrationality and Theory-Choice.6. Incommensurability and Paradigms.INTRODUCTIONIt is now four years since Professor Watkins and I exchanged mutually impenetrable views at the International Colloquium in the Philosophy of Science held at Bedford College, London. Rereading our contributions together with those that have since accreted to them, I am tempted to posit the existence of two Thomas Kuhns. Kuhn is the author of this essay and of an earlier piece in this volume. He also published in 1962 a book called The Structure of Scientific Revolutions, the one which he and Miss Master-man discuss above. Kuhn is the author of another book with the same title. It is the one here cited repeatedly by Sir Karl Popper as well as by Professors Feyerabend, Lakatos, Toulmin, and Watkins. That both books bear the same title cannot be altogether accidental, for the views they present often overlap and are, in any case, expressed in the same words. But their central concerns are, I conclude, usually very different. As reported by his critics (his original has unfortunately been unavailable to me), Kuhn seems on occasion to make points that subvert essential aspects of the position outlined by his namesake.Lacking the wit to extend this introductory fantasy, I will instead explain why I have embarked upon it.},
  file      = {/home/michaelb/Zotero/storage/WM6P6A3L/Kuhn - 1970 - Reflections on my Critics.pdf;/home/michaelb/Zotero/storage/MDA7UI6R/7AC72C71EC97FCBB6AEFED1B78F0775B.html},
}

@book{kuhnStructureScientificRevolutions1962,
  author    = {Kuhn, Thomas S.},
  title     = {The Structure of Scientific Revolutions},
  year      = {1962},
  publisher = {University of Chicago Press},
  address   = {Chicago},
  abstract  = {This modern classic on the philosophy of science examines the nature of scientific progress. Progress is seen as accumulative only when certain values and goals are shared; when this set of values (a paradigm) breaks down, science is seen as entering a revolutionary phase.  Harvard Book List (edited) 1971 \#37 (PsycINFO Database Record (c) 2018 APA, all rights reserved)},
  file      = {/home/michaelb/Zotero/storage/G4SDNWXQ/1962-35001-000.html},
}

@book{kuhnStructureScientificRevolutions2012,
  author       = {Kuhn, Thomas S.},
  introduction = {Hacking, Ian},
  title        = {The Structure of Scientific Revolutions},
  edition      = {4},
  year         = {2012},
  publisher    = {University of Chicago Press},
  address      = {Chicago},
  isbn         = {978-0-226-45811-3 978-0-226-45812-0},
  langid       = {english},
  lccn         = {501},
}

@book{kuhnTidyModelingFramework2022,
  author     = {Kuhn, Max and Silge, Julia},
  title      = {Tidy {{Modeling}} with {{R}}: {{A Framework}} for {{Modeling}} in the {{Tidyverse}}},
  shorttitle = {Tidy {{Modeling}} with {{R}}},
  year       = {2022},
  month      = aug,
  publisher  = {O'Reilly Media},
  address    = {Sebastopol},
  isbn       = {978-1-4920-9648-1},
  urldate    = {2025-07-23},
  langid     = {english},
  abstract   = {Get going with tidymodels, a collection of R packages for modeling and machine learning. Whether you're just starting out or have years of experience with modeling, this practical introduction shows data analysts, business analysts, and data scientists how the tidymodels framework offers a consistent, flexible approach for your work.  RStudio engineers Max Kuhn and Julia Silge demonstrate ways to create models by focusing on an R dialect called the tidyverse. Software that adopts tidyverse principles shares both a high-level design philosophy and low-level grammar and data structures, so learning one piece of the ecosystem makes it easier to learn the next. You'll understand why the tidymodels framework has been built to be used by a broad range of people.  With this book, you will: Learn the steps necessary to build a model from beginning to end Understand how to use different modeling and feature engineering approaches fluently Examine the options for avoiding common pitfalls of modeling, such as overfitting Learn practical methods to prepare your data for modeling Tune models for optimal performance Use good statistical practices to compare, evaluate, and choose among models},
}

@manual{kuhnTidymodelsCollectionPackages2020,
  type = {Manual},
  title = {Tidymodels: A Collection of Packages for Modeling and Machine Learning Using Tidyverse Principles},
  author = {Kuhn, Max and Wickham, Hadley},
  year = {2020}
}

@manual{kuhnTuneTidyTuning2025,
  author = {Kuhn, Max},
  title  = {Tune: {{Tidy}} Tuning Tools},
  type   = {Manual},
  year   = {2025}
}

@manual{kuhnYardstickTidyCharacterizations2025,
  author = {Kuhn, Max and Vaughan, Davis and Hvitfeldt, Emil},
  title  = {Yardstick: {{Tidy}} Characterizations of Model Performance},
  type   = {Manual},
  year   = {2025}
}

@misc{kuiperHowCriminologyAffects2023,
  author        = {Kuiper, Malouke Esra and Reinders Folmer, Chris and Kooistra, Emmeke Barbara and Pogarsky, Greg and {van Rooij}, Benjamin},
  title         = {How {{Criminology Affects Punishment}}: {{Analyzing Conditions Under Which Scientific Information Affects Sanction Policy Decisions}}},
  shorttitle    = {How {{Criminology Affects Punishment}}},
  type          = {{{SSRN Scholarly Paper}}},
  number        = {4605853},
  year          = {2023},
  month         = oct,
  publisher     = {Social Science Research Network},
  address       = {Rochester, NY},
  doi           = {10.2139/ssrn.4605853},
  eprint        = {4605853},
  archiveprefix = {Social Science Research Network},
  urldate       = {2024-11-06},
  abstract      = {Criminology has a strong potential to impact criminal justice policy. It is thought that criminology fails to shape policy because of the political context of such policies. The present study analyses, however, whether criminological knowledge has the capacity to shape policy decision making in the absence of an explicit political context. We do so through a vignette study (N = 212) comparing how participants make criminal sanction policy decisions with or without reading criminological findings about the deterrent effect of longer sentences and whether this can be influenced by making harm to victims salient. The study finds that criminological science can impact policy decision making outside an explicit political context, also with salient harm to victims. Our findings show that when there is no explicit political context present, criminological evidence does affect policy making, even when there is a countervailing factor such as victim salience.  This shows that the science in of itself need not be the obstacle to better alignment with policy. The study offers a new research agenda to further generalize these results and to work towards a better incorporation of criminology in criminal justice policy.},
  langid        = {english},
  keywords      = {criminal justice policy,criminological knowledge,decision-making,deterrence,policy makers,punishment},
  file          = {/home/michaelb/Zotero/storage/YCU9T7S3/Kuiper et al. - 2023 - How Criminology Affects Punishment Analyzing Conditions Under Which Scientific Information Affects.pdf}
}

@article{lammeyUsingCrossrefMetadata2016,
  title = {Using the {{Crossref Metadata API}} to Explore Publisher Content},
  author = {Lammey, Rachael},
  year = {2016},
  month = aug,
  journal = {Science Editing},
  volume = {3},
  number = {2},
  pages = {109--111},
  publisher = {Korean Council of Science Editors},
  issn = {2288-8063, 2288-7474},
  doi = {10.6087/kcse.75},
  urldate = {2025-07-15},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/59NW9ISK/Lammey - 2016 - Using the Crossref Metadata API to explore publisher content.pdf}
}

@incollection{larsenProportionalAllocationStrata2008,
  title = {Proportional {{Allocation}} to {{Strata}}},
  booktitle = {Encyclopedia of {{Survey Research Methods}}},
  author = {Larsen, Michael D.},
  year = {2008},
  pages = {630},
  publisher = {Sage Publications, Inc.},
  doi = {10.4135/9781412963947},
  urldate = {2025-08-03},
  abstract = {To the uninformed, surveys appear to be an easy type of research to design and conduct, but when students and professionals delve deeper, they encounter the},
  isbn = {978-1-4129-6394-7},
  langid = {english}
}

@article{lattimoreReflectionsCriminalJustice2022,
  author     = {Lattimore, Pamela K.},
  title      = {Reflections on {{Criminal Justice Reform}}: {{Challenges}} and {{Opportunities}}},
  shorttitle = {Reflections on {{Criminal Justice Reform}}},
  journal    = {American Journal of Criminal Justice},
  year       = {2022},
  month      = dec,
  volume     = {47},
  number     = {6},
  pages      = {1071--1098},
  issn       = {1936-1351},
  doi        = {10.1007/s12103-022-09713-5},
  urldate    = {2025-07-26},
  abstract   = {Considerable efforts and resources have been expended to enact reforms to the criminal justice system over the last five decades. Concerns about dramatic increases in violent crime beginning in the late Sixties and accelerating into the 1980s led to the ``War on Drugs'' and the ``War on Crime'' that included implementation of more punitive policies and dramatic increases in incarceration and community supervision. More recent reform efforts have focused on strategies to reduce the negative impacts of policing, the disparate impacts of pretrial practices, and better strategies for reducing criminal behavior. Renewed interest in strategies and interventions to reduce criminal behavior has coincided with a focus on identifying ``what works.'' Recent increases in violence have shifted the national dialog from a focus on progressive reforms to reduce reliance on punitive measures and the disparate impact of the legal system on some groups to a focus on increased investment in ``tough on crime'' criminal justice approaches. This essay offers some reflections on the ``Waged Wars'' and the efforts to identify ``What Works'' based on nearly 40~years of work evaluating criminal justice reform efforts.},
  langid     = {english},
  keywords   = {Criminal Justice,Criminal Justice Reform,Criminology,Critical Criminology,Green Criminology,History of Criminology,Research Methods in Criminology,US Correctional,War on Crime,War on Drugs},
  file       = {/home/michaelb/Zotero/storage/WS6DAT66/Lattimore - 2022 - Reflections on Criminal Justice Reform Challenges and Opportunities.pdf}
}

@article{lawrenceFreeOnlineAvailability2001,
  title = {Free Online Availability Substantially Increases a Paper's Impact},
  author = {Lawrence, Steve},
  year = {2001},
  month = may,
  journal = {Nature},
  volume = {411},
  number = {6837},
  pages = {521},
  publisher = {Nature Publishing Group},
  issn = {1476-4687},
  doi = {10.1038/35079151},
  urldate = {2024-12-13},
  copyright = {2001 Springer Nature Limited},
  langid = {english},
  keywords = {Humanities and Social Sciences,multidisciplinary,Science},
  file = {/home/michaelb/Zotero/storage/YV4RXEEH/Lawrence - 2001 - Free online availability substantially increases a paper's impact.pdf}
}

@article{leggettLifeJustSignificant2013,
  title = {The Life of p: ``Just Significant'' Results Are on the Rise},
  shorttitle = {The Life of p},
  author = {Leggett, Nathan C. and Thomas, Nicole A. and Loetscher, Tobias and Nicholls, Michael E. R.},
  year = {2013},
  journal = {Quarterly Journal of Experimental Psychology},
  volume = {66},
  number = {12},
  pages = {2303--2309},
  issn = {1747-0226},
  doi = {10.1080/17470218.2013.863371},
  abstract = {Null hypothesis significance testing uses the seemingly arbitrary probability of .05 as a means of objectively determining whether a tested effect is reliable. Within recent psychological articles, research has found an overrepresentation of p values around this cut-off. The present study examined whether this overrepresentation is a product of recent pressure to publish or whether it has existed throughout psychological research. Articles published in 1965 and 2005 from two prominent psychology journals were examined. Like previous research, the frequency of p values at and just below .05 was greater than expected compared to p frequencies in other ranges. While this overrepresentation was found for values published in both 1965 and 2005, it was much greater in 2005. Additionally, p values close to but over .05 were more likely to be rounded down to, or incorrectly reported as, significant in 2005 than in 1965. Modern statistical software and an increased pressure to publish may explain this pattern. The problem may be alleviated by reduced reliance on p values and increased reporting of confidence intervals and effect sizes.},
  langid = {english},
  pmid = {24205936},
  keywords = {Databases Bibliographic,Humans,Periodicals as Topic,Psychology,Publication Bias,Statistics as Topic},
  file = {/home/michaelb/Zotero/storage/8IRZ9MUW/Leggett et al. - 2013 - The life of p just significant results are on the rise.pdf}
}

@article{liawClassificationRegressionRandomForest2002,
  author  = {Liaw, Andy and Wiener, Matthew},
  title   = {Classification and Regression by {{randomForest}}},
  journal = {R News},
  year    = {2002},
  volume  = {2},
  number  = {3},
  pages   = {18--22}
}

@misc{ListsStatisticsTopics2022,
  title = {Lists of Statistics Topics},
  year = {2022},
  month = apr,
  journal = {Wikipedia},
  urldate = {2025-07-22},
  abstract = {This article itemizes the various lists of statistics topics.},
  copyright = {Creative Commons Attribution-ShareAlike License},
  howpublished = {https://en.wikipedia.org/w/index.php?title=Lists\_of\_statistics\_topics\&oldid=1083298628},
  langid = {english},
  annotation = {Page Version ID: 1083298628},
  file = {/home/michaelb/Zotero/storage/TEPWW8FU/index.html}
}

@article{liuOpenAccessPublications2018,
  author     = {Liu, Weishu and Li, Yanchao},
  title      = {Open Access Publications in Sciences and Social Sciences: {{A}} Comparative Analysis},
  shorttitle = {Open Access Publications in Sciences and Social Sciences},
  journal    = {Learned Publishing},
  year       = {2018},
  volume     = {31},
  number     = {2},
  pages      = {107--119},
  issn       = {1741-4857},
  doi        = {10.1002/leap.1114},
  urldate    = {2025-08-03},
  abstract   = {In this paper, we conduct a comparative analysis to examine the characteristics and evolutionary trends of open access (OA) publications in natural and social sciences. We use data recorded by Science Citation Index Expanded, Social Sciences Citation Index, and Journal Citation Reports during 2001--2015 as the main source. We then comparatively analyse the characteristics of natural and social sciences in terms of historical evolution, main contributors, and distribution of OA journals and publications across different languages, disciplines, and impact factor quartiles. Our results suggest that both natural and social sciences experienced dramatic growth of OA journals since 2009, but the share of social science OA journals within journal impact factor quartile 1 is much lower than that of natural sciences. While natural and social sciences share some similarities in OA publishing activities, such as main countries of contribution, they differ greatly in dimensions such as OA ratio across specific disciplines, countries, and publishing languages. We acknowledge that OA publishing offers a level playing field for traditionally disadvantaged languages, countries, and scientific disciplines, but meanwhile, the advancement of high-quality OA publishing needs more targeted and sophisticated approaches to tackle differences in natural and social sciences.},
  copyright  = {{\copyright} 2017 The Author(s). Learned Publishing {\copyright} 2017 ALPSP.},
  langid     = {english},
  file       = {/home/michaelb/Zotero/storage/TUNVATGA/Liu and Li - 2018 - Open access publications in sciences and social sciences A comparative analysis.pdf;/home/michaelb/Zotero/storage/BRSVVL9G/leap.html}
}

@article{liuQuantitativeBiasAnalysis2023,
  author     = {Liu, Jin and Wang, Shiyuan and Shao, Fang},
  title      = {Quantitative Bias Analysis of Prevalence under Misclassification: Evaluation Indicators, Calculation Method and Case Analysis},
  shorttitle = {Quantitative Bias Analysis of Prevalence under Misclassification},
  journal    = {International Journal of Epidemiology},
  year       = {2023},
  month      = jun,
  volume     = {52},
  number     = {3},
  pages      = {942--951},
  issn       = {0300-5771},
  doi        = {10.1093/ije/dyac239},
  urldate    = {2025-09-03},
  abstract   = {Prevalence estimates are fundamental to epidemiological studies. Although they are highly vulnerable to misclassification bias, the risk of bias assessment of prevalence estimates is often neglected. Quantitative bias analysis (QBA) can effectively estimate misclassification bias in epidemiological studies; however, relatively few applications are identified. One reason for its low usage is the lack of knowledge and tools for these methods among researchers. To expand existing evaluation methods, based on the QBA principles, three indicators are proposed. One is the relative bias that quantifies the bias direction through its signs and the bias magnitude through its quantity. The second is the critical point of positive test proportion in case of a misclassification bias that is equal to zero. The third is the bound of positive test proportion equal to adjusted prevalence at misclassification bias level {$\alpha$}. These indicators express the magnitude, direction and uncertainty of the misclassification bias of prevalence estimates, respectively. Using these indicators, it was found that slight oscillations of the positive test proportion within a certain range can lead to substantial increases in the misclassification bias. Hence, researchers should account for misclassification error analytically when interpreting the significance of adjusted prevalence for epidemiological decision making. This highlights the importance of applying QBA to these analyses. In this article, we have used three real-world cases to illustrate the characteristics and calculation methods of presented indicators. To facilitate application, an Excel-based calculation tool is provided.},
  file       = {/home/michaelb/Zotero/storage/C2VXDBHR/Liu et al. - 2023 - Quantitative bias analysis of prevalence under misclassification evaluation indicators, calculation.pdf;/home/michaelb/Zotero/storage/LAF4EVBS/dyac239.html}
}

@article{loggPreregistrationWeighingCosts2021,
  author     = {Logg, Jennifer M. and Dorison, Charles A.},
  title      = {Pre-Registration: {{Weighing}} Costs and Benefits for Researchers},
  shorttitle = {Pre-Registration},
  journal    = {Organizational Behavior and Human Decision Processes},
  year       = {2021},
  month      = nov,
  volume     = {167},
  pages      = {18--27},
  issn       = {0749-5978},
  doi        = {10.1016/j.obhdp.2021.05.006},
  urldate    = {2024-11-06},
  abstract   = {In the past decade, the social and behavioral sciences underwent a methodological revolution, offering practical prescriptions for improving the replicability and reproducibility of research results. One key to reforming science is a simple and scalable practice: pre-registration. Pre-registration constitutes pre-specifying an analysis plan prior to data collection. A growing chorus of articles discusses the prescriptive, field-wide benefits of pre-registration. To increase adoption, however, scientists need to know who currently pre-registers and understand perceived barriers to doing so. Thus, we weigh costs and benefits of pre-registration. Our survey of researchers reveals generational differences in who pre-registers and uncertainty regarding how pre-registration benefits individual researchers. We leverage these data to directly address researchers' uncertainty by clarifying why pre-registration improves the research process itself. Finally, we discuss how to pre-register and compare available resources. The present work examines the who, why, and how of pre-registration in order to weigh the costs and benefits of pre-registration to researchers and motivate continued adoption.},
  keywords   = {Methodology,Open science,Pre-registration,Replication},
  file       = {/home/michaelb/Zotero/storage/MT3PITBE/Logg and Dorison - 2021 - Pre-registration Weighing costs and benefits for researchers.pdf;/home/michaelb/Zotero/storage/4IG443GZ/S0749597821000649.html}
}

@misc{LogisticRegressionGlmnet,
  title = {Logistic Regression via {{glmnet}}},
  urldate = {2025-08-01},
  abstract = {glmnet::glmnet() fits a generalized linear model for binary outcomes. A linear combination of the predictors is used to model the log odds of an event.},
  howpublished = {https://parsnip.tidymodels.org/reference/details\_logistic\_reg\_glmnet.html},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/XKFP57XW/details_logistic_reg_glmnet.html}
}

@article{lokottetyanaPoliticsInternetFreedom2023,
  title = {The Politics of Internet Freedom Rankings},
  author = {Lokot, Tetyana and Wijermars, Mari{\"e}lle},
  year = {2023},
  month = jun,
  journal = {Internet Policy Review},
  volume = {12},
  number = {2},
  publisher = {{Alexander von Humboldt Institute for Internet and Society gGmbH}},
  doi = {10.14763/2023.2.1710},
  url = {https://policyreview.info/articles/analysis/politics-of-internet-freedom-rankings},
  urldate = {2025-07-26},
  abstract = {International rankings play an active role in defining the issue they claim to capture and giving the issue salience by presenting it as a matter of global concern. As internet access expanded globally, the past two decades have seen a rapid proliferation of indexes measuring and comparing the state of internet freedom around the globe. This article examines the politics of these rankings, e.g. Freedom House's Freedom on the Net, that have become powerful ``global pattern-setters'' for how internet freedom is understood and are used as tools of political or diplomatic influence. We adopt a relational approach to explain how and why such a complex landscape of internet freedom rankings has emerged and identify how the ranking organisations' varying approaches to capturing internet freedom have played a role in defining and legitimating it as an issue of importance. Since both the uses of the internet and discussions about defining what freedom means in relation to it have developed so rapidly, we argue that the complexity of internet freedom poses unique challenges and has required ranking organisations to continually respond to these developments, negotiating their authority in relation to other actors in their field.},
  copyright = {Creative Commons Attribution 3.0 Germany},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/G87CQFMM/Lokot (Tetyana) and Wijermars (Mariëlle) - 2023 - The politics of internet freedom rankings.pdf;/home/michaelb/Zotero/storage/8FKA4LPG/politics-of-internet-freedom-rankings.html}
}

@misc{MachineLearningGlossary,
  title        = {Machine {{Learning Glossary}}},
  howpublished = {https://developers.google.com/machine-learning/glossary},
  journal      = {Google for Developers},
  urldate      = {2025-07-22},
  langid       = {english}
}

@article{managoPreregistrationRegisteredReports2023,
  author     = {Manago, Bianca},
  title      = {Preregistration and {{Registered Reports}} in {{Sociology}}: {{Strengths}}, {{Weaknesses}}, and {{Other Considerations}}},
  shorttitle = {Preregistration and {{Registered Reports}} in {{Sociology}}},
  journal    = {The American Sociologist},
  year       = {2023},
  month      = mar,
  volume     = {54},
  number     = {1},
  pages      = {193--210},
  issn       = {1936-4784},
  doi        = {10.1007/s12108-023-09563-6},
  urldate    = {2024-11-06},
  abstract   = {Both within and outside of sociology, there are conversations about methods to reduce error and improve research quality---one such method is preregistration and its counterpart, registered reports. Preregistration is the process of detailing research questions, variables, analysis plans, etc. before conducting research. Registered reports take this one step further, with a paper being reviewed on the merit of these plans, not its findings. In this manuscript, I detail preregistration's and registered reports' strengths and weaknesses for improving the quality of sociological research. I conclude by considering the implications of a structural-level adoption of preregistration and registered reports. Importantly, I do not recommend that all sociologists use preregistration and registered reports for all studies. Rather, I discuss the potential benefits and genuine limitations of preregistration and registered reports for the individual sociologist and the discipline.},
  langid     = {english},
  keywords   = {Open science,Preregistration,Registered reports,Reproducibility,Transparency},
  file       = {/home/michaelb/Zotero/storage/BGQB764K/Manago - 2023 - Preregistration and Registered Reports in Sociology Strengths, Weaknesses, and Other Considerations.pdf}
}

@article{markowitzTracingAdoptionEffects2021,
  title = {Tracing the {{Adoption}} and {{Effects}} of {{Open Science}} in {{Communication Research}}},
  author = {Markowitz, David M and Song, Hyunjin and Taylor, Samuel Hardman},
  year = {2021},
  month = oct,
  journal = {Journal of Communication},
  volume = {71},
  number = {5},
  pages = {739--763},
  issn = {0021-9916},
  doi = {10.1093/joc/jqab030},
  urldate = {2024-11-06},
  abstract = {A significant paradigm shift is underway in communication research as open science practices (e.g., preregistration, open materials) are becoming more prevalent. The current work identified how much the field has embraced such practices and evaluated their impact on authors (e.g., citation rates). We collected 10,517 papers across 26 journals from 2010 to 2020, observing that 5.1\% of papers used or mentioned open science practices. Communication research has seen the rate of nonsignificant p-values (p {$>$} .055) increasing with the adoption of open science over time, but p-values just below p {$<$} .05 have not reduced with open science adoption. Open science adoption was unrelated to citation rate at the article level; however, it was inversely related to the journals' h-index. Our results suggest communication organizations and scholars have important work ahead to make open science more mainstream. We close with suggestions to increase open science adoption for the field at large.},
  file = {/home/michaelb/Zotero/storage/WBKICQTZ/Markowitz et al. - 2021 - Tracing the Adoption and Effects of Open Science in Communication Research.pdf;/home/michaelb/Zotero/storage/KV8S4HXI/6354844.html}
}

@article{martinez-plumedCRISPDMTwentyYears2021,
  title = {{{CRISP-DM Twenty Years Later}}: {{From Data Mining Processes}} to {{Data Science Trajectories}}},
  shorttitle = {{{CRISP-DM Twenty Years Later}}},
  author = {{Mart{\'i}nez-Plumed}, Fernando and {Contreras-Ochando}, Lidia and Ferri, C{\`e}sar and {Hern{\'a}ndez-Orallo}, Jos{\'e} and Kull, Meelis and Lachiche, Nicolas and {Ram{\'i}rez-Quintana}, Mar{\'i}a Jos{\'e} and Flach, Peter},
  year = {2021},
  month = aug,
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  volume = {33},
  number = {8},
  pages = {3048--3061},
  issn = {1558-2191},
  doi = {10.1109/TKDE.2019.2962680},
  urldate = {2025-08-02},
  abstract = {CRISP-DM (CRoss-Industry Standard Process for Data Mining) has its origins in the second half of the nineties and is thus about two decades old. According to many surveys and user polls it is still the de facto standard for developing data mining and knowledge discovery projects. However, undoubtedly the field has moved on considerably in twenty years, with data science now the leading term being favoured over data mining. In this paper we investigate whether, and in what contexts, CRISP-DM is still fit for purpose for data science projects. We argue that if the project is goal-directed and process-driven the process model view still largely holds. On the other hand, when data science projects become more exploratory the paths that the project can take become more varied, and a more flexible model is called for. We suggest what the outlines of such a trajectory-based model might look like and how it can be used to categorise data science projects (goal-directed, exploratory or data management). We examine seven real-life exemplars where exploratory activities play an important role and compare them against 51 use cases extracted from the NIST Big Data Public Working Group. We anticipate this categorisation can help project planning in terms of time and cost characteristics.},
  keywords = {Business,data mining,Data mining,Data models,Data science,Data science trajectories,data-driven methodologies,Knowledge discovery,knowledge discovery process,Standards,Trajectory},
  file = {/home/michaelb/Zotero/storage/DIGLGEHL/Martínez-Plumed et al. - 2021 - CRISP-DM Twenty Years Later From Data Mining Processes to Data Science Trajectories.pdf}
}

@article{marvellIndirectSimultaneity2019,
  author    = {Marvell, Thomas B.},
  title     = {Indirect Simultaneity},
  journal   = {Criminology \& Public Policy},
  year      = {2019},
  volume    = {18},
  number    = {1},
  pages     = {201--206},
  issn      = {1745-9133},
  doi       = {10.1111/1745-9133.12432},
  urldate   = {2025-08-07},
  copyright = {{\copyright} 2019 American Society of Criminology},
  langid    = {english},
  file      = {/home/michaelb/Zotero/storage/LWQVNHWH/Marvell - 2019 - Indirect simultaneity.pdf;/home/michaelb/Zotero/storage/SY5R9IFP/1745-9133.html}
}

@article{matsubayashiStatusOpenAccess2009,
  title = {Status of Open Access in the Biomedical Field in 2005},
  author = {Matsubayashi, Mamiko and Kurata, Keiko and Sakai, Yukiko and Morioka, Tomoko and Kato, Shinya and Mine, Shinji and Ueda, Shuichi},
  year = {2009},
  month = jan,
  journal = {Journal of the Medical Library Association},
  volume = {97},
  number = {1},
  pages = {4--11},
  issn = {1536-5050},
  doi = {10.3163/1536-5050.97.1.002},
  urldate = {2025-08-28},
  abstract = {Objectives: This study was designed to document the state of open access (OA) in the biomedical field in 2005. Methods: PubMed was used to collect bibliographic data on target articles published in 2005. PubMed, Google Scholar, Google, and OAIster were then used to establish the availability of free full text online for these publications. Articles were analyzed by type of OA, country, type of article, impact factor, publisher, and publishing model to provide insight into the current state of OA. Results: Twenty-seven percent of all the articles were accessible as OA articles. More than 70\% of the OA articles were provided through journal websites. Mid-rank commercial publishers often provided OA articles in OA journals, while society publishers tended to provide OA articles in the context of a traditional subscription model. The rate of OA articles available from the websites of individual authors or in institutional repositories was quite low. Discussion/Conclusions: In 2005, OA in the biomedical field was achieved under an umbrella of existing scholarly communication systems. Typically, OA articles were published as part of subscription journals published by scholarly societies. OA journals published by BioMed Central contributed to a small portion of all OA articles.},
  pmcid = {PMC2605039},
  pmid = {19159007},
  file = {/home/michaelb/Zotero/storage/FVDIBHQC/Matsubayashi et al. - 2009 - Status of open access in the biomedical field in 2005.pdf}
}

@article{matternWhyAcademicsUndershare2024,
  author     = {Mattern, Janice Bially and Kohlburn, Joseph and {Moulaison-Sandy}, Heather},
  title      = {Why Academics Under-Share Research Data: {{A}} Social Relational Theory},
  shorttitle = {Why Academics Under-Share Research Data},
  journal    = {Journal of the Association for Information Science and Technology},
  year       = {2024},
  volume     = {75},
  number     = {9},
  pages      = {988--1001},
  issn       = {2330-1643},
  doi        = {10.1002/asi.24938},
  urldate    = {2024-12-15},
  abstract   = {Despite their professed enthusiasm for open science, faculty researchers have been documented as not freely sharing their data; instead, if sharing data at all, they take a minimal approach. A robust research agenda in LIS has documented the data under-sharing practices in which they engage, and the motivations they profess. Using theoretical frameworks from sociology to complement research in LIS, this article examines the broader context in which researchers are situated, theorizing the social relational dynamics in academia that influence faculty decisions and practices relating to data sharing. We advance a theory that suggests that the academy has entered a period of transition, and faculty resistance to data sharing through foot-dragging is one response to shifting power dynamics. If the theory is borne out empirically, proponents of open access will need to find a way to encourage open academic research practices without undermining the social value of academic researchers.},
  copyright  = {{\copyright} 2024 The Author(s). Journal of the Association for Information Science and Technology published by Wiley Periodicals LLC on behalf of Association for Information Science and Technology.},
  langid     = {english},
  file       = {/home/michaelb/Zotero/storage/63BNN3ZN/Mattern et al. - 2024 - Why academics under-share research data A social relational theory.pdf;/home/michaelb/Zotero/storage/VKRAEZB8/asi.html}
}

@article{maxwellPsychologySufferingReplication2015,
  title = {Is Psychology Suffering from a Replication Crisis? {{What}} Does `Failure to Replicate' Really Mean?},
  shorttitle = {Is Psychology Suffering from a Replication Crisis?},
  author = {Maxwell, Scott E. and Lau, Michael Y. and Howard, George S.},
  year = {2015},
  journal = {American Psychologist},
  volume = {70},
  number = {6},
  pages = {487--498},
  publisher = {American Psychological Association Inc.},
  issn = {0003-066X},
  doi = {10.1037/a0039400},
  abstract = {Psychology has recently been viewed as facing a replication crisis because efforts to replicate past study findings frequently do not show the same result. Often, the first study showed a statistically significant result but the replication does not. Questions then arise about whether the first study results were false positives, and whether the replication study correctly indicates that there is truly no effect after all. This article suggests these so-called failures to replicate may not be failures at all, but rather are the result of low statistical power in single replication studies, and the result of failure to appreciate the need for multiple replications in order to have enough power to identify true effects. We provide examples of these power problems and suggest some solutions using Bayesian statistics and metaanalysis. Although the need for multiple replication studies may frustrate those who would prefer quick answers to psychology's alleged crisis, the large sample sizes typically needed to provide firm evidence will almost always require concerted efforts from multiple investigators. As a result, it remains to be seen how many of the recently claimed failures to replicate will be supported or instead may turn out to be artifacts of inadequate sample sizes and single study replications. {\copyright} 2015 American Psychological Association.},
  langid = {english},
  keywords = {Bayesian methods,Equivalence tests,False positive results,Metaanalysis,Statistical power},
  file = {/home/michaelb/Zotero/storage/HGWEUNHG/Maxwell et al. - 2015 - Is psychology suffering from a replication crisis What does 'failure to replicate' really mean.pdf;/home/michaelb/Zotero/storage/VNRLBAB8/84941060169.html}
}

@misc{mcancel7MetadataDepositSchema,
  type = {Website},
  title = {Metadata Deposit Schema 5.4.0},
  author = {{Crossref}},
  journal = {Crossref},
  urldate = {2025-07-14},
  abstract = {Beginning with deposit schema version 4.4.2, all Crossref schema releases are available in our GitLab schema repository as a bundle. Bundle 0.3.3 contains schema version 5.4.0 and associated files. Schema: crossref5.4.0.xsd Full documentation: 5.4.0 Crossref included schema: common5.4.0.xsd fundref.xsd AccessIndicators.xsd clinicaltrials.xsd relations.xsd languages5.4.0.xsd mediatypes5.4.0.xsd External imported schema: MathML JATS Changes from 5.3.1 type attribute added to citations element to allow citations to be labeled with a specific citation type (journal article, dataset, etc.) version number support added to journal article, content item (book chapter/segment, report chapter/segment, and standard chapter/segment), proceedings paper, posted content, database / dataset, report, standard, and dissertation status added to posted content record types number of allowed ISBN increased to 100 @language attribute values have been expanded to include ISO 639-2 values and moved to languages5.4.0.xsd `similarity-check' has been added as a crawler value to support Similarity Check crawling. The list of supported media types (labeled mime types in our schema) have been expanded and moved to mediatypes5.4.0.xsd The language values associated with the `language' attribute have been expanded to include ISO 639-2. The `language' attribute assigned to the `relations' xsd has not yet been updated, this will be included in a later update.},
  copyright = {CC BY 4.0},
  howpublished = {https://www.crossref.org/documentation/schema-library/metadata-deposit-schema-5-4-0/},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/2IYUHFGA/metadata-deposit-schema-5-4-0.html}
}

@article{mccarleyPsychometricCurvesReveal2021,
  title = {Psychometric {{Curves Reveal Three Mechanisms}} of {{Vigilance Decrement}}},
  author = {McCarley, Jason S. and Yamani, Yusuke},
  year = {2021},
  month = oct,
  journal = {Psychological Science},
  volume = {32},
  number = {10},
  pages = {1675--1683},
  publisher = {SAGE Publications Inc},
  issn = {0956-7976},
  doi = {10.1177/09567976211007559},
  urldate = {2025-08-07},
  abstract = {The vigilance decrement is a decline in signal detection rate that occurs over time on a sustained-attention task. The effect has typically been ascribed to conservative shifts of response bias and losses of perceptual sensitivity. Recent work, though, has suggested that sensitivity losses in vigilance tasks are spurious, and other findings have implied that attentional lapses contribute to vigilance failures. To test these possibilities, we used Bayesian hierarchical modeling to compare psychometric curves for the first and last blocks of a visual vigilance task. Participants were a convenience sample of 99 young adults. Data showed evidence for all three postulated mechanisms of vigilance loss: a conservative shift of response bias, a decrease in perceptual sensitivity, and a tendency toward more frequent attentional lapses. Results confirm that sensitivity losses are possible in a sustained-attention task but indicate that mental lapses can also contribute to the vigilance decrement.},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/T3BYHUI5/McCarley and Yamani - 2021 - Psychometric Curves Reveal Three Mechanisms of Vigilance Decrement.pdf}
}

@article{mccarthyNewEconomicsSociological2002,
  title = {New {{Economics}} of {{Sociological Criminology}}},
  author = {McCarthy, Bill},
  year = {2002},
  month = aug,
  journal = {Annual Review of Sociology},
  volume = {28},
  number = {1},
  pages = {417--442},
  publisher = {Annual Reviews},
  issn = {0360-0572, 1545-2115},
  doi = {10.1146/annurev.soc.28.110601.140752},
  urldate = {2025-07-22},
  abstract = {This paper begins with a summary of the rational choice approach and its implications for the study of criminal behavior. I then review research on offending that uses the rational choice approach in conjunction with more sociological orientations. I also summarize research on game theory and demonstrate how it can be effectively used to understand and predict criminal decision-making. I argue that, contrary to the assessment of many criminologists, rational choice approach and game theory insights can be combined profitably with sociological perspectives to advance the understanding and prediction of criminal behavior.},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/RQVT79NN/annurev.soc.28.110601.html}
}

@article{mcneeleyReplicationCriminologyNecessary2015,
  title = {Replication in Criminology: {{A}} Necessary Practice},
  shorttitle = {Replication in Criminology},
  author = {McNeeley, Susan and Warner, Jessica J.},
  year = {2015},
  month = sep,
  journal = {European Journal of Criminology},
  volume = {12},
  number = {5},
  pages = {581--597},
  publisher = {SAGE Publications},
  issn = {1477-3708},
  doi = {10.1177/1477370815578197},
  urldate = {2025-08-29},
  abstract = {Although researchers acknowledge the importance of replication in building scientific knowledge, replication studies seem to be published infrequently. The present study examines the extent to which replications are conducted in criminology. We conduct a content analysis of the five most influential journals in criminology. We also compare the replication rate in criminology with that in the social sciences and natural sciences. The results show that replication research is rarely published in these disciplines. In criminology journals in particular, replication studies constitute just over 2 percent of the articles published between 2006 and 2010. Further, those replication studies that were published in criminology journals in that period tended to conflict with the original studies. These findings call into question the utility of empirical results published in criminology journals for developing theory and policy. Strategies for promoting replication research in criminology are suggested.},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/7X398HTH/McNeeley and Warner - 2015 - Replication in criminology A necessary practice.pdf}
}

@book{menardAppliedLogisticRegression2002,
  title = {Applied {{Logistic Regression Analysis}}},
  author = {Menard, Scott},
  year = {2002},
  publisher = {SAGE Publications, Inc.},
  doi = {10.4135/9781412983433},
  urldate = {2025-08-01},
  abstract = {The focus in Applied Logistic Regression Analysis, Second Edition, is again on logistic regression models for individual level data, but aggregate or grouped},
  isbn = {978-1-4129-8343-3},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/4NN97WXE/Menard - 2002 - Applied Logistic Regression Analysis.pdf;/home/michaelb/Zotero/storage/5QH2GI4Z/Applied Logistic Regression Analysis (Quantitative -- Dr_ Scott Menard -- Quantitative applications in the social sciences, v_ 106, -- SAGE -- 9780761922087 -- 2bdd25f639f9079531d6f53eb6722df8 -- Anna’s Ar.pdf}
}

@inbook{menardLinearRegressionLogistic2002,
  title = {Linear {{Regression}} and the {{Logistic Regression Model}}},
  booktitle = {Applied {{Logistic Regression Analysis}}},
  author = {Menard, Scott},
  year = {2002},
  publisher = {SAGE Publications, Inc.},
  doi = {10.4135/9781412983433},
  urldate = {2025-08-01},
  collaborator = {Menard, Scott},
  isbn = {978-1-4129-8343-3},
  langid = {english}
}

@article{mertensPreregistrationAnalysesPreexisting2019,
  title = {Preregistration of {{Analyses}} of {{Preexisting Data}}},
  author = {Mertens, Ga{\"e}tan and Krypotos, Angelos-Miltiadis},
  year = {2019},
  month = aug,
  journal = {Psychologica Belgica},
  volume = {59},
  number = {1},
  issn = {0033-2879},
  doi = {10.5334/pb.493},
  urldate = {2024-11-06},
  abstract = {Psychologica Belgica is the official journal of the Belgian Association for Psychological Sciences (BAPS). BAPS promotes the development of psychological sciences in Belgium, at both fundamental and applied research levels. The journal ensures rigorous peer-review to maintain research integrity. Psychological Belgica makes publications available online as soon as they are finalised. All publications are open access, making research available free of charge and without delay. The journal has a 2022 Impact Factor of 2.0 and a 5 year impact factor of 2.1. Subscribe to content alerts and other journal news here. You can also follow the journal on ResearchGate.},
  langid = {american},
  file = {/home/michaelb/Zotero/storage/GZRSX45A/Mertens and Krypotos - 2019 - Preregistration of Analyses of Preexisting Data.pdf}
}

@misc{MIADeutschBibliothek,
  title = {{{MIA Deutsch}} - {{Bibliothek}}},
  urldate = {2025-08-30},
  howpublished = {https://www.marxists.org/deutsch/archiv/index.htm},
  file = {/home/michaelb/Zotero/storage/KZ8JJHBE/index.html}
}

@manual{milborrowRpartplotPlotRpart2025,
  type = {Manual},
  title = {{{rpart.plot}}: {{Plot}} '{{rpart}}' Models: {{An}} Enhanced Version of '{{plot.rpart}}'},
  author = {Milborrow, Stephen},
  year = {2025}
}

@manual{milborrowRpartplotPlotRpart2025a,
  type = {Manual},
  title = {{{rpart.plot}}: {{Plot}} '{{rpart}}' Models: {{An}} Enhanced Version of '{{plot.rpart}}'},
  author = {Milborrow, Stephen},
  year = {2025},
  doi = {10.32614/CRAN.package.rpart.plot}
}

@article{mironLastBiteBIT2014,
  title = {The {{Last Bite}} of the {{BITs}}---{{Supremacy}} of {{EU Law}} versus {{Investment Treaty Arbitration}}},
  shorttitle = {The {{Last Bite}} of the {{BITs}}},
  author = {Miron, Smaranda},
  year = {2014},
  month = may,
  journal = {European Law Journal},
  volume = {20},
  number = {3},
  pages = {332--345},
  issn = {1351-5993, 1468-0386},
  doi = {10.1111/eulj.12039},
  urldate = {2025-08-05},
  abstract = {According to Article 267 TFEU, national courts of the EU Member States can (and sometimes must) ask for a preliminary ruling from the Court of Justice on the interpretation and application of Community law, including international treaties and recommendations, and on the validity of Community secondary legislation. In this way, it is ensured that EU citizens are treated equally throughout the Union. However, this is not applicable when it comes to arbitral proceedings, be they commercial or investment arbitrations. The Court does not accept references for preliminary rulings from arbitral tribunals. For this reason, respondent states in international arbitral proceedings have argued that arbitration and EU law are utterly incompatible. In their submissions as respondents in arbitral proceedings, EU Member States have argued that, as a result of EU accession, bilateral investment treaties (BITs) have been automatically terminated. In subsidiary, they sometimes claim that, due to their incompatibility with EU law, BITs cannot apply. But if BITs are not applicable anymore, there are few remedies left for investors within the EU.},
  copyright = {http://onlinelibrary.wiley.com/termsAndConditions\#vor},
  langid = {english}
}

@misc{MischbeckOpenScienceReviewOpenScienceReview,
  title = {Mischbeck/{{OpenScienceReview}} - {{OpenScienceReview}} - {{Gitea}}},
  urldate = {2024-12-19},
  howpublished = {https://git.mischbeck.de/mischbeck/OpenScienceReview},
  file = {/home/michaelb/Zotero/storage/D9M82ZBY/OpenScienceReview.html}
}

@misc{moneva2025attitudes,
  title = {Attitudes and Barriers to Open Science Practices: A Mixed-Methods Analysis at a Criminological Research Institute},
  author = {Moneva, Asier and Bernasco, Wim and {van de Weijer}, Steve and V{\"o}lker, Beate and Nivette, Amy},
  year = {2025},
  publisher = {{Netherlands Institute for the Study of Crime and Law Enforcement (NSCR)}},
  address = {De Boelelaan 1077, 1081 HV Amsterdam, The Netherlands},
  archiveprefix = {Netherlands Institute for the Study of Crime and Law Enforcement (NSCR)},
  howpublished = {Preprint},
  keywords = {attitudes,challenges,criminology,institutional change,open science,opportunities},
  file = {/home/michaelb/Zotero/storage/F7VX6MBU/Moneva et al. - 2025 - Attitudes and barriers to open science practices a mixed-methods analysis at a criminological resea.pdf}
}

@inproceedings{moreReviewRandomForest2017,
  title = {Review of Random Forest Classification Techniques to Resolve Data Imbalance},
  booktitle = {2017 1st {{International Conference}} on {{Intelligent Systems}} and {{Information Management}} ({{ICISIM}})},
  author = {More, A. S. and Rana, Dipti P.},
  year = {2017},
  month = oct,
  pages = {72--78},
  doi = {10.1109/ICISIM.2017.8122151},
  urldate = {2025-08-22},
  abstract = {In this current age, numerous ranges of real word applications with imbalanced dataset is one of the foremost focal point of researcher's inattention. There is the enormous increment of data generation and imbalance within dataset. Processing and knowledge extraction of huge amount of imbalanced data becomes a challenge related with space and time necessities. Generally there is a list of an assortment of factual humanity applications which deals with unequal data sample division in to number of classes. Due to this division of data either of class goes into majority or minority with comparably less data count. This outnumbering of data sample in either of one class directs towards the handling of minority class and target on remarkable reduction in error rate. The standard learning methods do not directly focus on this type of classes. Random Forest Classification (RFC) is an ensemble approach that utilizes a number of classifiers to work together in order to identify the class label for unlabeled instances. This approach has proved its high accuracy and superiority with imbalanced datasets. This classifier provides various techniques to resolve class imbalance problem. This paper summarizes, the literature survey from 2000 to 2016 of various techniques related to RFC to resolve class imbalance. Specifically Weighted Random Forest (WRF), Balanced Random Forest (BRF), Sampling (Under Sampling (US)) and Down Sampling (DS), Cost Sensitive Methods have been adapted more to till date. The limitation of this numerous literature is researchers can focus on dynamic integration techniques to resolve class imbalance and increase robustness and versatility of classification.},
  keywords = {Balanced Random Forest,Bibliographies,Decision trees,Dynamic Integration Technique,Noise measurement,Random Forest Classification,Sampling,Sampling methods,Support vector machines,Training,Training data,Weighted Random Forest},
  file = {/home/michaelb/Zotero/storage/IVJA2VYY/8122151.html}
}

@article{munafoManifestoReproducibleScience2017,
  title = {A Manifesto for Reproducible Science},
  author = {Munaf{\`o}, Marcus R. and Nosek, Brian A. and Bishop, Dorothy V. M. and Button, Katherine S. and Chambers, Christopher D. and {Percie du Sert}, Nathalie and Simonsohn, Uri and Wagenmakers, Eric-Jan and Ware, Jennifer J. and Ioannidis, John P. A.},
  year = {2017},
  month = jan,
  journal = {Nature Human Behaviour},
  volume = {1},
  number = {1},
  pages = {0021},
  publisher = {Nature Publishing Group},
  issn = {2397-3374},
  doi = {10.1038/s41562-016-0021},
  urldate = {2025-08-28},
  abstract = {Improving the reliability and efficiency of scientific research will increase the credibility of the published scientific literature and accelerate discovery. Here we argue for the adoption of measures to optimize key elements of the scientific process: methods, reporting and dissemination, reproducibility, evaluation and incentives. There is some evidence from both simulations and empirical studies supporting the likely effectiveness of these measures, but their broad adoption by researchers, institutions, funders and journals will require iterative evaluation and improvement. We discuss the goals of these measures, and how they can be implemented, in the hope that this will facilitate action toward improving the transparency, reproducibility and efficiency of scientific research.},
  copyright = {2017 Macmillan Publishers Limited},
  langid = {english},
  keywords = {Social sciences},
  file = {/home/michaelb/Zotero/storage/A37QJREV/Munafò et al. - 2017 - A manifesto for reproducible science.pdf}
}

@inbook{murphy5BayesianStatistics2012,
  title = {{{Bayesian Statistics}}},
  chapter = {5},
  booktitle = {Machine Learning: A Probabilistic Perspective},
  author = {Murphy, Kevin P.},
  year = {2012},
  series = {Adaptive Computation and Machine Learning Series},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  collaborator = {Murphy, Kevin P.},
  isbn = {978-0-262-01802-9},
  lccn = {Q325.5 .M87 2012},
  keywords = {Machine learning,Probabilities}
}

@inbook{murphyGenerativeModelsDiscrete2012,
  title = {Generative Models for Discrete Data},
  booktitle = {Machine Learning: A Probabilistic Perspective},
  author = {Murphy, Kevin P.},
  year = {2012},
  series = {Adaptive Computation and Machine Learning Series},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  collaborator = {Murphy, Kevin P.},
  isbn = {978-0-262-01802-9},
  lccn = {Q325.5 .M87 2012},
  keywords = {Machine learning,Probabilities}
}

@book{murphyMachineLearningProbabilistic2012,
  title = {Machine Learning: A Probabilistic Perspective},
  shorttitle = {Machine Learning},
  author = {Murphy, Kevin P.},
  year = {2012},
  series = {Adaptive Computation and Machine Learning Series},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  isbn = {978-0-262-01802-9},
  lccn = {Q325.5 .M87 2012},
  keywords = {Machine learning,Probabilities},
  file = {/home/michaelb/Zotero/storage/7U3JHCH4/Murphy - 2012 - Machine learning a probabilistic perspective.pdf}
}

@inproceedings{naiduReviewEvaluationMetrics2023,
  title = {A {{Review}} of {{Evaluation Metrics}} in {{Machine Learning Algorithms}}},
  booktitle = {Artificial {{Intelligence Application}} in {{Networks}} and {{Systems}}},
  author = {Naidu, Gireen and Zuva, Tranos and Sibanda, Elias Mmbongeni},
  editor = {Silhavy, Radek and Silhavy, Petr},
  year = {2023},
  pages = {15--25},
  publisher = {Springer International Publishing},
  address = {Cham},
  doi = {10.1007/978-3-031-35314-7_2},
  abstract = {With the increase in the adoption rate of machine learning algorithms in multiple sectors, the need for accurate measurement and assessment is imperative, especially when classifiers are applied to real world applications. Determining which are the most appropriate evaluation metrics to effectively assess and evaluate the performance of a binary, multi-class and multi-labelled classifier needs to be further understood. Another significant challenge impacting research is that results from models that are similar in nature cannot be adequately compared if the criteria for the measurement and evaluation of these models are not standardized. This review paper aims at highlighting the various evaluation metrics being applied in research and the non-standardization of evaluation metrics to measure the classification results of the model. Although Accuracy, Precision, Recall and F1-Score are the most applied evaluation metrics, there are certain limitations when considering these metrics in isolation. Other metrics such as ROC{\textbackslash}AUC and Kappa statistics have proven to provide additional insightful into the effectiveness of an algorithms adequacy and should also be considered when evaluating the effectiveness of binary, multi-class and multi-labelled classifiers. The adoption of a standardized and consistent evaluation methodology should be explored as an area of future work.},
  isbn = {978-3-031-35314-7},
  langid = {english},
  keywords = {accuracy,AUC,Evaluation metrics,machine learning,ROC},
  file = {/home/michaelb/Zotero/storage/5HIWIX5Y/Naidu et al. - 2023 - A Review of Evaluation Metrics in Machine Learning Algorithms.pdf}
}

@misc{NaturalLanguageProcessing,
  title = {Natural {{Language Processing}} in {{Mixed-methods Text Analysis}}: {{A Workflow Approach}} - {{Google Suche}}},
  urldate = {2025-04-01},
  howpublished = {https://www.google.com/search?client=firefox-b-d\&channel=entpr\&q=Natural+Language+Processing+in+Mixed-methods+Text+Analysis\%3A+A+Workflow+Approach},
  file = {/home/michaelb/Zotero/storage/JUES2G8X/search.html}
}

@article{nembriniRevivalGiniImportance2018,
  title = {The Revival of the {{Gini}} Importance?},
  author = {Nembrini, Stefano and K{\"o}nig, Inke R. and Wright, Marvin N.},
  year = {2018},
  month = nov,
  journal = {Bioinformatics},
  volume = {34},
  number = {21},
  pages = {3711--3718},
  issn = {1367-4803},
  doi = {10.1093/bioinformatics/bty373},
  urldate = {2025-08-22},
  abstract = {Random forests are fast, flexible and represent a robust approach to analyze high dimensional data. A key advantage over alternative machine learning algorithms are variable importance measures, which can be used to identify relevant features or perform variable selection. Measures based on the impurity reduction of splits, such as the Gini importance, are popular because they are simple and fast to compute. However, they are biased in favor of variables with many possible split points and high minor allele frequency. We set up a fast approach to debias impurity-based variable importance measures for classification, regression and survival forests. We show that it creates a variable importance measure which is unbiased with regard to the number of categories and minor allele frequency and almost as fast as the standard impurity importance. As a result, it is now possible to compute reliable importance estimates without the extra computing cost of permutations. Further, we combine the importance measure with a fast testing procedure, producing p-values for variable importance with almost no computational overhead to the creation of the random forest. Applications to gene expression and genome-wide association data show that the proposed method is powerful and computationally efficient. The procedure is included in the ranger package, available at https://cran.r-project.org/package=ranger and https://github.com/imbs-hl/ranger. Supplementary data are available at Bioinformatics online.},
  file = {/home/michaelb/Zotero/storage/Y2U5ITK3/Nembrini et al. - 2018 - The revival of the Gini importance.pdf;/home/michaelb/Zotero/storage/SA4W3NQK/bty373.html}
}

@article{nororiAddressingBiasBig2021,
  title = {Addressing Bias in Big Data and {{AI}} for Health Care: {{A}} Call for Open Science},
  shorttitle = {Addressing Bias in Big Data and {{AI}} for Health Care},
  author = {Norori, Natalia and Hu, Qiyang and Aellen, Florence Marcelle and Faraci, Francesca Dalia and Tzovara, Athina},
  year = {2021},
  month = oct,
  journal = {Patterns},
  volume = {2},
  number = {10},
  pages = {100347},
  issn = {2666-3899},
  doi = {10.1016/j.patter.2021.100347},
  urldate = {2024-12-13},
  abstract = {Artificial intelligence (AI) has an astonishing potential in assisting clinical decision making and revolutionizing the field of health care. A major open challenge that AI will need to address before its integration in the clinical routine is that of algorithmic bias. Most AI algorithms need big datasets to learn from, but several groups of the human population have a long history of being absent or misrepresented in existing biomedical datasets. If the training data is misrepresentative of the population variability, AI is prone to reinforcing bias, which can lead to fatal outcomes, misdiagnoses, and lack of generalization. Here, we describe the challenges in rendering AI algorithms fairer, and we propose concrete steps for addressing bias using tools from the field of open science.},
  keywords = {artificial intelligence,bias,data standards,deep learning,health care,open science,participatory science},
  file = {/home/michaelb/Zotero/storage/38I8DX6G/Norori et al. - 2021 - Addressing bias in big data and AI for health care A call for open science.pdf;/home/michaelb/Zotero/storage/KHTDQ2SW/S2666389921002026.html}
}

@article{nosekPreregistrationRevolution2018a,
  title = {The Preregistration Revolution},
  author = {Nosek, Brian A. and Ebersole, Charles R. and DeHaven, Alexander C. and Mellor, David T.},
  year = {2018},
  month = mar,
  journal = {Proceedings of the National Academy of Sciences},
  volume = {115},
  number = {11},
  pages = {2600--2606},
  publisher = {Proceedings of the National Academy of Sciences},
  doi = {10.1073/pnas.1708274114},
  urldate = {2025-08-26},
  abstract = {Progress in science relies in part on generating hypotheses with existing observations and testing hypotheses with new observations. This distinction between postdiction and prediction is appreciated conceptually but is not respected in practice. Mistaking generation of postdictions with testing of predictions reduces the credibility of research findings. However, ordinary biases in human reasoning, such as hindsight bias, make it hard to avoid this mistake. An effective solution is to define the research questions and analysis plan before observing the research outcomes---a process called preregistration. Preregistration distinguishes analyses and outcomes that result from predictions from those that result from postdictions. A variety of practical strategies are available to make the best possible use of preregistration in circumstances that fall short of the ideal application, such as when the data are preexisting. Services are now available for preregistration across all disciplines, facilitating a rapid increase in the practice. Widespread adoption of preregistration will increase distinctiveness between hypothesis generation and hypothesis testing and will improve the credibility of research findings.},
  file = {/home/michaelb/Zotero/storage/NVKDDNWI/Nosek et al. - 2018 - The preregistration revolution.pdf}
}

@article{nosekPromotingOpenResearch2015,
  title = {Promoting an Open Research Culture},
  author = {Nosek, B. A. and Alter, G. and Banks, G. C. and Borsboom, D. and Bowman, S. D. and Breckler, S. J. and Buck, S. and Chambers, C. D. and Chin, G. and Christensen, G. and Contestabile, M. and Dafoe, A. and Eich, E. and Freese, J. and Glennerster, R. and Goroff, D. and Green, D. P. and Hesse, B. and Humphreys, M. and Ishiyama, J. and Karlan, D. and Kraut, A. and Lupia, A. and Mabry, P. and Madon, T. and Malhotra, N. and {Mayo-Wilson}, E. and McNutt, M. and Miguel, E. and Paluck, E. Levy and Simonsohn, U. and Soderberg, C. and Spellman, B. A. and Turitto, J. and VandenBos, G. and Vazire, S. and Wagenmakers, E. J. and Wilson, R. and Yarkoni, T.},
  year = {2015},
  month = jun,
  journal = {Science},
  volume = {348},
  number = {6242},
  pages = {1422--1425},
  publisher = {American Association for the Advancement of Science},
  doi = {10.1126/science.aab2374},
  urldate = {2024-12-18},
  file = {/home/michaelb/Zotero/storage/A32SAIJU/Nosek et al. - 2015 - Promoting an open research culture.pdf}
}

@article{nosekRegisteredReports2014,
  title = {Registered {{Reports}}},
  author = {Nosek, Brian A. and Lakens, Dani{\"e}l},
  year = {2014},
  month = may,
  journal = {Social Psychology},
  volume = {45},
  number = {3},
  pages = {137--141},
  publisher = {Hogrefe Publishing},
  issn = {1864-9335},
  doi = {10.1027/1864-9335/a000192},
  urldate = {2024-12-16},
  file = {/home/michaelb/Zotero/storage/DYGVTARA/Nosek and Lakens - 2014 - Registered Reports.pdf}
}

@article{noyExperimentalEvidenceProductivity2023,
  title = {Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence},
  author = {Noy, Shakked and Zhang, Whitney},
  year = {2023},
  month = jul,
  journal = {Science},
  volume = {381},
  number = {6654},
  pages = {187--192},
  publisher = {American Association for the Advancement of Science},
  doi = {10.1126/science.adh2586},
  urldate = {2025-03-20},
  abstract = {We examined the productivity effects of a generative artificial intelligence (AI) technology, the assistive chatbot ChatGPT, in the context of midlevel professional writing tasks. In a preregistered online experiment, we assigned occupation-specific, incentivized writing tasks to 453 college-educated professionals and randomly exposed half of them to ChatGPT. Our results show that ChatGPT substantially raised productivity: The average time taken decreased by 40\% and output quality rose by 18\%. Inequality between workers decreased, and concern and excitement about AI temporarily rose. Workers exposed to ChatGPT during the experiment were 2 times as likely to report using it in their real job 2 weeks after the experiment and 1.6 times as likely 2 months after the experiment.},
  file = {/home/michaelb/Zotero/storage/SE5F68JG/Noy und Zhang - 2023 - Experimental evidence on the productivity effects of generative artificial intelligence.pdf}
}

@misc{omalleyLinuxMars2021,
  title = {Linux on {{Mars}}!},
  author = {O'Malley, James},
  year = {2021},
  month = aug,
  journal = {ITPro},
  urldate = {2024-03-11},
  abstract = {Open-source software on the Perseverance mission is helping NASA explore a GNU world},
  howpublished = {https://www.itpro.com/software/linux/360542/linux-on-mars},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/P97IG6R9/linux-on-mars.html}
}

@article{oomsJsonlitePackagePractical2014,
  title = {The {{jsonlite}} Package: A Practical and Consistent Mapping between {{JSON}} Data and {{R}} Objects},
  author = {Ooms, Jeroen},
  year = {2014},
  journal = {arXiv:1403.2805 [stat.CO]},
  eprint = {1403.2805},
  primaryclass = {stat.CO},
  archiveprefix = {arXiv}
}

@manual{oomsPdftoolsTextExtraction2025,
  type = {Manual},
  title = {{{pdftools}}: {{Text}} Extraction, Rendering and Converting of {{PDF}} Documents},
  author = {Ooms, Jeroen},
  year = {2025}
}

@article{oparindeKeyDevelopmentsGlobal2024,
  title = {Key Developments in Global Scholarly Publishing: {{Negotiating}} a Double-Edged Sword},
  shorttitle = {Key Developments in Global Scholarly Publishing},
  author = {Oparinde, Kunle and Govender, Vaneshree and Adedokun, Theophilus and Agbede, Grace Temiloluwa and Thungo, Sithabile},
  year = {2024},
  journal = {Learned Publishing},
  volume = {37},
  number = {3},
  pages = {e1604},
  issn = {1741-4857},
  doi = {10.1002/leap.1604},
  urldate = {2024-12-11},
  abstract = {Over the last few years, the publishing industry has experienced significant changes and developments, most of which have had a positive influence on scholarly publishing. For instance, the gradual popularity of open access publishing has contributed to the wider access and readership of published materials. Also, the recent development in the abilities of artificial intelligence (AI) tools to assist in the publication process is laudable for its potential. The gradual shift from print to online publication is also a commendable development in global publishing. Not without their own challenges, these developments, among others, have mostly impacted global publishing in a positive way. In the current study, the researchers' argument stems from the notion that although these developments are invaluable, there are accompanying impediments that publishing professionals as well as publishing outlets must consider. In response to these developments, role-players in the publishing industry must constantly reassess their publishing processes in order to carefully manage and negotiate what is termed by this study as a `double-edged sword' (capable of having positive and negative consequences). This study reviews existing studies, draws views from publishing experts, and seeks opinions from scholars to establish methods of negotiating some of the key developments in global publishing.},
  copyright = {{\copyright} 2024 The Authors. Learned Publishing published by John Wiley \& Sons Ltd on behalf of ALPSP.},
  langid = {english},
  keywords = {publishing professionals,scholarly publishing,transformation},
  file = {/home/michaelb/Zotero/storage/4HXZJ9T6/Oparinde et al. - 2024 - Key developments in global scholarly publishing Negotiating a double-edged sword.pdf;/home/michaelb/Zotero/storage/F47W47AW/leap.html}
}

@misc{openai_chatgpt_api_2025,
  title = {Chat Completions {{API}} ({{ChatGPT}})},
  author = {{OpenAI}},
  year = {2025}
}

@misc{ourresearchUnpaywallorgRESTAPI,
  type = {Database},
  title = {Unpaywall.Org - {{REST API}}},
  author = {{OurResearch}},
  urldate = {2025-05-08}
}

@article{p.simmonsPreregistrationWhyHow2021,
  title = {Pre-Registration: {{Why}} and {{How}}},
  shorttitle = {Pre-Registration},
  author = {Simmons, Joseph P. and Nelson, Leif D. and Simonsohn, Uri},
  year = {2021},
  journal = {Journal of Consumer Psychology},
  volume = {31},
  number = {1},
  pages = {151--162},
  issn = {1532-7663},
  doi = {10.1002/jcpy.1208},
  urldate = {2025-07-16},
  abstract = {In this article, we (1) discuss the reasons why pre-registration is a good idea, both for the field and individual researchers, (2) respond to arguments against pre-registration, (3) describe how to best write and review a pre-registration, and (4) comment on pre-registration's rapidly accelerating popularity. Along the way, we describe the (big) problem that pre-registration can solve (i.e., false positives caused by p-hacking), while also offering viable solutions to the problems that pre-registration cannot solve (e.g., hidden confounds or fraud). Pre-registration does not guarantee that every published finding will be true, but without it you can safely bet that many more will be false. It is time for our field to embrace pre-registration, while taking steps to ensure that it is done right.},
  copyright = {{\copyright} 2021 Society for Consumer Psychology},
  langid = {english},
  keywords = {Open Science,P-Hacking,Research Integrity,Research Transparency},
  file = {/home/michaelb/Zotero/storage/LC7Y268G/P. Simmons et al. - 2021 - Pre-registration Why and How.pdf;/home/michaelb/Zotero/storage/IJZZG9NE/jcpy.html}
}

@inproceedings{pangThumbsSentimentClassification2002,
  author     = {Pang, Bo and Lee, Lillian and Vaithyanathan, Shivakumar},
  title      = {Thumbs up? {{Sentiment Classification}} Using {{Machine Learning Techniques}}},
  shorttitle = {Thumbs Up?},
  booktitle  = {Proceedings of the 2002 {{Conference}} on {{Empirical Methods}} in {{Natural Language Processing}} ({{EMNLP}} 2002)},
  year       = {2002},
  month      = jul,
  pages      = {79--86},
  publisher  = {Association for Computational Linguistics},
  doi        = {10.3115/1118693.1118704},
  urldate    = {2025-07-25},
  file       = {/home/michaelb/Zotero/storage/5F6LDVQI/Pang et al. - 2002 - Thumbs up Sentiment Classification using Machine Learning Techniques.pdf}
}

@article{papavlasopoulouExploringChildrensLearning2019,
  author   = {Papavlasopoulou, Sofia and Giannakos, Michail N. and Jaccheri, Letizia},
  title    = {Exploring Children's Learning Experience in Constructionism-Based Coding Activities through Design-Based Research},
  journal  = {Computers in Human Behavior},
  year     = {2019},
  month    = oct,
  volume   = {99},
  pages    = {415--427},
  issn     = {0747-5632},
  doi      = {10.1016/j.chb.2019.01.008},
  urldate  = {2025-07-26},
  abstract = {Over the last few years, the integration of coding activities for children in K-12 education has flourished. In addition, novel technological tools and programming environments have offered new opportunities and increased the need to design effective learning experiences. This paper presents a design-based research (DBR) approach conducted over two years, based on constructionism-based coding experiences for children, following the four stages of DBR. Three iterations (cycles) were designed and examined in total, with participants aged 8--17 years old, using mixed methods. Over the two years, we conducted workshops in which students used a block-based programming environment (i.e., Scratch) and collaboratively created a socially meaningful artifact (i.e., a game). The study identifies nine design principles that can help us to achieve higher engagement during the coding activity. Moreover, positive attitudes and high motivation were found to result in the better management of cognitive load. Our contribution lies in the theoretical grounding of the results in constructionism and the emerging design principles. In this way, we provide both theoretical and practical evidence of the value of constructionism-based coding activities.},
  keywords = {Children,Coding,Computational thinking,Constructionism,Design-based research,Engagement},
  file     = {/home/michaelb/Zotero/storage/4ENFX5A7/S0747563219300184.html}
}

@article{pashlerReplicabilityCrisisOverblown2012,
  author     = {Pashler, Harold and Harris, Christine R.},
  title      = {Is the {{Replicability Crisis Overblown}}? {{Three Arguments Examined}}},
  shorttitle = {Is the {{Replicability Crisis Overblown}}?},
  journal    = {Perspectives on Psychological Science},
  year       = {2012},
  volume     = {7},
  number     = {6},
  pages      = {531--536},
  publisher  = {SAGE Publications Inc.},
  issn       = {1745-6916},
  doi        = {10.1177/1745691612463401},
  abstract   = {We discuss three arguments voiced by scientists who view the current outpouring of concern about replicability as overblown. The first idea is that the adoption of a low alpha level (e.g., 5\%) puts reasonable bounds on the rate at which errors can enter the published literature, making false-positive effects rare enough to be considered a minor issue. This, we point out, rests on statistical misunderstanding: The alpha level imposes no limit on the rate at which errors may arise in the literature (Ioannidis, 2005b). Second, some argue that whereas direct replication attempts are uncommon, conceptual replication attempts are common-providing an even better test of the validity of a phenomenon. We contend that performing conceptual rather than direct replication attempts interacts insidiously with publication bias, opening the door to literatures that appear to confirm the reality of phenomena that in fact do not exist. Finally, we discuss the argument that errors will eventually be pruned out of the literature if the field would just show a bit of patience. We contend that there are no plausible concrete scenarios to back up such forecasts and that what is needed is not patience, but rather systematic reforms in scientific practice. {\copyright} The Author(s) 2012.},
  langid     = {english},
  keywords   = {publication bias,replication},
  file       = {/home/michaelb/Zotero/storage/M9AXKX5T/Pashler and Harris - 2012 - Is the Replicability Crisis Overblown Three Arguments Examined.pdf;/home/michaelb/Zotero/storage/6P2J26HP/84869036789.html}
}

@manual{pedersenPatchworkComposerPlots2025,
  author = {Pedersen, Thomas Lin},
  title  = {Patchwork: {{The}} Composer of Plots},
  year   = {2025},
  type   = {Manual}
}

@article{PerceptualGroupingExplains,
  author  = {Kemp, Charles and Hamacher, Duane W. and Little, Daniel R. and Cropper, Simon J.},
  title   = {Perceptual {{Grouping Explains Similarities}} in {{Constellations Across Cultures}}},
  journal = {Psychological Science},
  year    = {2022},
  month   = mar,
  volume  = {33},
  number  = {3},
  pages   = {354--363},
  issn    = {0956-7976},
  doi     = {10.1177/09567976211044157},
  urldate = {2025-07-24},
  file    = {/home/michaelb/Zotero/storage/7AJD77XS/09567976211044157.html}
}

@article{piwowarSharingDetailedResearch2007,
  author    = {Piwowar, Heather and Day, Roger S. and Fridsma, Douglas B.},
  title     = {Sharing {{Detailed Research Data Is Associated}} with {{Increased Citation Rate}}},
  journal   = {PLOS ONE},
  year      = {2007},
  month     = mar,
  volume    = {2},
  number    = {3},
  pages     = {e308},
  publisher = {Public Library of Science},
  issn      = {1932-6203},
  doi       = {10.1371/journal.pone.0000308},
  urldate   = {2025-08-28},
  abstract  = {Background: Sharing research data provides benefit to the general scientific community, but the benefit is less obvious for the investigator who makes his or her data available. Principal Findings: We examined the citation history of 85 cancer microarray clinical trial publications with respect to the availability of their data. The 48\% of trials with publicly available microarray data received 85\% of the aggregate citations. Publicly available data was significantly (p = 0.006) associated with a 69\% increase in citations, independently of journal impact factor, date of publication, and author country of origin using linear regression. Significance: This correlation between publicly available data and increased literature impact may further motivate investigators to share their detailed research data.},
  langid    = {english},
  keywords  = {Bibliometrics,Cancers and neoplasms,Citation analysis,Clinical trials (cancer treatment),Linear regression analysis,Microarrays,Open data,Scientific publishing},
  file      = {/home/michaelb/Zotero/storage/RMJ6V623/Piwowar et al. - 2007 - Sharing Detailed Research Data Is Associated with Increased Citation Rate.pdf}
}

@article{piwowarStateOALargescale2018,
  author     = {Piwowar, Heather and Priem, Jason and Larivi{\`e}re, Vincent and Alperin, Juan Pablo and Matthias, Lisa and Norlander, Bree and Farley, Ashley and West, Jevin and Haustein, Stefanie},
  title      = {The State of {{OA}}: A Large-Scale Analysis of the Prevalence and Impact of {{Open Access}} Articles},
  shorttitle = {The State of {{OA}}},
  journal    = {PeerJ},
  year       = {2018},
  month      = feb,
  volume     = {6},
  pages      = {e4375},
  publisher  = {PeerJ Inc.},
  issn       = {2167-8359},
  doi        = {10.7717/peerj.4375},
  urldate    = {2025-08-28},
  abstract   = {Despite growing interest in Open Access (OA) to scholarly literature, there is an unmet need for large-scale, up-to-date, and reproducible studies assessing the prevalence and characteristics of OA. We address this need using oaDOI, an open online service that determines OA status for 67 million articles. We use three samples, each of 100,000 articles, to investigate OA in three populations: (1) all journal articles assigned a Crossref DOI, (2) recent journal articles indexed in Web of Science, and (3) articles viewed by users of Unpaywall, an open-source browser extension that lets users find OA articles using oaDOI. We estimate that at least 28\% of the scholarly literature is OA (19M in total) and that this proportion is growing, driven particularly by growth in Gold and Hybrid. The most recent year analyzed (2015) also has the highest percentage of OA (45\%). Because of this growth, and the fact that readers disproportionately access newer articles, we find that Unpaywall users encounter OA quite frequently: 47\% of articles they view are OA. Notably, the most common mechanism for OA is not Gold, Green, or Hybrid OA, but rather an under-discussed category we dub Bronze: articles made free-to-read on the publisher website, without an explicit Open license. We also examine the citation impact of OA articles, corroborating the so-called open-access citation advantage: accounting for age and discipline, OA articles receive 18\% more citations than average, an effect driven primarily by Green and Hybrid OA. We encourage further research using the free oaDOI service, as a way to inform OA policy and practice.},
  langid     = {english},
  file       = {/home/michaelb/Zotero/storage/QC7KF7QW/Piwowar et al. - 2018 - The state of OA a large-scale analysis of the prevalence and impact of Open Access articles.pdf}
}

@book{popperLogicScientificDiscovery2005,
  author    = {Popper, Karl},
  title     = {The {{Logic}} of {{Scientific Discovery}}},
  edition   = {2},
  publisher = {Routledge},
  address   = {London},
  year      = {2005},
  month     = nov,
  isbn      = {978-0-203-99462-7},
  doi       = {10.4324/9780203994627},
  abstract  = {Described by the philosopher A.J. Ayer as a work of 'great originality and power', this book revolutionized contemporary thinking on science and knowledge. Ideas such as~the now legendary doctrine of 'falsificationism' electrified the scientific community, influencing even working scientists, as well as post-war philosophy. This astonishing work ranks alongside The Open Society and Its Enemies as one of Popper's most enduring books and contains insights and arguments that demand to be read to this day.},
  file      = {/home/michaelb/Zotero/storage/ETAI2LMN/Popper - 2005 - The Logic of Scientific Discovery.pdf}
}

@article{pridemoreReplicationCriminologySocial2018,
  author    = {Pridemore, William Alex and Makel, Matthew C. and Plucker, Jonathan A.},
  title     = {Replication in {{Criminology}} and the {{Social Sciences}}},
  journal   = {Annual Review of Criminology},
  year      = {2018},
  month     = jan,
  volume    = {1},
  number    = {1},
  pages     = {19--38},
  publisher = {Annual Reviews},
  issn      = {2572-4568},
  doi       = {10.1146/annurev-criminol-032317-091849},
  urldate   = {2024-11-06},
  abstract  = {Replication is a hallmark of science. In recent years, some medical sciences and behavioral sciences struggled with what came to be known as replication crises. As a field, criminology has yet to address formally the threats to our evidence base that might be posed by large-scale and systematic replication attempts, although it is likely we would face challenges similar to those experienced by other disciplines. In this review, we outline the basics of replication, summarize reproducibility problems found in other fields, undertake an original analysis of the amount and nature of replication studies appearing in criminology journals, and consider how criminology can begin to assess more formally the robustness of our knowledge through encouraging a culture of replication.},
  langid    = {english},
  file      = {/home/michaelb/Zotero/storage/AFXT4GLV/Pridemore et al. - 2018 - Replication in Criminology and the Social Sciences.pdf;/home/michaelb/Zotero/storage/NS8BQ7GK/annurev-criminol-032317-091849.html}
}

@article{pudjihartonoReviewFeatureSelection2022,
  author    = {Pudjihartono, Nicholas and Fadason, Tayaza and {Kempa-Liehr}, Andreas W. and O'Sullivan, Justin M.},
  title     = {A {{Review}} of {{Feature Selection Methods}} for {{Machine Learning-Based Disease Risk Prediction}}},
  journal   = {Frontiers in Bioinformatics},
  year      = {2022},
  month     = jun,
  volume    = {2},
  pages     = {927312},
  publisher = {Frontiers},
  issn      = {2673-7647},
  doi       = {10.3389/fbinf.2022.927312},
  urldate   = {2025-07-25},
  abstract  = {Machine learning has shown utility in detecting patterns within large, unstructured, and complex datasets. One of the promising applications of machine learning is in precision medicine, where disease risk is predicted using patient genetic data. However, creating an accurate prediction model based on genotype data remains challenging due to the so-called ``curse of dimensionality'' (i.e., extensively larger number of features compared to the number of samples). Therefore, the generalizability of machine learning models benefits from feature selection, which aims to extract only the most ``informative'' features and remove noisy ``non-informative,'' irrelevant and redundant features. In this article, we provide a general overview of the different feature selection methods, their advantages, disadvantages, and use cases, focusing on the detection of relevant features (i.e., SNPs) for disease risk prediction.},
  langid    = {english},
  keywords  = {Disease risk prediction,Feature selection (FS),Machine Learning,risk prediction,Statistical Approaches},
  file      = {/home/michaelb/Zotero/storage/9RAEWP6A/Pudjihartono et al. - 2022 - A Review of Feature Selection Methods for Machine Learning-Based Disease Risk Prediction.pdf}
}

@misc{QualitativeResearchCriminology2025,
  author       = {{Wikipedia contributors}},
  title        = {Qualitative Research in Criminology},
  year         = {2025},
  month        = may,
  howpublished = {https://en.wikipedia.org/w/index.php?title=Qualitative\_research\_in\_criminology\&oldid=1292101377},
  note         = {Wikipedia, The Free Encyclopedia},
  urldate      = {2025-07-25},
  abstract     = {Qualitative research in criminology consists of research in the criminology field that employs qualitative methods. There are many applications of this research, and they can often intersect with quantitative research in criminology in order to create mixed method studies. This type of research is key to holistic views of criminological theory (theories of crime, or within the field of criminology), as it is much more capable of establishing context than empirical data alone. There are also some academics who consider qualitative research to be the superior method of research in criminology, yet this does not mean that it is more commonly used. In fact, quantitative research is much more frequently published in criminology journals.},
  copyright    = {Creative Commons Attribution-ShareAlike License},
  langid       = {english},
  annotation   = {Page Version ID: 1292101377},
  file         = {/home/michaelb/Zotero/storage/LAT927PV/index.html}
}

@misc{RadialBasisFunction,
  title        = {Radial Basis Function Support Vector Machines ({{SVMs}}) via Kernlab --- Details\_svm\_rbf\_kernlab},
  howpublished = {https://parsnip.tidymodels.org/reference/details\_svm\_rbf\_kernlab.html},
  urldate      = {2025-08-01},
  abstract     = {kernlab::ksvm() fits a support vector machine model. For classification, the model tries to maximize the width of the margin between classes. For regression, the model optimizes a robust loss function that is only affected by very large model residuals.},
  langid       = {english},
  file         = {/home/michaelb/Zotero/storage/VPDPEGBS/details_svm_rbf_kernlab.html}
}

@article{ramakrishnanLayoutawareTextExtraction2012,
  author   = {Ramakrishnan, Cartic and Patnia, Abhishek and Hovy, Eduard and Burns, Gully A. P. C.},
  title    = {Layout-Aware Text Extraction from Full-Text {{PDF}} of Scientific Articles},
  journal  = {Source Code for Biology and Medicine},
  year     = {2012},
  month    = may,
  volume   = {7},
  pages    = {7},
  issn     = {1751-0473},
  doi      = {10.1186/1751-0473-7-7},
  urldate  = {2025-07-31},
  abstract = {Background The Portable Document Format (PDF) is the most commonly used file format for online scientific publications. The absence of effective means to extract text from these PDF files in a layout-aware manner presents a significant challenge for developers of biomedical text mining or biocuration informatics systems that use published literature as an information source. In this paper we introduce the `Layout-Aware PDF Text Extraction' (LA-PDFText) system to facilitate accurate extraction of text from PDF files of research articles for use in text mining applications. Results Our paper describes the construction and performance of an open source system that extracts text blocks from PDF-formatted full-text research articles and classifies them into logical units based on rules that characterize specific sections. The LA-PDFText system focuses only on the textual content of the research articles and is meant as a baseline for further experiments into more advanced extraction methods that handle multi-modal content, such as images and graphs. The system works in a three-stage process: (1) Detecting contiguous text blocks using spatial layout processing to locate and identify blocks of contiguous text, (2) Classifying text blocks into rhetorical categories using a rule-based method and (3) Stitching classified text blocks together in the correct order resulting in the extraction of text from section-wise grouped blocks. We show that our system can identify text blocks and classify them into rhetorical categories with Precision\,=\,0.96\% Recall\,=\,0.89\% and F1\,=\,0.91\%. We also present an evaluation of the accuracy of the block detection algorithm used in step 2. Additionally, we have compared the accuracy of the text extracted by LA-PDFText to the text from the Open Access subset of PubMed Central. We then compared this accuracy with that of the text extracted by the PDF2Text system, commonly used to extract text from PDF.
Finally, we discuss preliminary error analysis for our system and identify further areas of improvement. Conclusions LA-PDFText is an open-source tool for accurately extracting text from full-text scientific articles. The release of the system is available at http://code.google.com/p/lapdftext/.},
  pmcid    = {PMC3441580},
  pmid     = {22640904},
  file     = {/home/michaelb/Zotero/storage/83PI52ZE/Ramakrishnan et al. - 2012 - Layout-aware text extraction from full-text PDF of scientific articles.pdf}
}

@manual{rcoreteamLanguageEnvironmentStatistical2025,
  type         = {Manual},
  author       = {{R Core Team}},
  title        = {{{R}}: A Language and Environment for Statistical Computing},
  organization = {R Foundation for Statistical Computing},
  address      = {Vienna, Austria},
  year         = {2025}
}

@misc{ReadDeclarationBudapest,
  author       = {{Budapest Open Access Initiative}},
  title        = {Read the {{Declaration}} -- {{Budapest Open Access Initiative}}},
  year         = {2002},
  howpublished = {https://www.budapestopenaccessinitiative.org/read/},
  urldate      = {2025-08-28},
  file         = {/home/michaelb/Zotero/storage/AD8FFQ5A/read.html}
}

@misc{ribasMachineLearningOpenAI2025,
  author       = {Ribas, Ricardo},
  title        = {Machine {{Learning}} + {{OpenAI}}: {{Solving}} a {{Text Classification Problem}}},
  shorttitle   = {Machine {{Learning}} + {{OpenAI}}},
  howpublished = {TDS Archive},
  year         = {2025},
  month        = jan,
  urldate      = {2025-04-02},
  abstract     = {How I migrated an old solution to a more elegant, robust and scalable solution using text classification from openAI},
  langid       = {english},
  file         = {/home/michaelb/Zotero/storage/P3BI6WC9/machine-learning-how-i-solved-a-text-classification-problem-cddae9c025e7.html}
}

@misc{rinkerTrinkerTextstem2025,
  author       = {Rinker, Tyler},
  title        = {{{trinker/textstem}}},
  year         = {2025},
  month        = may,
  howpublished = {https://github.com/trinker/textstem},
  urldate      = {2025-08-01},
  abstract     = {Tools for fast text stemming \& lemmatization},
  keywords     = {lemmatization,r,stemming,text-mining}
}

@article{RJ-2021-048,
  author  = {Bengtsson, Henrik},
  title   = {A Unifying Framework for Parallel and Distributed Processing in {{R}} Using Futures},
  journal = {The R Journal},
  year    = {2021},
  volume  = {13},
  number  = {2},
  pages   = {208--227},
  doi     = {10.32614/RJ-2021-048}
}

@article{robertsCrossvalidationStrategiesData2017,
  author    = {Roberts, David R. and Bahn, Volker and Ciuti, Simone and Boyce, Mark S. and Elith, Jane and {Guillera-Arroita}, Gurutzeta and Hauenstein, Severin and {Lahoz-Monfort}, Jos{\'e} J. and Schr{\"o}der, Boris and Thuiller, Wilfried and Warton, David I. and Wintle, Brendan A. and Hartig, Florian and Dormann, Carsten F.},
  title     = {Cross-Validation Strategies for Data with Temporal, Spatial, Hierarchical, or Phylogenetic Structure},
  journal   = {Ecography},
  year      = {2017},
  volume    = {40},
  number    = {8},
  pages     = {913--929},
  issn      = {1600-0587},
  doi       = {10.1111/ecog.02881},
  urldate   = {2025-07-16},
  abstract  = {Ecological data often show temporal, spatial, hierarchical (random effects), or phylogenetic structure. Modern statistical approaches are increasingly accounting for such dependencies. However, when performing cross-validation, these structures are regularly ignored, resulting in serious underestimation of predictive error. One cause for the poor performance of uncorrected (random) cross-validation, noted often by modellers, are dependence structures in the data that persist as dependence structures in model residuals, violating the assumption of independence. Even more concerning, because often overlooked, is that structured data also provides ample opportunity for overfitting with non-causal predictors. This problem can persist even if remedies such as autoregressive models, generalized least squares, or mixed models are used. Block cross-validation, where data are split strategically rather than randomly, can address these issues. However, the blocking strategy must be carefully considered. Blocking in space, time, random effects or phylogenetic distance, while accounting for dependencies in the data, may also unwittingly induce extrapolations by restricting the ranges or combinations of predictor variables available for model training, thus overestimating interpolation errors. On the other hand, deliberate blocking in predictor space may also improve error estimates when extrapolation is the modelling goal. Here, we review the ecological literature on non-random and blocked cross-validation approaches. We also provide a series of simulations and case studies, in which we show that, for all instances tested, block cross-validation is nearly universally more appropriate than random cross-validation if the goal is predicting to new data or predictor space, or for selecting causal predictors. 
We recommend that block cross-validation be used wherever dependence structures exist in a dataset, even if no correlation structure is visible in the fitted model residuals, or if the fitted models account for such correlations.},
  copyright = {{\copyright} 2016 The Authors},
  langid    = {english},
  file      = {/home/michaelb/Zotero/storage/KV625GRJ/Roberts et al. - 2017 - Cross-validation strategies for data with temporal, spatial, hierarchical, or phylogenetic structure.pdf}
}

@article{rosenthalFileDrawerProblem1979,
  author    = {Rosenthal, Robert},
  title     = {The File Drawer Problem and Tolerance for Null Results},
  journal   = {Psychological Bulletin},
  year      = {1979},
  volume    = {86},
  number    = {3},
  pages     = {638--641},
  publisher = {American Psychological Association},
  address   = {US},
  issn      = {1939-1455},
  doi       = {10.1037/0033-2909.86.3.638},
  abstract  = {For any given research area, one cannot tell how many studies have been conducted but never reported. The extreme view of the "file drawer problem" is that journals are filled with the 5\% of the studies that show Type I errors, while the file drawers are filled with the 95\% of the studies that show nonsignificant results. Quantitative procedures for computing the tolerance for filed and future null results are reported and illustrated, and the implications are discussed. (15 ref) (PsycINFO Database Record (c) 2016 APA, all rights reserved)},
  keywords  = {Experimentation,Scientific Communication,Statistical Probability,Statistical Tests,Type I Errors},
  file      = {/home/michaelb/Zotero/storage/3Z39JCGR/doiLanding.html}
}

@article{rowbottomKuhnVsPopper2011,
  author     = {Rowbottom, Darrell P.},
  title      = {Kuhn vs. {{Popper}} on Criticism and Dogmatism in Science: A Resolution at the Group Level},
  shorttitle = {Kuhn vs. {{Popper}} on Criticism and Dogmatism in Science},
  journal    = {Studies in History and Philosophy of Science Part A},
  year       = {2011},
  month      = mar,
  volume     = {42},
  number     = {1},
  pages      = {117--124},
  issn       = {0039-3681},
  doi        = {10.1016/j.shpsa.2010.11.031},
  urldate    = {2024-12-13},
  abstract   = {Popper repeatedly emphasised the significance of a critical attitude, and a related critical method, for scientists. Kuhn, however, thought that unquestioning adherence to the theories of the day is proper; at least for `normal scientists'. In short, the former thought that dominant theories should be attacked, whereas the latter thought that they should be developed and defended (for the vast majority of the time). Both seem to have missed a trick, however, due to their apparent insistence that each individual scientist should fulfil similar functions (at any given point in time). The trick is to consider science at the group level; and doing so shows how puzzle solving and `offensive' critical activity can simultaneously have a legitimate place in science. This analysis shifts the focus of the debate. The crucial question becomes `How should the balance between functions be struck?'},
  file       = {/home/michaelb/Zotero/storage/KFYID48X/Rowbottom - 2011 - Kuhn vs. Popper on criticism and dogmatism in science a resolution at the group level.pdf;/home/michaelb/Zotero/storage/PUFCMHFR/S003936811000110X.html}
}

@inproceedings{sanguansatFeatureMatricizationDocument2012,
  author    = {Sanguansat, Parinya},
  title     = {Feature Matricization for Document Classification},
  booktitle = {2012 {{IEEE International Conference}} on {{Signal Processing}}, {{Communication}} and {{Computing}} ({{ICSPCC}} 2012)},
  year      = {2012},
  month     = aug,
  pages     = {745--749},
  doi       = {10.1109/ICSPCC.2012.6335622},
  urldate   = {2024-12-16},
  abstract  = {Generally, the dimension of feature vector in text classification depends on the number of words in the specific domain. Many documents of considered categories make it numerous. Therefore, the dimension of feature vector is very high that makes it consumes a lot of time and memory to process. Moreover, it is a cause of the small sample size problem when the number of available training documents is far smaller than the dimension of these feature vectors. This paper proposes the alternative technique of dimensionality reduction for the feature vector in two-dimensional manner by previously transforming the feature vector to the feature matrix and then using Two-Dimensional Principal Component Analysis (2DPCA) for reducing the dimension of this feature matrix. Based on 2DPCA, the original weighted term matrix is not necessary to store in the memory anymore because the scatter matrix of 2DPCA can be computed incrementally. The small reduction in matrix form impacts to the plenty of dimensionality reduction in vector form. From the experimental results on well-known dataset, the proposed method not only significantly reduce the dimensionality but also achieve the higher accuracy rate than the original feature space.},
  keywords  = {Accuracy,Covariance matrix,Document classification,Feature extraction,Machine learning,Matricization,Principal component analysis,Support vector machines,Vectors},
  file      = {/home/michaelb/Zotero/storage/E4GPLWP6/Sanguansat - 2012 - Feature matricization for document classification.pdf;/home/michaelb/Zotero/storage/MY7DF92Q/6335622.html}
}

@article{sarabeevSampleSizeMatters2025,
  author     = {Sarabeev, Volodimir and Shvydka, Svitlana and Lisitsyna, Olga and Oros, Mikul{\'a}{\v s} and Miterp{\'a}kov{\'a}, Martina and {\v Z}d{\'i}malov{\'a}, M{\'a}ria},
  title      = {The Sample Size Matters: Evaluating Minimum and Reasonable Values in Prevalence Studies},
  shorttitle = {The Sample Size Matters},
  journal    = {International Journal for Parasitology},
  year       = {2025},
  month      = may,
  issn       = {0020-7519},
  doi        = {10.1016/j.ijpara.2025.05.003},
  urldate    = {2025-08-03},
  abstract   = {Estimating sample size is important for prevalence studies, as it directly influences the validity of the research outcomes. Our objective was to highlight constraints in the prevalence assessment and to provide guidance on the delineation of minimum and reasonable sample size. We also assess the prevalence properties as a function of sample size visualizing the median prevalence, confidence intervals, precision, and changes in precision. Constraint analysis indicates that a sample size of less than 15 host individuals will likely result in unacceptable precision in the most cases. Because the prevalence estimate accuracy depends on both sample size and the estimate itself, the minimum sample size may vary widely, from 16 to over 450 individuals, when the prevalence is between 1\% and 99\%. A sample size of 16--45 elements can be used as a minimum for estimating true prevalence between 10\% and 90\% with an acceptable precision. However, caution should be exercised with a such small sample size as the prevalence will have a high degree of uncertainty. A simple, practical suggestion for selecting a minimum sample size is to sample until at least 5 infected (cases) and 5 uninfected (non-cases) hosts are detected. This approach is effective in most situations, except in cases of extreme prevalence (1\% or 99\%). The design of a reasonable sample size should be based on a flexible strategy that takes into account the study objectives, available resources and desired precision. This strategy may based on finding the plateau phase within the precision or confidence intervals curves. As the uncertainty in prevalence decreases rapidly with increasing sample up to 110--135 individuals, but not much more with further increasing sample efforts, opting for a sample size exceeding this threshold, could be considered an optional choice within the prevalence range of 5--95\%. 
We advise authors, editors and reviewers to track sample size in conjunction with the actual prevalence of the parasites and other pathogens. If the minimum sample size is unattainable, authors should acknowledge this limitation, as all data contribute to understanding parasite distribution.},
  keywords   = {Bag of Little Bootstraps,Bootstrap median,Confidence interval,Non-parametric bootstrap,Precision,Prevalence,Sample size determination},
  file       = {/home/michaelb/Zotero/storage/XW7VXHJQ/S0020751925000943.html}
}

@article{sarafoglouSurveyHowPreregistration2022,
  author     = {Sarafoglou, Alexandra and Kovacs, Marton and Bakos, Bence and Wagenmakers, Eric-Jan and Aczel, Balazs},
  title      = {A Survey on How Preregistration Affects the Research Workflow: Better Science but More Work},
  shorttitle = {A Survey on How Preregistration Affects the Research Workflow},
  journal    = {Royal Society Open Science},
  year       = {2022},
  month      = jul,
  volume     = {9},
  number     = {7},
  pages      = {211997},
  publisher  = {Royal Society},
  doi        = {10.1098/rsos.211997},
  urldate    = {2024-11-06},
  abstract   = {The preregistration of research protocols and analysis plans is a main reform innovation to counteract confirmation bias in the social and behavioural sciences. While theoretical reasons to preregister are frequently discussed in the literature, the individually experienced advantages and disadvantages of this method remain largely unexplored. The goal of this exploratory study was to identify the perceived benefits and challenges of preregistration from the researcher's perspective. To this end, we surveyed 355 researchers, 299 of whom had used preregistration in their own work. The researchers indicated the experienced or expected effects of preregistration on their workflow. The results show that experiences and expectations are mostly positive. Researchers in our sample believe that implementing preregistration improves or is likely to improve the quality of their projects. Criticism of preregistration is primarily related to the increase in work-related stress and the overall duration of the project. While the benefits outweighed the challenges for the majority of researchers with preregistration experience, this was not the case for the majority of researchers without preregistration experience. The experienced advantages and disadvantages identified in our survey could inform future efforts to improve preregistration and thus help the methodology gain greater acceptance in the scientific community.},
  keywords   = {meta-science,open science,replication crisis},
  file       = {/home/michaelb/Zotero/storage/4V4JQAPT/Sarafoglou et al. - 2022 - A survey on how preregistration affects the research workflow better science but more work.pdf}
}

@article{savolainenReplicationResearchIntegrity2018,
  title = {Replication and {{Research Integrity}} in {{Criminology}}: {{Introduction}} to the {{Special Issue}}},
  shorttitle = {Replication and {{Research Integrity}} in {{Criminology}}},
  author = {Savolainen, Jukka and VanEseltine, Matthew},
  year = {2018},
  month = aug,
  journal = {Journal of Contemporary Criminal Justice},
  volume = {34},
  number = {3},
  pages = {236--244},
  publisher = {SAGE Publications Inc},
  issn = {1043-9862},
  doi = {10.1177/1043986218777288},
  urldate = {2024-11-06},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/JEEQLPSS/Savolainen and VanEseltine - 2018 - Replication and Research Integrity in Criminology Introduction to the Special Issue.pdf}
}

@article{schererMetaanalysisTeachingLearning2020,
  title = {A Meta-Analysis of Teaching and Learning Computer Programming: {{Effective}} Instructional Approaches and Conditions},
  shorttitle = {A Meta-Analysis of Teaching and Learning Computer Programming},
  author = {Scherer, Ronny and Siddiq, Fazilat and S{\'a}nchez Viveros, B{\'a}rbara},
  year = {2020},
  month = aug,
  journal = {Computers in Human Behavior},
  volume = {109},
  pages = {106349},
  issn = {0747-5632},
  doi = {10.1016/j.chb.2020.106349},
  urldate = {2025-08-07},
  abstract = {This meta-analysis maps the evidence on the effectiveness of instructional approaches and conditions for learning computer programming under three study conditions: (a) Studies focusing on the effectiveness of programming interventions per se, (b) studies focusing on the effectiveness of visualization and physicality, and (c) studies focusing on the effectiveness of dominant instructional approaches. Utilizing the data from 139 interventions and 375 effect sizes, we found (a) a strong effect of learning computer programming per se (Hedges' {$\bar{g}$}~=~0.81, 95\% CI [0.42, 1.21]), (b) moderate to large effect sizes of visualization ({$\bar{g}$}~=~0.44, 95\% CI [0.29, 0.58]) and physicality interventions ({$\bar{g}$}~=~0.72, 95\% CI [0.23, 1.21]), and (c) moderate to large effect sizes for studies focusing on dominant instructional approaches ({$\bar{g}$}s~=~0.49--1.02). Moderator analyses indicated that the effect sizes differed only marginally between the instructional approaches and conditions---however, collaboration in metacognition instruction, problem solving instruction outside of regular lessons, short-term interventions focusing on physicality, and interventions focusing on visualization through Scratch were especially effective. Our meta-analysis synthesizes the existing research evidence on the effectiveness of computer programming instruction and, ultimately, provides references with which the effects of future studies could be compared.},
  keywords = {Computational thinking,Computer programming,Intervention studies,Multilevel meta-analysis,Scratch programming},
  file = {/home/michaelb/Zotero/storage/6Y4GDH6S/S0747563220301023.html}
}

@article{schonlauRandomForestAlgorithm2020,
  title = {The Random Forest Algorithm for Statistical Learning},
  author = {Schonlau, Matthias and Zou, Rosie Yuyan},
  year = {2020},
  month = mar,
  journal = {The Stata Journal},
  volume = {20},
  number = {1},
  pages = {3--29},
  publisher = {SAGE Publications},
  issn = {1536-867X},
  doi = {10.1177/1536867X20909688},
  urldate = {2025-08-01},
  abstract = {Random forests (Breiman, 2001, Machine Learning 45: 5--32) is a statistical- or machine-learning algorithm for prediction. In this article, we introduce a corresponding new command, rforest. We overview the random forest algorithm and illustrate its use with two examples: The first example is a classification problem that predicts whether a credit card holder will default on his or her debt. The second example is a regression problem that predicts the logscaled number of shares of online news articles. We conclude with a discussion that summarizes key points demonstrated in the examples.},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/KAIMX8Z8/Schonlau and Zou - 2020 - The random forest algorithm for statistical learning.pdf}
}

@article{schummResearchAnomaliesCriminology2025,
  title = {Research Anomalies in Criminology: {{How}} Serious? {{How}} Extensive over Time? {{And}} Who Was Responsible?},
  shorttitle = {Research Anomalies in Criminology},
  author = {Schumm, Walter R. and Crawford, Duane W. and Lockett, Lorenza and AlRashed, Abdullah and Bin Ateeq, Asma},
  year = {2025},
  month = jan,
  journal = {Accountability in Research},
  volume = {32},
  number = {1},
  pages = {22--58},
  publisher = {Taylor \& Francis},
  issn = {0898-9621},
  doi = {10.1080/08989621.2023.2241127},
  urldate = {2025-08-26},
  abstract = {A variety of ways to detect questionable research practices in small sample social science surveys have been discussed by a variety of authors. However, some of those approaches (e.g., GRIM test, SPRITE test) do not work well for results obtained from larger samples. Here several approaches for detecting anomalies in larger samples are presented and illustrated by an analysis of 78 journal articles in the area of criminology, 59 by Dr. Eric Stewart, published since 1998 with similar methods and/or authors. Of all 59 articles, 28 (47.5\%, p {$<$} .001, d = 0.94) had two or more major anomalies compared to none of the 19 control group articles. It was also found that the larger the role of Dr. Stewart in article authorship, the greater the number of anomalies detected (p {$<$} .001, d = 1.01) while for his coauthors, there were few significant relationships between their roles and total anomalies. Our results demonstrate that extensive problematic results can remain undetected for decades despite several levels of peer review and other scientific controls; however, use of two types of control groups and the use of statistical methods for measuring and evaluating anomalies can improve detection.},
  pmid = {37498056},
  keywords = {anomalies,Fraud,Professor Eric Stewart,research misconduct,retractions}
}

@article{scogginsMeasuringTransparencySocial2024,
  title = {Measuring Transparency in the Social Sciences: Political Science and International Relations},
  shorttitle = {Measuring Transparency in the Social Sciences},
  author = {Scoggins, Bermond and Robertson, Matthew P.},
  year = {2024},
  month = jul,
  journal = {Royal Society Open Science},
  volume = {11},
  number = {7},
  pages = {240313},
  publisher = {Royal Society},
  doi = {10.1098/rsos.240313},
  urldate = {2024-11-16},
  abstract = {The scientific method is predicated on transparency---yet the pace at which transparent research practices are being adopted by the scientific community is slow. The replication crisis in psychology showed that published findings employing statistical inference are threatened by undetected errors, data manipulation and data falsification. To mitigate these problems and bolster research credibility, open data and preregistration practices have gained traction in the natural and social sciences. However, the extent of their adoption in different disciplines is unknown. We introduce computational procedures to identify the transparency of a research field using large-scale text analysis and machine learning classifiers. Using political science and international relations as an illustrative case, we examine 93 931 articles across the top 160 political science and international relations journals between 2010 and 2021. We find that approximately 21\% of all statistical inference papers have open data and 5\% of all experiments are preregistered. Despite this shortfall, the example of leading journals in the field shows that change is feasible and can be effected quickly.},
  keywords = {data sharing,journal policy,open science,preregistration},
  file = {/home/michaelb/Zotero/storage/LZVK24S3/Scoggins and Robertson - 2024 - Measuring transparency in the social sciences political science and international relations.pdf}
}

@techreport{settlesActiveLearningLiterature2009,
  type = {Computer Sciences Technical Report},
  title = {Active {{Learning Literature Survey}}},
  author = {Settles, Burr},
  year = {2009},
  number = {1648},
  institution = {University of Wisconsin--Madison},
  urldate = {2025-03-28},
  abstract = {The key idea behind active learning is that a machine learning algorithm can achieve greater accuracy with fewer labeled training instances if it is allowed to choose the data from which it learns. An active learner may ask queries in the form of unlabeled instances to be labeled by an oracle (e.g., a human annotator). Active learning is well-motivated in many modern machine learning problems, where unlabeled data may be abundant but labels are difficult, time-consuming, or expensive to obtain. This report provides a general introduction to active learning and a survey of the literature. This includes a discussion of the scenarios in which queries can be formulated, and an overview of the query strategy frameworks proposed in the literature to date. An analysis of the empirical and theoretical evidence for active learning, a summary of several problem setting variants, and a discussion of related topics in machine learning research are also presented.},
  file = {/home/michaelb/Zotero/storage/MM9LG7EU/Settles - 2009 - Active Learning Literature Survey.pdf}
}

@article{shermanSpecificDeterrentEffects1984,
  title = {The {{Specific Deterrent Effects}} of {{Arrest}} for {{Domestic Assault}}},
  author = {Sherman, Lawrence W. and Berk, Richard A.},
  year = {1984},
  month = apr,
  journal = {American Sociological Review},
  volume = {49},
  number = {2},
  eprint = {2095575},
  eprinttype = {jstor},
  pages = {261--272},
  issn = {0003-1224},
  doi = {10.2307/2095575},
  urldate = {2025-08-29}
}

@inproceedings{shilaneAutomatedFeatureReduction2022,
  title = {Automated {{Feature Reduction}} in {{Machine Learning}}},
  booktitle = {2022 {{IEEE}} 12th {{Annual Computing}} and {{Communication Workshop}} and {{Conference}} ({{CCWC}})},
  author = {Shilane, David},
  year = {2022},
  month = jan,
  pages = {45--49},
  publisher = {IEEE},
  doi = {10.1109/CCWC54503.2022.9720821},
  urldate = {2025-07-25},
  abstract = {Supervised machine learning models are developed by selecting features that can be related to a dependent variable. This selection may involve a mix of automated procedures and manual exploration. Large data sets often include many features that are not suitable for the intended model. This paper proposes a framework for automated feature reduction (AFR) to identify features that can be removed from consideration for the model. It applies exclusion criteria based on the form of the data to determine whether each feature would pose practical issues for the design of the model. AFR is proposed as a preprocessing step prior to the use of automated or manual feature selection techniques. By reducing the set of possible features, this exploration can search a less complex space of potentially relevant features. AFR can be widely applied to many machine learning models and demonstrates clear benefits in improving the efficiency of feature selection algorithms. This paper provides details on the AFR algorithm along with examples of feature reduction in model development.},
  keywords = {big data,Computational modeling,Conferences,Data models,Feature extraction,feature reduction,feature selection,machine learning,Machine learning,Machine learning algorithms,Manuals},
  file = {/home/michaelb/Zotero/storage/96WH9T8H/9720821.html}
}

@article{shroutPsychologyScienceKnowledge2018,
  title = {Psychology, {{Science}}, and {{Knowledge Construction}}: {{Broadening Perspectives}} from the {{Replication Crisis}}},
  shorttitle = {Psychology, {{Science}}, and {{Knowledge Construction}}},
  author = {Shrout, Patrick E. and Rodgers, Joseph L.},
  year = {2018},
  journal = {Annual Review of Psychology},
  volume = {69},
  pages = {487--510},
  publisher = {Annual Reviews Inc.},
  issn = {0066-4308},
  doi = {10.1146/annurev-psych-122216-011845},
  abstract = {Psychology advances knowledge by testing statistical hypotheses using empirical observations and data. The expectation is that most statistically significant findings can be replicated in new data and in new laboratories, but in practice many findings have replicated less often than expected, leading to claims of a replication crisis. We review recent methodological literature on questionable research practices, meta-analysis, and power analysis to explain the apparently high rates of failure to replicate. Psychologists can improve research practices to advance knowledge in ways that improve replicability. We recommend that researchers adopt open science conventions of preregistration and full disclosure and that replication efforts be based on multiple studies rather than on a single replication attempt. We call for more sophisticated power analyses, careful consideration of the various influences on effect sizes, and more complete disclosure of nonsignificant as well as statistically significant findings. Copyright {\copyright}2018 by Annual Reviews. All rights reserved.},
  langid = {english},
  keywords = {Methodology,Replication,Statistics},
  file = {/home/michaelb/Zotero/storage/5VIKSUMJ/Shrout and Rodgers - 2018 - Psychology, Science, and Knowledge Construction Broadening Perspectives from the Replication Crisis.pdf;/home/michaelb/Zotero/storage/NM987F67/85040343391.html}
}

@article{siinoTextPreprocessingStill2024,
  title = {Is Text Preprocessing Still Worth the Time? {{A}} Comparative Survey on the Influence of Popular Preprocessing Methods on {{Transformers}} and Traditional Classifiers},
  shorttitle = {Is Text Preprocessing Still Worth the Time?},
  author = {Siino, Marco and Tinnirello, Ilenia and La Cascia, Marco},
  year = {2024},
  month = mar,
  journal = {Information Systems},
  volume = {121},
  pages = {102342},
  issn = {0306-4379},
  doi = {10.1016/j.is.2023.102342},
  urldate = {2025-08-01},
  abstract = {With the advent of the modern pre-trained Transformers, the text preprocessing has started to be neglected and not specifically addressed in recent NLP literature. However, both from a linguistic and from a computer science point of view, we believe that even when using modern Transformers, text preprocessing can significantly impact on the performance of a classification model. We want to investigate and compare, through this study, how preprocessing impacts on the Text Classification (TC) performance of modern and traditional classification models. We report and discuss the preprocessing techniques found in the literature and their most recent variants or applications to address TC tasks in different domains. In order to assess how much the preprocessing affects classification performance, we apply the three top referenced preprocessing techniques (alone or in combination) to four publicly available datasets from different domains. Then, nine machine learning models -- including modern Transformers -- get the preprocessed text as input. The results presented show that an educated choice on the text preprocessing strategy to employ should be based on the task as well as on the model considered. Outcomes in this survey show that choosing the best preprocessing technique -- in place of the worst -- can significantly improve accuracy on the classification (up to 25\%, as in the case of an XLNet on the IMDB dataset). In some cases, by means of a suitable preprocessing strategy, even a simple Na{\"i}ve Bayes classifier proved to outperform (i.e., by 2\% in accuracy) the best performing Transformer. We found that Transformers and traditional models exhibit a higher impact of the preprocessing on the TC performance. 
Our main findings are: (1) also on modern pre-trained language models, preprocessing can affect performance, depending on the datasets and on the preprocessing technique or combination of techniques used, (2) in some cases, using a proper preprocessing strategy, simple models can outperform Transformers on TC tasks, (3) similar classes of models exhibit similar level of sensitivity to text preprocessing.},
  keywords = {Bayes,Convolutional neural networks,Deep learning,Fake news,LSTM,Natural Language Processing,SVM,Text preprocessing,Transformers},
  file = {/home/michaelb/Zotero/storage/4FHRILU3/S0306437923001783.html}
}

@incollection{silge8FeatureEngineering,
  title = {Feature {{Engineering}} with {{recipes}}},
  booktitle = {Tidy {{Modeling}} with {{R}}},
  author = {Kuhn, Max and Silge, Julia},
  year = {2022},
  chapter = {8},
  publisher = {O'Reilly Media},
  url = {https://www.tmwr.org/recipes.html},
  urldate = {2025-07-23},
  abstract = {The tidymodels framework is a collection of R packages for modeling and machine learning using tidyverse principles. This book provides a thorough introduction to how to use tidymodels, and an outline of good methodology and statistical practice for phases of the modeling process.},
  file = {/home/michaelb/Zotero/storage/ZWQLUXN2/recipes.html}
}

@article{silgeTidytextTextMining2016,
  title = {{{tidytext}}: {{Text}} Mining and Analysis Using Tidy Data Principles in {{R}}},
  author = {Silge, Julia and Robinson, David},
  year = {2016},
  journal = {Journal of Open Source Software},
  volume = {1},
  number = {3},
  pages = {37},
  publisher = {The Open Journal},
  issn = {2475-9066},
  doi = {10.21105/joss.00037}
}

@article{simmonsFalsePositivePsychologyUndisclosed2011,
  title = {False-{{Positive Psychology}}: {{Undisclosed Flexibility}} in {{Data Collection}} and {{Analysis Allows Presenting Anything}} as {{Significant}}},
  shorttitle = {False-{{Positive Psychology}}},
  author = {Simmons, Joseph P. and Nelson, Leif D. and Simonsohn, Uri},
  year = {2011},
  month = nov,
  journal = {Psychological Science},
  volume = {22},
  number = {11},
  pages = {1359--1366},
  issn = {0956-7976, 1467-9280},
  doi = {10.1177/0956797611417632},
  urldate = {2025-09-07},
  abstract = {In this article, we accomplish two things. First, we show that despite empirical psychologists' nominal endorsement of a low rate of false-positive findings ({$\leq$} .05), flexibility in data collection, analysis, and reporting dramatically increases actual false-positive rates. In many cases, a researcher is more likely to falsely find evidence that an effect exists than to correctly find evidence that it does not. We present computer simulations and a pair of actual experiments that demonstrate how unacceptably easy it is to accumulate (and report) statistically significant evidence for a false hypothesis. Second, we suggest a simple, low-cost, and straightforwardly effective disclosure-based solution to this problem. The solution involves six concrete requirements for authors and four guidelines for reviewers, all of which impose a minimal burden on the publication process.},
  langid = {english}
}

@article{singhExploringPublicationMetadata2025,
  title = {Exploring the {{Publication Metadata Fields}} in {{Web}} of {{Science}}, {{Scopus}} and {{Dimensions}}: {{Possibilities}} and {{Ease}} of Doing {{Scientometric Analysis}}},
  shorttitle = {Exploring the {{Publication Metadata Fields}} in {{Web}} of {{Science}}, {{Scopus}} and {{Dimensions}}},
  author = {Singh, Prashasti and Singh, Vivek Kumar and Kanaujia, Anurag},
  year = {2025},
  month = may,
  journal = {Journal of Scientometric Research},
  volume = {13},
  number = {3},
  pages = {715--731},
  doi = {10.5530/jscires.20041144},
  urldate = {2025-07-14},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/EM5MHQLV/Singh et al. - 2025 - Exploring the Publication Metadata Fields in Web of Science, Scopus and Dimensions Possibilities an.pdf}
}

@manual{sjobergCardxExtraAnalysis2025,
  type = {Manual},
  title = {{{cardx}}: {{Extra}} Analysis Results Data Utilities},
  author = {Sjoberg, Daniel D. and Yogasekaram, Abinaya and {de la Rua}, Emily},
  year = {2025},
  note = {R package},
  url = {https://github.com/insightsengineering/cardx}
}

@article{slibarImportanceOpenData2021,
  title = {Importance of the {{Open Data Assessment}}: {{An Insight Into}} the ({{Meta}}) {{Data Quality Dimensions}}},
  shorttitle = {Importance of the {{Open Data Assessment}}},
  author = {{\v S}libar, Barbara and Ore{\v s}ki, Dijana and Begi{\v c}evi{\'c} Re{\dj}ep, Nina},
  year = {2021},
  month = apr,
  journal = {SAGE Open},
  volume = {11},
  number = {2},
  pages = {21582440211023178},
  publisher = {SAGE Publications},
  issn = {2158-2440},
  doi = {10.1177/21582440211023178},
  urldate = {2025-07-16},
  abstract = {Data are the most important resource of the 21st century. The open data (OD) movement provides publicly available data for the development of a knowledge-based society. As such, the concept of OD is a valuable information technology (IT) tool for economic, social, and human development, which adds value. To further develop these processes on a global scale, users need to manage the quality of OD in their practices. Otherwise, what is the point of using data just for the sake of using it (in science or practice) without thinking about data compliance with norms, standards, and so forth? This article aims to provide an overview of (meta)data quality dimensions, sub-dimensions, and metrics used within OD assessment-related research papers. To achieve this, the authors performed a systematic literature review (SLR) and extracted data from 86 relevant studies dealing with the evaluation of OD. The article endows the progress made so far in OD assessment research. Findings of reviewing the assessment of the OD in the light of existing (meta)data quality dimensions unveil the potential of metadata. Furthermore, the analysis disclosed the need for greater use of quantitative methods in research, and metadata can greatly assist in this.},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/WLWTHMG6/Šlibar et al. - 2021 - Importance of the Open Data Assessment An Insight Into the (Meta) Data Quality Dimensions.pdf}
}

@article{smaldinoOpenScienceModified2019,
  title = {Open Science and Modified Funding Lotteries Can Impede the Natural Selection of Bad Science},
  author = {Smaldino, Paul E. and Turner, Matthew A. and Contreras Kallens, Pablo A.},
  year = {2019},
  month = jul,
  journal = {Royal Society Open Science},
  volume = {6},
  number = {7},
  pages = {190194},
  publisher = {Royal Society},
  doi = {10.1098/rsos.190194},
  urldate = {2024-12-13},
  abstract = {Assessing scientists using exploitable metrics can lead to the degradation of research methods even without any strategic behaviour on the part of individuals, via `the natural selection of bad science.' Institutional incentives to maximize metrics like publication quantity and impact drive this dynamic. Removing these incentives is necessary, but institutional change is slow. However, recent developments suggest possible solutions with more rapid onsets. These include what we call open science improvements, which can reduce publication bias and improve the efficacy of peer review. In addition, there have been increasing calls for funders to move away from prestige- or innovation-based approaches in favour of lotteries. We investigated whether such changes are likely to improve the reproducibility of science even in the presence of persistent incentives for publication quantity through computational modelling. We found that modified lotteries, which allocate funding randomly among proposals that pass a threshold for methodological rigour, effectively reduce the rate of false discoveries, particularly when paired with open science improvements that increase the publication of negative results and improve the quality of peer review. In the absence of funding that targets rigour, open science improvements can still reduce false discoveries in the published literature but are less likely to improve the overall culture of research practices that underlie those publications.},
  keywords = {cultural evolution,funding,open science,replication,reproducibility},
  file = {/home/michaelb/Zotero/storage/RUTXYEJ7/Smaldino et al. - 2019 - Open science and modified funding lotteries can impede the natural selection of bad science.pdf}
}

@article{smithBridgingEmpiricalGap2013,
  title = {Bridging the {{Empirical Gap}}: {{New Insights}} into the {{Experience}} of {{Multiple Legal Problems}} and {{Advice Seeking}}},
  shorttitle = {Bridging the {{Empirical Gap}}},
  author = {Smith, Marisol and Buck, Alexy and Sidaway, Judith and Scanlan, Lesley},
  year = {2013},
  journal = {Journal of Empirical Legal Studies},
  volume = {10},
  number = {1},
  pages = {146--170},
  issn = {1740-1461},
  doi = {10.1111/jels.12006},
  urldate = {2025-07-25},
  abstract = {There is a substantial body of quantitative evidence that documents the incidence of legal problem clusters, the tendency of problems to occur together. It has also been shown that some people are at greater risk of multiple problem experience than others, in particular, disadvantaged groups. Various policy initiatives, most recently in England and Wales, have been implemented to address the links between civil legal problems. However, to date there has been little empirical research on how clients present with clusters and the success of legal advisors in detecting multiple problems, including the barriers and facilitators that might be relevant. This article presents findings from an extensive empirical study on Community Legal Advice Centres, which were introduced in England and Wales to deliver integrated advice provision. The data are drawn from a triangulated qualitative study comprising advice session observations, and first and follow-up interviews with clients and advisors. The data confirm the existence of problem clusters, but provide a new dimension to research on problem clusters by demonstrating in detail how and why multiple problems are difficult to detect. This systematic insight offers important lessons for policy and service developments that target vulnerable groups with multiple problems.},
  copyright = {{\copyright} 2013, Copyright the Authors. Journal compilation {\copyright} 2013, Cornell Law School and Wiley Periodicals, Inc},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/BPIIDM5M/Smith et al. - 2013 - Bridging the Empirical Gap New Insights into the Experience of Multiple Legal Problems and Advice S.pdf;/home/michaelb/Zotero/storage/LR325653/jels.html}
}

@misc{snoekPracticalBayesianOptimization2012,
  title = {Practical {{Bayesian Optimization}} of {{Machine Learning Algorithms}}},
  author = {Snoek, Jasper and Larochelle, Hugo and Adams, Ryan P.},
  year = {2012},
  month = aug,
  number = {arXiv:1206.2944},
  eprint = {1206.2944},
  primaryclass = {stat},
  publisher = {arXiv},
  doi = {10.48550/arXiv.1206.2944},
  urldate = {2025-07-26},
  abstract = {Machine learning algorithms frequently require careful tuning of model hyperparameters, regularization terms, and optimization parameters. Unfortunately, this tuning is often a "black art" that requires expert experience, unwritten rules of thumb, or sometimes brute-force search. Much more appealing is the idea of developing automatic approaches which can optimize the performance of a given learning algorithm to the task at hand. In this work, we consider the automatic tuning problem within the framework of Bayesian optimization, in which a learning algorithm's generalization performance is modeled as a sample from a Gaussian process (GP). The tractable posterior distribution induced by the GP leads to efficient use of the information gathered by previous experiments, enabling optimal choices about what parameters to try next. Here we show how the effects of the Gaussian process prior and the associated inference procedure can have a large impact on the success or failure of Bayesian optimization. We show that thoughtful choices can lead to results that exceed expert-level performance in tuning machine learning algorithms. We also describe new algorithms that take into account the variable cost (duration) of learning experiments and that can leverage the presence of multiple cores for parallel experimentation. We show that these proposed algorithms improve on previous automatic procedures and can reach or surpass human expert-level optimization on a diverse set of contemporary algorithms including latent Dirichlet allocation, structured SVMs and convolutional neural networks.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
  file = {/home/michaelb/Zotero/storage/8UFXR6BG/Snoek et al. - 2012 - Practical Bayesian Optimization of Machine Learning Algorithms.pdf;/home/michaelb/Zotero/storage/5V5GAV3U/1206.html}
}

@book{SocietyInternetHow2019,
  title = {Society and the {{Internet}}: {{How Networks}} of {{Information}} and {{Communication}} Are {{Changing Our Lives}}},
  shorttitle = {Society and the {{Internet}}},
  editor = {Graham, Mark and Dutton, William H.},
  year = {2019},
  month = jul,
  edition = {Second},
  publisher = {Oxford University Press},
  doi = {10.1093/oso/9780198843498.001.0001},
  urldate = {2024-03-11},
  abstract = {Abstract. How is society being reshaped by the continued diffusion and increasing centrality of the Internet in everyday life and work? Society and the Internet},
  isbn = {978-0-19-187932-6},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/UJ6PRV6G/35088.html}
}

@article{sonningReplicationCrisisScientific2021,
  title = {The Replication Crisis, Scientific Revolutions, and Linguistics},
  author = {S{\"o}nning, Lukas and Werner, Valentin},
  year = {2021},
  month = sep,
  journal = {Linguistics},
  volume = {59},
  number = {5},
  pages = {1179--1206},
  publisher = {De Gruyter Mouton},
  issn = {1613-396X},
  doi = {10.1515/ling-2019-0045},
  urldate = {2025-08-23},
  abstract = {Article The replication crisis, scientific revolutions, and linguistics was published on September 1, 2021 in the journal Linguistics (volume 59, issue 5).},
  copyright = {De Gruyter expressly reserves the right to use all content for commercial text and data mining within the meaning of Section 44b of the German Copyright Act.},
  langid = {english},
  keywords = {methodology,paradigm shift,philosophy of science,quantitative linguistics,replication crisis,scientific revolution},
  file = {/home/michaelb/Zotero/storage/F7AHWQ35/Sönning and Werner - 2021 - The replication crisis, scientific revolutions, and linguistics.pdf}
}

@misc{StatberkeleyeduStarkSticiGui,
  title = {{{SticiGui}}: {{Glossary}} of {{Statistical Terms}}},
  author = {Stark, Philip B.},
  urldate = {2025-07-22},
  howpublished = {https://www.stat.berkeley.edu/{\textasciitilde}stark/SticiGui/Text/gloss.htm},
  file = {/home/michaelb/Zotero/storage/A2JVY2D3/gloss.html}
}

@misc{SubmissionGuidelinesScientific,
  title = {Submission Guidelines {\textbar} {{Scientific Reports}}},
  issn = {2045-2322},
  urldate = {2025-08-08},
  abstract = {Submission guidelines},
  copyright = {{\copyright} 2025 Springer Nature Limited},
  howpublished = {https://www.nature.com/srep/author-instructions/submission-guidelines},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/7FLJC4AQ/submission-guidelines.html}
}

@inproceedings{talburtFleschIndexEasily1986,
  title = {The {{Flesch}} Index: {{An}} Easily Programmable Readability Analysis Algorithm},
  shorttitle = {The {{Flesch}} Index},
  booktitle = {Proceedings of the 4th Annual International Conference on {{Systems}} Documentation},
  author = {Talburt, John},
  year = {1986},
  month = feb,
  series = {{{SIGDOC}} '85},
  pages = {114--122},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  doi = {10.1145/10563.10583},
  urldate = {2025-08-21},
  abstract = {This paper is an exposition of an algorithm for text analysis that can be of value to writers and documentalists. The simplicity of this algorithm allows it to be easily programmed on most computer systems. The author has successfully implemented this test as a function within a text editing system written in RPG II. Included in this paper is a sample program written for the VAX 11/780 in PL/I.In 1949 Dr. Rudolph Flesch published a book titled ``The Art of Readable Writing.'' In this book, he described a manual method of reading ease analysis. This method was to analyze text samples of about 100 words. Each sample is assigned a readability index based upon the average number of syllables per word and the average number of words per sentence. This Flesch Index is designed so that most scores range from 0 to 100. Only college graduates are supposed to follow prose in the 0 - 30 range. Scores of 50 -60 are high-school level and 90 - 100 should be readable by fourth graders.Though crude, since it is designed simply to reward short words and sentences, the index is useful. It gives a basic, objective idea of how hard prose is to wade through. This test has been used by some state insurance commissions to enforce the readability of policies.Flesch's algorithm was automated in the early 1970s by the Service Research Group of the General Motors Corporation. The program, called GM-STAR (General Motors Simple Test Approach for Readability) was used so that shop manuals could be made more readable. GM-STAR was originally written in BASIC language. The key to this program is a very simple algorithm to count the number of syllables in a word. 
In general the text analysis portion of the program uses the following rules: Periods, exclamation points, question marks, colons and semi-colons count as end-of-sentence marks. Each group of continuous non-blank characters counts as a word. Each vowel (a, e, i, o, u, y) in a word counts as one syllable subject to the following sub-rules: Ignore final -ES, -ED, -E (except for -LE). Words of three letters or less count as one syllable. Consecutive vowels count as one syllable. Although there are many exceptions to these rules, it works in a remarkable number of cases. The Flesch Index (F) for a given text sample is calculated from three statistics; according to the following formula: F = 206.835 - 1.015 {\texttimes} (W/N) - 84.6 {\texttimes} (L/W). The Grade Level Equivalent (G) of the Flesch Index is given by the following table: A PL/I program that implements this algorithm is listed below along with sample output. For simplicity, this program assumes all letters are in upper case. Processing text with lower case letters can be accomplished by either modifying the program to test for lower case as well as upper case, or by preprocessing the text sample to translate all letters to upper case. There are a multitude of other refinements and amenities that can be added to the basic analysis. Among these are: Noting which characters are considered sentence terminators. Ignoring periods that are used for abbreviations rather than sentence terminators. Ignoring word connecting hyphens in compound words. Noting which character groups should probably be spelled out, such as numerals and dollar amounts. Sharpening the syllable counting routine to detect exceptional cases.},
  isbn = {978-0-89791-186-3},
  file = {/home/michaelb/Zotero/storage/GSXZRJCF/Talburt - 1986 - The Flesch index An easily programmable readability analysis algorithm.pdf}
}

% Tennant et al. (2016), F1000Research: evidence-based review of the impacts of Open Access.
@article{tennantAcademicEconomicSocietal2016,
  title = {The Academic, Economic and Societal Impacts of {{Open Access}}: An Evidence-Based Review},
  shorttitle = {The Academic, Economic and Societal Impacts of {{Open Access}}},
  author = {Tennant, Jonathan P. and Waldner, Fran{\c c}ois and Jacques, Damien C. and Masuzzo, Paola and Collister, Lauren B. and Hartgerink, Chris H. J.},
  year = {2016},
  month = sep,
  journal = {F1000Research},
  volume = {5},
  pages = {632},
  issn = {2046-1402},
  doi = {10.12688/f1000research.8460.3},
  urldate = {2025-08-28},
  abstract = {Ongoing debates surrounding Open Access to the scholarly literature are multifaceted and complicated by disparate and often polarised viewpoints from engaged stakeholders. At the current stage, Open Access has become such a global issue that it is critical for all involved in scholarly publishing, including policymakers, publishers, research funders, governments, learned societies, librarians, and academic communities, to be well-informed on the history, benefits, and pitfalls of Open Access. In spite of this, there is a general lack of consensus regarding the potential pros and cons of Open Access at multiple levels. This review aims to be a resource for current knowledge on the impacts of Open Access by synthesizing important research in three major areas: academic, economic and societal. While there is clearly much scope for additional research, several key trends are identified, including a broad citation advantage for researchers who publish openly, as well as additional benefits to the non-academic dissemination of their work. The economic impact of Open Access is less well-understood, although it is clear that access to the research literature is key for innovative enterprises, and a range of governmental and non-governmental services. Furthermore, Open Access has the potential to save both publishers and research funders considerable amounts of financial resources, and can provide some economic benefits to traditionally subscription-based journals. The societal impact of Open Access is strong, in particular for advancing citizen science initiatives, and leveling the playing field for researchers in developing countries. Open Access supersedes all potential alternative modes of access to the scholarly literature through enabling unrestricted re-use, and long-term stability independent of financial constraints of traditional publishers that impede knowledge sharing. 
However, Open Access has the potential to become unsustainable for research communities if high-cost options are allowed to continue to prevail in a widely unregulated scholarly publishing market. Open Access remains only one of the multiple challenges that the scholarly publishing system is currently facing. Yet, it provides one foundation for increasing engagement with researchers regarding ethical standards of publishing and the broader implications of 'Open Research'.},
  pmcid = {PMC4837983},
  pmid = {27158456},
  file = {/home/michaelb/Zotero/storage/WUE4PMYT/Tennant et al. - 2016 - The academic, economic and societal impacts of Open Access an evidence-based review.pdf}
}

% Terracciano (2014), Psychological Science: commentary on Bleidorn et al. (2013).
@article{terraccianoPregnancyMarriageQuitting2014,
  author     = {Terracciano, Antonio},
  title      = {Pregnancy, {{Marriage}}, and {{Quitting School Nurture Personality Development}}? {{Commentary}} on {{Bleidorn}}, {{Klimstra}}, {{Denissen}}, {{Rentfrow}}, {{Potter}}, and {{Gosling}} (2013)},
  shorttitle = {Pregnancy, {{Marriage}}, and {{Quitting School Nurture Personality Development}}?},
  journal    = {Psychological Science},
  year       = {2014},
  month      = apr,
  volume     = {25},
  number     = {4},
  pages      = {1049--1050},
  publisher  = {SAGE Publications Inc},
  issn       = {0956-7976},
  doi        = {10.1177/0956797613515003},
  urldate    = {2025-07-26},
  langid     = {english}
}

% German statute text (section 60d, per the `number` field); title kept in German casing.
% NOTE(review): appears to describe the same statute as the entries `TextUndDataa` and
% `urhg-60d-tdm` - consider citing a single key.
@misc{TextUndData,
  title = {{Text und Data Mining f{\"u}r Zwecke der wissenschaftlichen Forschung}},
  number = {{\S} 60d},
  chapter = {Teil 1 - Urheberrecht ({\S}{\S} 1 - 69g), Abschnitt 6 - Schranken des Urheberrechts durch gesetzlich erlaubte Nutzungen ({\S}{\S} 44a - 63a), Unterabschnitt 4 - Gesetzlich erlaubte Nutzungen f{\"u}r Unterricht, Wissenschaft und Institutionen ({\S}{\S} 60a - 60h)},
  langid = {ngerman}
}

% German statute text with an HTML snapshot attached; title kept in German casing.
% NOTE(review): appears to describe the same statute as the entries `TextUndData` and
% `urhg-60d-tdm` - consider citing a single key.
@misc{TextUndDataa,
  title = {{Text und Data Mining f{\"u}r Zwecke der wissenschaftlichen Forschung}},
  chapter = {Teil 1 - Urheberrecht ({\S}{\S} 1 - 69g), Abschnitt 6 - Schranken des Urheberrechts durch gesetzlich erlaubte Nutzungen ({\S}{\S} 44a - 63a), Unterabschnitt 4 - Gesetzlich erlaubte Nutzungen f{\"u}r Unterricht, Wissenschaft und Institutionen ({\S}{\S} 60a - 60h)},
  langid = {ngerman},
  file = {/home/michaelb/Zotero/storage/QX22G7KX/60d.html}
}

% Thagard (1997), unpublished manuscript on Internet technologies in scientific research.
% `note` is supplied because classic BibTeX requires it for this entry type.
@unpublished{thagardInternetEpistemologyContributions1997,
  title = {Internet {{Epistemology}}: {{Contributions}} of {{New Information Technologies}} to {{Scientific Research}}},
  shorttitle = {Internet {{Epistemology}}},
  author = {Thagard, Paul},
  year = {1997},
  note = {Unpublished manuscript},
  file = {/home/michaelb/Zotero/storage/5JFRFHZN/THAIEC.html}
}

% Software manual reference: rpart (recursive partitioning and regression trees).
@manual{therneauRpartRecursivePartitioning2025,
  author = {Therneau, Terry and Atkinson, Beth},
  title  = {Rpart: {{Recursive}} Partitioning and Regression Trees},
  year   = {2025},
  type   = {Manual},
  doi    = {10.32614/CRAN.package.rpart}
}

% Thibault, Pennington and Munafo (2024), Journal of Trial and Error: recommendations on preregistration.
@article{thibaultReflectionsPreregistrationCore2024,
  title = {Reflections on {{Preregistration}}: {{Core Criteria}}, {{Badges}}, {{Complementary Workflows}}},
  shorttitle = {Reflections on {{Preregistration}}},
  author = {Thibault, Robert T. and Pennington, Charlotte R. and Munaf{\`o}, Marcus R.},
  year = {2024},
  month = may,
  journal = {Journal of Trial \& Error},
  volume = {4},
  number = {1},
  publisher = {JOTE Publishers},
  issn = {2667-1204},
  doi = {10.36850/mr6},
  urldate = {2024-11-06},
  abstract = {Clinical trials are routinely preregistered. In psychology and the social sciences, however, only a small percentage of studies are preregistered, and those preregistrations often contain ambiguities. As advocates strive for broader uptake and effective use of preregistration, they can benefit from drawing on the experience of preregistration in clinical trials and adapting some of those successes to the psychology and social sciences context. We recommend that individuals and organizations who promote preregistration: (1) Establish core preregistration criteria required to consider a preregistration complete; (2) Award preregistered badges only to articles that meet the badge criteria; and (3) Leverage complementary workflows that provide a similar function as preregistration.},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/DWV997ND/Thibault et al. - 2024 - Reflections on Preregistration Core Criteria, Badges, Complementary Workflows.pdf}
}

% Tibshirani (1996), JRSS Series B: the original lasso paper (JSTOR record).
% The publisher list is written with `and`; the square brackets were a JSTOR export artefact.
@article{tibshiraniRegressionShrinkageSelection1996,
  title = {Regression {{Shrinkage}} and {{Selection}} via the {{Lasso}}},
  author = {Tibshirani, Robert},
  year = {1996},
  journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
  volume = {58},
  number = {1},
  eprint = {2346178},
  eprinttype = {jstor},
  pages = {267--288},
  publisher = {Royal Statistical Society and Oxford University Press},
  issn = {0035-9246},
  urldate = {2025-08-01},
  abstract = {We propose a new method for estimation in linear models. The `lasso' minimizes the residual sum of squares subject to the sum of the absolute value of the coefficients being less than a constant. Because of the nature of this constraint it tends to produce some coefficients that are exactly 0 and hence gives interpretable models. Our simulation studies suggest that the lasso enjoys some of the favourable properties of both subset selection and ridge regression. It produces interpretable models like subset selection and exhibits the stability of ridge regression. There is also an interesting relationship with recent work in adaptive function estimation by Donoho and Johnstone. The lasso idea is quite general and can be applied in a variety of statistical models: extensions to generalized regression models and tree-based models are briefly described.},
  file = {/home/michaelb/Zotero/storage/M3RRYWTY/Tibshirani - 1996 - Regression Shrinkage and Selection via the Lasso.pdf}
}

% Tierney (2025): online book on Quarto; the address is stored in `howpublished`.
@misc{tierneyQuartoScientists2025,
  author       = {Tierney, Nicholas},
  title        = {Quarto for {{Scientists}}},
  year         = {2025},
  month        = apr,
  howpublished = {https://qmd4sci.njtierney.com/},
  urldate      = {2025-07-31},
  langid       = {english},
  abstract     = {For a scientific report to be completely credible, it must be reproducible. The full computational environment used to derive the results, including the data and code used for statistical analysis should be available for others to reproduce. Quarto is a tool that allows you integrate your code, text and figures in a single file in order to make high quality, reproducible reports. A paper published with an included quarto file and data sets can be reproduced by anyone with a computer.},
  file         = {/home/michaelb/Zotero/storage/D8WR8BJF/qmd4sci.njtierney.com.html}
}

% Tjaden: introductory R course material for social scientists.
% NOTE(review): no year or publisher, and `urldate` has no matching address field; the
% attachment is an HTML snapshot, so this may be an online course page - confirm.
@book{tjadenIntroSocialScientists,
  author   = {Tjaden, Jasper Dag},
  title    = {Intro to {{R}} for {{Social Scientists}}},
  urldate  = {2025-08-20},
  abstract = {This course offers an accessible and easy introduction to one of the fastest growing statistical packages used in social science and data science more generally.},
  file     = {/home/michaelb/Zotero/storage/PD94USYX/course-intro2r.html}
}

% Software manual reference: ggthemr (themes for ggplot2).
% The quoted package name is brace-protected so styles cannot re-case it.
@manual{tobinGgthemrThemesGgplot22020,
  type = {Manual},
  title = {Ggthemr: {{Themes}} for '{{ggplot2}}'},
  author = {Tobin, Ciaran},
  year = {2020}
}

% Zotero documentation page on translators (web page, no author or year recorded).
@misc{TranslatorsZoteroDocumentation,
  title        = {Translators [{{Zotero Documentation}}]},
  howpublished = {https://www.zotero.org/support/translators},
  urldate      = {2025-04-23},
  file         = {/home/michaelb/Zotero/storage/XB4R22PM/translators.html}
}

% Tripathy, Agrawal and Rath (2016), Expert Systems with Applications: n-gram sentiment classification.
@article{tripathyClassificationSentimentReviews2016,
  author   = {Tripathy, Abinash and Agrawal, Ankit and Rath, Santanu Kumar},
  title    = {Classification of Sentiment Reviews Using N-Gram Machine Learning Approach},
  journal  = {Expert Systems with Applications},
  year     = {2016},
  month    = sep,
  volume   = {57},
  pages    = {117--126},
  issn     = {0957-4174},
  doi      = {10.1016/j.eswa.2016.03.028},
  urldate  = {2025-07-25},
  keywords = {IMDb dataset,Maximum Entropy (ME),N-gram,Naive Bayes (NB),Sentiment analysis,Stochastic Gradient Descent (SGD),Support Vector Machine (SVM)},
  abstract = {With the ever increasing social networking and online marketing sites, the reviews and blogs obtained from those, act as an important source for further analysis and improved decision making. These reviews are mostly unstructured by nature and thus, need processing like classification or clustering to provide a meaningful information for future uses. These reviews and blogs may be classified into different polarity groups such as positive, negative, and neutral in order to extract information from the input dataset. Supervised machine learning methods help to classify these reviews. In this paper, four different machine learning algorithms such as Naive Bayes (NB), Maximum Entropy (ME), Stochastic Gradient Descent (SGD), and Support Vector Machine (SVM) have been considered for classification of human sentiments. The accuracy of different methods are critically examined in order to access their performance on the basis of parameters such as precision, recall, f-measure, and accuracy.},
  file     = {/home/michaelb/Zotero/storage/VNKFBMGQ/S095741741630118X.html}
}

% Ulrich et al. (2022), Journal of Medical Internet Research: systematic review on metadata.
@article{ulrichUnderstandingNatureMetadata2022,
  author     = {Ulrich, Hannes and {Kock-Schoppenhauer}, Ann-Kristin and Deppenwiese, Noemi and G{\"o}tt, Robert and Kern, Jori and Lablans, Martin and Majeed, Raphael W. and St{\"o}hr, Mark R. and Stausberg, J{\"u}rgen and Varghese, Julian and Dugas, Martin and Ingenerf, Josef},
  title      = {Understanding the {{Nature}} of {{Metadata}}: {{Systematic Review}}},
  shorttitle = {Understanding the {{Nature}} of {{Metadata}}},
  journal    = {Journal of Medical Internet Research},
  year       = {2022},
  month      = jan,
  volume     = {24},
  number     = {1},
  pages      = {e25440},
  publisher  = {JMIR Publications Inc., Toronto, Canada},
  doi        = {10.2196/25440},
  urldate    = {2025-07-15},
  langid     = {english},
  abstract   = {Background: Metadata are created to describe the corresponding data in a detailed and unambiguous way and is used for various applications in different research areas, for example, data identification and classification. However, a clear definition of metadata is crucial for further use. Unfortunately, extensive experience with the processing and management of metadata has shown that the term ``metadata'' and its use is not always unambiguous. Objective: This study aimed to understand the definition of metadata and the challenges resulting from metadata reuse. Methods: A systematic literature search was performed in this study following the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) guidelines for reporting on systematic reviews. Five research questions were identified to streamline the review process, addressing metadata characteristics, metadata standards, use cases, and problems encountered. This review was preceded by a harmonization process to achieve a general understanding of the terms used. Results: The harmonization process resulted in a clear set of definitions for metadata processing focusing on data integration. The following literature review was conducted by 10 reviewers with different backgrounds and using the harmonized definitions. This study included 81 peer-reviewed papers from the last decade after applying various filtering steps to identify the most relevant papers. The 5 research questions could be answered, resulting in a broad overview of the standards, use cases, problems, and corresponding solutions for the application of metadata in different research areas. Conclusions: Metadata can be a powerful tool for identifying, describing, and processing information, but its meaningful creation is costly and challenging. This review process uncovered many standards, use cases, problems, and solutions for dealing with metadata. 
The presented harmonized definitions and the new schema have the potential to improve the classification and generation of metadata by creating a shared understanding of metadata and its context.},
  file       = {/home/michaelb/Zotero/storage/VQXU58S2/Ulrich et al. - 2022 - Understanding the Nature of Metadata Systematic Review.pdf;/home/michaelb/Zotero/storage/WIZ6R92W/e25440.html}
}

% German copyright statute, UrhG section 60d (text and data mining for scientific research).
% NOTE(review): the entries `TextUndData` and `TextUndDataa` appear to cover the same statute.
@misc{urhg-60d-tdm,
  title        = {{UrhG {\S} 60d: Text und Data Mining f{\"u}r Zwecke der wissenschaftlichen Forschung}},
  shorttitle   = {{UrhG {\S} 60d}},
  publisher    = {Bundesrepublik Deutschland},
  year         = {2021},
  month        = jun,
  jurisdiction = {DE},
  urldate      = {2025-09-08},
  langid       = {ngerman}
}

% Vabalas et al. (2019), PLOS ONE: validating machine learning with limited sample sizes.
@article{vabalasMachineLearningAlgorithm2019,
  author    = {Vabalas, Andrius and Gowen, Emma and Poliakoff, Ellen and Casson, Alexander J.},
  title     = {Machine Learning Algorithm Validation with a Limited Sample Size},
  journal   = {PLOS ONE},
  year      = {2019},
  month     = nov,
  volume    = {14},
  number    = {11},
  pages     = {e0224365},
  publisher = {Public Library of Science},
  issn      = {1932-6203},
  doi       = {10.1371/journal.pone.0224365},
  urldate   = {2025-03-28},
  langid    = {english},
  keywords  = {Algorithms,Autism,Gaussian noise,Kernel functions,Learning curves,Machine learning,Neuroimaging,Normal distribution},
  abstract  = {Advances in neuroimaging, genomic, motion tracking, eye-tracking and many other technology-based data collection methods have led to a torrent of high dimensional datasets, which commonly have a small number of samples because of the intrinsic high cost of data collection involving human participants. High dimensional data with a small number of samples is of critical importance for identifying biomarkers and conducting feasibility and pilot work, however it can lead to biased machine learning (ML) performance estimates. Our review of studies which have applied ML to predict autistic from non-autistic individuals showed that small sample size is associated with higher reported classification accuracy. Thus, we have investigated whether this bias could be caused by the use of validation methods which do not sufficiently control overfitting. Our simulations show that K-fold Cross-Validation (CV) produces strongly biased performance estimates with small sample sizes, and the bias is still evident with sample size of 1000. Nested CV and train/test split approaches produce robust and unbiased performance estimates regardless of sample size. We also show that feature selection if performed on pooled training and testing data is contributing to bias considerably more than parameter tuning. In addition, the contribution to bias by data dimensionality, hyper-parameter space and number of CV folds was explored, and validation methods were compared with discriminable data. The results suggest how to design robust testing methodologies when working with small datasets and how to interpret the results of other studies based on what validation method was used.},
  file      = {/home/michaelb/Zotero/storage/KNNRIXUD/Vabalas et al. - 2019 - Machine learning algorithm validation with a limited sample size.pdf}
}

% Valle Campos (2020): serosurvey, deposited on Zenodo.
% NOTE(review): `journal` holds a repository name and the DOI is a Zenodo record, so this is
% probably software rather than a journal article - confirm and consider a non-article entry type.
@article{vallecamposSerosurveySerologicalSurvey2020,
  author  = {Valle Campos, Andree},
  title   = {Serosurvey: {{Serological}} Survey Analysis for Prevalence Estimation under Misclassification},
  journal = {Zenodo},
  year    = {2020},
  month   = oct,
  doi     = {10.5281/zenodo.4065080}
}

% van den Akker et al. (2024), Behavior Research Methods: preregistered vs non-preregistered studies.
@article{vandenakkerPreregistrationPracticeComparison2024,
  author     = {{van den Akker}, Olmo R. and {van Assen}, Marcel A. L. M. and Bakker, Marjan and Elsherif, Mahmoud and Wong, Tsz Keung and Wicherts, Jelte M.},
  title      = {Preregistration in Practice: {{A}} Comparison of Preregistered and Non-Preregistered Studies in Psychology},
  shorttitle = {Preregistration in Practice},
  journal    = {Behavior Research Methods},
  year       = {2024},
  month      = sep,
  volume     = {56},
  number     = {6},
  pages      = {5424--5433},
  issn       = {1554-3528},
  doi        = {10.3758/s13428-023-02277-0},
  urldate    = {2024-10-15},
  langid     = {english},
  keywords   = {Effect size,HARKing,P-hacking,Positive results,Preregistration,Research impact},
  abstract   = {Preregistration has gained traction as one of the most promising solutions to improve the replicability of scientific effects. In this project, we compared 193 psychology studies that earned a Preregistration Challenge prize or preregistration badge to 193 related studies that were not preregistered. In contrast to our theoretical expectations and prior research, we did not find that preregistered studies had a lower proportion of positive results (Hypothesis 1), smaller effect sizes (Hypothesis 2), or fewer statistical errors (Hypothesis 3) than non-preregistered studies. Supporting our Hypotheses 4 and 5, we found that preregistered studies more often contained power analyses and typically had larger sample sizes than non-preregistered studies. Finally, concerns about the publishability and impact of preregistered studies seem unwarranted, as preregistered studies did not take longer to publish and scored better on several impact measures. Overall, our data indicate that preregistration has beneficial effects in the realm of statistical power and impact, but we did not find robust evidence that preregistration prevents p-hacking and HARKing (Hypothesizing After the Results are Known).},
  file       = {/home/michaelb/Zotero/storage/8LPRN7WQ/van den Akker et al. - 2024 - Preregistration in practice A comparison of preregistered and non-preregistered studies in psycholo.pdf}
}

% Varma and Simon (2006), BMC Bioinformatics: bias of cross-validation error after model selection.
@article{varmaBiasErrorEstimation2006,
  author   = {Varma, Sudhir and Simon, Richard},
  title    = {Bias in Error Estimation When Using Cross-Validation for Model Selection},
  journal  = {BMC Bioinformatics},
  year     = {2006},
  month    = feb,
  volume   = {7},
  number   = {1},
  pages    = {91},
  issn     = {1471-2105},
  doi      = {10.1186/1471-2105-7-91},
  urldate  = {2025-07-16},
  keywords = {Classifier Parameter,Inherent Bias,Support Vector Machine,Support Vector Machine Classifier,True Error},
  abstract = {Cross-validation (CV) is an effective method for estimating the prediction error of a classifier. Some recent articles have proposed methods for optimizing classifiers by choosing classifier parameter values that minimize the CV error estimate. We have evaluated the validity of using the CV error estimate of the optimized classifier as an estimate of the true error expected on independent data.},
  file     = {/home/michaelb/Zotero/storage/C89NA9DR/Varma and Simon - 2006 - Bias in error estimation when using cross-validation for model selection.pdf}
}

% Software manual reference: furrr (parallel mapping functions using futures).
@manual{vaughanFurrrApplyMapping2022,
  author = {Vaughan, Davis and Dancho, Matt},
  title  = {Furrr: {{Apply}} Mapping Functions in Parallel Using Futures},
  year   = {2022},
  type   = {Manual}
}

% Software manual reference: workflows (modeling workflows).
@manual{vaughanWorkflowsModelingWorkflows2025,
  author = {Vaughan, Davis and Couch, Simon},
  title  = {Workflows: {{Modeling}} Workflows},
  year   = {2025},
  type   = {Manual}
}

% Vicente-Saez and Martinez-Fuentes (2018), Journal of Business Research: definition of Open Science.
@article{vicente-saezOpenScienceNow2018a,
  author     = {{Vicente-Saez}, Ruben and {Martinez-Fuentes}, Clara},
  title      = {Open {{Science}} Now: {{A}} Systematic Literature Review for an Integrated Definition},
  shorttitle = {Open {{Science}} Now},
  journal    = {Journal of Business Research},
  year       = {2018},
  month      = jul,
  volume     = {88},
  pages      = {428--436},
  issn       = {0148-2963},
  doi        = {10.1016/j.jbusres.2017.12.043},
  urldate    = {2025-08-04},
  keywords   = {Definition,Open access,Open innovation,Open science,Research and innovation management,Responsible research and innovation},
  abstract   = {Open Science is a disruptive phenomenon that is emerging around the world and especially in Europe. Open Science brings about socio-cultural and technological change, based on openness and connectivity, on how research is designed, performed, captured, and assessed. Several studies show that there is a lack of awareness about what Open Science is, mainly due to the fact that there is no formal definition of Open Science. The purpose of this paper is to build a rigorous, integrated, and up-to-date definition of the Open Science phenomenon through a systematic literature review. The resulting definition ``Open Science is transparent and accessible knowledge that is shared and developed through collaborative networks'' helps the scientific community, the business world, political actors, and citizens to have a common and clear understanding about what Open Science is, and stimulates an open debate about the social, economic, and human added value of this phenomenon.},
  file       = {/home/michaelb/Zotero/storage/MF9KJHMS/S0148296317305441.html}
}

% Viera and Garrett (2005), Family Medicine: primer on the kappa statistic.
@article{vieraUnderstandingInterobserverAgreement2005,
  author     = {Viera, Anthony J. and Garrett, Joanne M.},
  title      = {Understanding Interobserver Agreement: The Kappa Statistic},
  shorttitle = {Understanding Interobserver Agreement},
  journal    = {Family Medicine},
  year       = {2005},
  month      = may,
  volume     = {37},
  number     = {5},
  pages      = {360--363},
  issn       = {0742-3225},
  pmid       = {15883903},
  langid     = {english},
  keywords   = {Family Practice,Health Services Research,Models Statistical,Observer Variation,United States},
  abstract   = {Items such as physical exam findings, radiographic interpretations, or other diagnostic tests often rely on some degree of subjective interpretation by observers. Studies that measure the agreement between two or more observers should include a statistic that takes into account the fact that observers will sometimes agree or disagree simply by chance. The kappa statistic (or kappa coefficient) is the most commonly used statistic for this purpose. A kappa of 1 indicates perfect agreement, whereas a kappa of 0 indicates agreement equivalent to chance. A limitation of kappa is that it is affected by the prevalence of the finding under observation. Methods to overcome this limitation have been described.}
}

% Waite (2021), Journal of Teaching English for Specific and Academic Purposes.
% Title stored in title case instead of the exported ALL CAPS; page range without zero padding.
% NOTE(review): `number = 0` looks like an export placeholder and `volume` is missing; the DOI
% suffix (2101...) suggests issue 1 of 2021 - confirm against the journal site.
@article{waiteINTERNETKNOWLEDGEEXCHANGE2021,
  title = {{{Internet}} Knowledge Exchange and Co-Authorship as Facilitators in Scientific Research},
  author = {Waite, Vesna},
  year = {2021},
  month = mar,
  journal = {Journal of Teaching English for Specific and Academic Purposes},
  number = {0},
  pages = {43--50},
  issn = {2334-9212},
  doi = {10.22190/JTESAP2101043W},
  urldate = {2024-12-13},
  abstract = {The aim of this paper is to determine to what extent the use of Internet as a way of acquiring information for research purposes is a successful tool. The Internet can facilitate the research in different ways, some of which are being presented in the paper. Researchers have access to a wide range of databases available on the Internet, also having the opportunity to use sites designed as a social media for academics such as ResearchGate or Academia. Apart from that, there exists some degree of correspondence between open access philosophy and hacker ethics which is being related to academia to point to the possible ethic value researches have towards one another. The paper focuses on advantages of using Internet for the purposes of facilitating research, at the same time introducing the topic of collaboration and co-authorship as vital in today's `publish-or-perish' academia world.},
  copyright = {Copyright (c) 2021 Journal of Teaching English for Specific and Academic Purposes},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/3K78JFBB/Waite - 2021 - INTERNET KNOWLEDGE EXCHANGE AND CO-AUTHORSHIP AS FACILITATORS IN SCIENTIFIC RESEARCH.pdf}
}

% Wang et al. (2015), Computer Networks: DDoS mitigation with cloud computing and SDN.
@article{wangDDoSAttackProtection2015,
  author   = {Wang, Bing and Zheng, Yao and Lou, Wenjing and Hou, Y. Thomas},
  title    = {{{DDoS}} Attack Protection in the Era of Cloud Computing and {{Software-Defined Networking}}},
  journal  = {Computer Networks},
  year     = {2015},
  month    = apr,
  volume   = {81},
  pages    = {308--319},
  issn     = {1389-1286},
  doi      = {10.1016/j.comnet.2015.02.026},
  urldate  = {2024-12-18},
  keywords = {DDoS mitigation,Graphical model,Software-Defined Networking},
  abstract = {Cloud computing has become the real trend of enterprise IT service model that offers cost-effective and scalable processing. Meanwhile, Software-Defined Networking (SDN) is gaining popularity in enterprise networks for flexibility in network management service and reduced operational cost. There seems a trend for the two technologies to go hand-in-hand in providing an enterprise's IT services. However, the new challenges brought by the marriage of cloud computing and SDN, particularly the implications on enterprise network security, have not been well understood. This paper sets to address this important problem. We start by examining the security impact, in particular, the impact on DDoS attack defense mechanisms, in an enterprise network where both technologies are adopted. We find that SDN technology can actually help enterprises to defend against DDoS attacks if the defense architecture is designed properly. To that end, we propose a DDoS attack mitigation architecture that integrates a highly programmable network monitoring to enable attack detection and a flexible control structure to allow fast and specific attack reaction. To cope with the new architecture, we propose a graphic model based attack detection system that can deal with the dataset shift problem. The simulation results show that our architecture can effectively and efficiently address the security challenges brought by the new network paradigm and our attack detection system can effectively report various attacks using real-world network traffic.},
  file     = {/home/michaelb/Zotero/storage/ERHMTYCB/S1389128615000742.html}
}

% Warden (2010), ecancermedicalscience: the Internet and science communication.
% NOTE(review): only the author's initial is recorded - expand to the full given name if known.
@article{wardenInternetScienceCommunication2010,
  title = {The {{Internet}} and Science Communication: Blurring the Boundaries},
  shorttitle = {The {{Internet}} and Science Communication},
  author = {Warden, R.},
  year = {2010},
  month = dec,
  journal = {ecancermedicalscience},
  volume = {4},
  pages = {203},
  issn = {1754-6605},
  doi = {10.3332/ecancer.2010.203},
  urldate = {2024-12-13},
  abstract = {Scientific research is heavily dependent on communication and collaboration. Research does not exist in a bubble; scientific work must be communicated in order to add it to the body of knowledge within a scientific community, so that its members may `stand on the shoulders of giants' and benefit from all that has come before. The effectiveness of scientific communication is crucial to the pace of scientific progress: in all its forms it enables ideas to be formulated, results to be compared, and replications and improvements to be made. The sharing of science is a foundational aspect of the scientific method. This paper, part of the policy research within the FP7 EUROCANCERCOMS project, discusses how the Internet has changed communication by cancer researchers and how it has the potential to change it still more in the future. It will detail two broad types of communication: formal and informal, and how these are changing with the use of new web tools and technologies.},
  pmcid = {PMC3234032},
  pmid = {22276045},
  file = {/home/michaelb/Zotero/storage/6E5I3X22/Warden - 2010 - The Internet and science communication blurring the boundaries.pdf}
}

% Software manual reference: skimr (compact and flexible data summaries).
@manual{waringSkimrCompactFlexible2025,
  author = {Waring, Elin and Quinn, Michael and McNamara, Amelia and {Arino de la Rubia}, Eduardo and Zhu, Hao and Ellis, Shannon},
  title  = {Skimr: {{Compact}} and Flexible Summaries of Data},
  year   = {2025},
  type   = {Manual}
}

% Wentz (2002), The Lancet: one-page piece on FUTON bias.
@article{wentzVisibilityResearchFUTON2002,
  author     = {Wentz, Reinhard},
  title      = {Visibility of Research: {{FUTON}} Bias},
  shorttitle = {Visibility of Research},
  journal    = {The Lancet},
  year       = {2002},
  month      = oct,
  volume     = {360},
  number     = {9341},
  pages      = {1256},
  publisher  = {Elsevier},
  issn       = {0140-6736, 1474-547X},
  doi        = {10.1016/S0140-6736(02)11264-5},
  pmid       = {12401287},
  urldate    = {2025-08-28},
  langid     = {english},
  file       = {/home/michaelb/Zotero/storage/GVKB79GB/Wentz - 2002 - Visibility of research FUTON bias.pdf}
}

% Weston et al. (2019), AMPPS: transparency recommendations for secondary data analysis.
@article{westonRecommendationsIncreasingTransparency2019,
  author   = {Weston, Sara J. and Ritchie, Stuart J. and Rohrer, Julia M. and Przybylski, Andrew K.},
  title    = {Recommendations for {{Increasing}} the {{Transparency}} of {{Analysis}} of {{Preexisting Data Sets}}},
  journal  = {Advances in Methods and Practices in Psychological Science},
  year     = {2019},
  month    = sep,
  volume   = {2},
  number   = {3},
  pages    = {214--227},
  issn     = {2515-2459, 2515-2467},
  doi      = {10.1177/2515245919848684},
  urldate  = {2025-08-26},
  langid   = {english},
  abstract = {Secondary data analysis, or the analysis of preexisting data, provides a powerful tool for the resourceful psychological scientist. Never has this been more true than now, when technological advances enable both sharing data across labs and continents and mining large sources of preexisting data. However, secondary data analysis is easily overlooked as a key domain for developing new open-science practices or improving analytic methods for robust data analysis. In this article, we provide researchers with the knowledge necessary to incorporate secondary data analysis into their methodological toolbox. We explain that secondary data analysis can be used for either exploratory or confirmatory work, and can be either correlational or experimental, and we highlight the advantages and disadvantages of this type of research. We describe how transparency-enhancing practices can improve and alter interpretations of results from secondary data analysis and discuss approaches that can be used to improve the robustness of reported results. We close by suggesting ways in which scientific subfields and institutions could address and improve the use of secondary data analysis.},
  file     = {/home/michaelb/Zotero/storage/FPXTUBQY/Weston et al. - 2019 - Recommendations for Increasing the Transparency of Analysis of Preexisting Data Sets.pdf}
}

@manual{wickhamEllmerChatLarge2025,
  type = {Manual},
  title = {{{ellmer}}: {{Chat}} with Large Language Models},
  author = {Wickham, Hadley and Cheng, Joe and Jacobs, Aaron and {Aden-Buie}, Garrick},
  year = {2025}
}

@manual{wickhamForcatsToolsWorking2023,
  type = {Manual},
  title = {{{forcats}}: {{Tools}} for Working with Categorical Variables (Factors)},
  author = {Wickham, Hadley},
  year = {2023}
}

@manual{wickhamHttrToolsWorking2023,
  type = {Manual},
  title = {{{httr}}: {{Tools}} for Working with {{URLs}} and {{HTTP}}},
  author = {Wickham, Hadley},
  year = {2023}
}

@manual{wickhamPurrrFunctionalProgramming2025,
  type = {Manual},
  title = {{{purrr}}: {{Functional}} Programming Tools},
  author = {Wickham, Hadley and Henry, Lionel},
  year = {2025}
}

@manual{wickhamReadrReadRectangular2024,
  type = {Manual},
  title = {{{readr}}: {{Read}} Rectangular Text Data},
  author = {Wickham, Hadley and Hester, Jim and Bryan, Jennifer},
  year = {2024}
}

@manual{wickhamRvestEasilyHarvest2024,
  type = {Manual},
  title = {{{rvest}}: {{Easily}} Harvest (Scrape) Web Pages},
  author = {Wickham, Hadley},
  year = {2024}
}

@manual{wickhamStringrSimpleConsistent2023,
  type = {Manual},
  title = {{{stringr}}: {{Simple}}, Consistent Wrappers for Common String Operations},
  author = {Wickham, Hadley},
  year = {2023}
}

@article{wickhamWelcomeTidyverse2019,
  author = {Wickham, Hadley and Averick, Mara and Bryan, Jennifer and Chang, Winston and McGowan, Lucy D'Agostino and Fran{\c c}ois, Romain and Grolemund, Garrett and Hayes, Alex and Henry, Lionel and Hester, Jim and Kuhn, Max and Pedersen, Thomas Lin and Miller, Evan and Bache, Stephan Milton and M{\"u}ller, Kirill and Ooms, Jeroen and Robinson, David and Seidel, Dana Paige and Spinu, Vitalie and Takahashi, Kohske and Vaughan, Davis and Wilke, Claus and Woo, Kara and Yutani, Hiroaki},
  title = {Welcome to the {{tidyverse}}},
  journal = {Journal of Open Source Software},
  volume = {4},
  number = {43},
  pages = {1686},
  year = {2019},
  doi = {10.21105/joss.01686}
}

@manual{wickhamXml2ParseXML2025,
  type = {Manual},
  title = {{{xml2}}: {{Parse XML}}},
  author = {Wickham, Hadley and Hester, Jim and Ooms, Jeroen},
  year = {2025}
}

@manual{wilkeGgridgesRidgelinePlots2024,
  type = {Manual},
  title = {{{ggridges}}: {{Ridgeline}} Plots in '{{ggplot2}}'},
  author = {Wilke, Claus O.},
  year = {2024}
}

@article{wilkinsonTestingNullHypothesis2013,
  author = {Wilkinson, Mick},
  title = {Testing the Null Hypothesis: {{The}} Forgotten Legacy of {{Karl Popper}}?},
  shorttitle = {Testing the Null Hypothesis},
  journal = {Journal of Sports Sciences},
  volume = {31},
  number = {9},
  pages = {919--920},
  year = {2013},
  month = may,
  publisher = {Routledge},
  issn = {0264-0414},
  doi = {10.1080/02640414.2012.753636},
  pmid = {23249368},
  urldate = {2024-12-13},
  keywords = {philosophy,science,statistics},
  abstract = {Testing of the null hypothesis is a fundamental aspect of the scientific method and has its basis in the falsification theory of Karl Popper. Null hypothesis testing makes use of deductive reasoning to ensure that the truth of conclusions is irrefutable. In contrast, attempting to demonstrate the new facts on the basis of testing the experimental or research hypothesis makes use of inductive reasoning and is prone to the problem of the Uniformity of Nature assumption described by David Hume in the eighteenth century. Despite this issue and the well documented solution provided by Popper's falsification theory, the majority of publications are still written such that they suggest the research hypothesis is being tested. This is contrary to accepted scientific convention and possibly highlights a poor understanding of the application of conventional significance-based data analysis approaches. Our work should remain driven by conjecture and attempted falsification such that it is always the null hypothesis that is tested. The write up of our studies should make it clear that we are indeed testing the null hypothesis and conforming to the established and accepted philosophical conventions of the scientific method.},
  file = {/home/michaelb/Zotero/storage/BYBEMADP/Wilkinson - 2013 - Testing the null hypothesis The forgotten legacy of Karl Popper.pdf}
}

@article{willinskyUnacknowledgedConvergenceOpen2005,
  title = {The Unacknowledged Convergence of Open Source, Open Access, and Open Science},
  author = {Willinsky, John},
  year = {2005},
  month = aug,
  journal = {First Monday},
  volume = {10},
  number = {8},
  issn = {1396-0466},
  doi = {10.5210/fm.v10i8.1265},
  urldate = {2024-12-11},
  abstract = {A number of open initiatives are actively resisting the extension of intellectual property rights. Among these developments, three prominent instances --- open source software, open access to research and scholarship, and open science --- share not only a commitment to the unrestricted exchange of information and ideas, but economic principles based on (1) the efficacy of free software and research; (2) the reputation--building afforded by public access and patronage; and, (3) the emergence of a free--or--subscribe access model. Still, with this much in common, the strong sense of convergence among these open initiatives has yet to be fully realized, to the detriment of the larger, common issue. By drawing on David's (2004; 2003; 2000; 1998) economic work on open science and Weber's (2004) analysis of open source, this paper seeks to make that convergence all the more apparent, as well as worth pursuing, by those interested in furthering this alternative approach, which would treat intellectual properties as public goods.},
  copyright = {Copyright (c)},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/3E4G42JK/Unacknowledged convergence of open source, open access, and open science.pdf}
}

% Rebuilt from a web-page stub: authors and year were fused into the title.
% NOTE(review): journal, volume, number and pages are not in the original stub; confirm against the DOI landing page.
@article{WorkingMemoryHas,
  title = {Working Memory Has Better Fidelity Than Long-Term Memory: {{The}} Fidelity Constraint Is Not a General Property of Memory After All},
  author = {Biderman, Natalie and Luria, Roy and Teodorescu, Andrei R. and Hajaj, Ron and {Goshen-Gottstein}, Yonatan},
  year = {2019},
  journal = {Psychological Science},
  volume = {30},
  number = {2},
  pages = {223--237},
  doi = {10.1177/0956797618813538},
  urldate = {2025-08-07},
  file = {/home/michaelb/Zotero/storage/KX8VFEPK/0956797618813538.html}
}

@article{wrightRangerFastImplementation2017,
  author = {Wright, Marvin N. and Ziegler, Andreas},
  title = {{{ranger}}: A Fast Implementation of Random Forests for High Dimensional Data in {{C}}++ and {{R}}},
  journal = {Journal of Statistical Software},
  volume = {77},
  number = {1},
  pages = {1--17},
  year = {2017},
  doi = {10.18637/jss.v077.i01}
}

@article{xuImpactInternetAccess2021,
  author = {Xu, Xu and Reed, Markum},
  title = {The Impact of Internet Access on Research Output - a Cross-Country Study},
  journal = {Information Economics and Policy},
  volume = {56},
  pages = {100914},
  year = {2021},
  month = sep,
  issn = {0167-6245},
  doi = {10.1016/j.infoecopol.2021.100914},
  urldate = {2024-12-13},
  keywords = {Academic productivity,Internet access,Internet penetration,Publication,Research output,Research quality},
  abstract = {There are large variations in research output among nations despite the rapid globalization progress. This article provides a new angle to help explain such variations. In this article, we study the impact of internet penetration on the research output of an economy. Using a country-level panel dataset, we find that higher internet penetration increases the volume of research output in an economy. The results are robust to a number of specifications, including an instrumental variable approach that addresses the endogeneity of internet penetration. We also find some evidence showing that the impact of internet penetration on research output quantity decreases as the size of fixed broadband users increase in an economy. The effects of internet access on research quality is less conclusive. Results suggest that broadening the access of internet is important for research output boosting or innovation in general.},
  file = {/home/michaelb/Zotero/storage/IJNNM2CJ/S0167624521000020.html}
}

@article{yeoNewFamilyPower2000,
  title = {A New Family of Power Transformations to Improve Normality or Symmetry},
  author = {Yeo, In-Kwon and Johnson, Richard A.},
  year = {2000},
  month = dec,
  journal = {Biometrika},
  volume = {87},
  number = {4},
  pages = {954--959},
  issn = {0006-3444},
  doi = {10.1093/biomet/87.4.954},
  urldate = {2025-08-02},
  abstract = {We introduce a new power transformation family which is well defined on the whole real line and which is appropriate for reducing skewness and to approximate normality. It has properties similar to those of the Box--Cox transformation for positive variables. The large-sample properties of the transformation are investigated in the context of a single random sample.},
  file = {/home/michaelb/Zotero/storage/PHWARLRW/Yeo and Johnson - 2000 - A new family of power transformations to improve normality or symmetry.pdf;/home/michaelb/Zotero/storage/DTM8FVUK/87.4.html}
}

@inproceedings{zengCBCClusteringBased2003,
  author = {Zeng, Hua-Jun and Wang, Xuan-Hui and Chen, Zheng and Lu, Hongjun and Ma, Wei-Ying},
  title = {{{CBC}}: Clustering Based Text Classification Requiring Minimal Labeled Data},
  shorttitle = {{{CBC}}},
  booktitle = {Third {{IEEE International Conference}} on {{Data Mining}}},
  pages = {443--450},
  year = {2003},
  month = nov,
  doi = {10.1109/ICDM.2003.1250951},
  urldate = {2024-12-16},
  keywords = {Asia,Classification algorithms,Clustering algorithms,Computer science,Semisupervised learning,Supervised learning,Support vector machine classification,Support vector machines,Text categorization,Training data},
  abstract = {Semisupervised learning methods construct classifiers using both labeled and unlabeled training data samples. While unlabeled data samples can help to improve the accuracy of trained models to certain extent, existing methods still face difficulties when labeled data is not sufficient and biased against the underlying data distribution. We present a clustering based classification (CBC) approach. Using this approach, training data, including both the labeled and unlabeled data, is first clustered with the guidance of the labeled data. Some of unlabeled data samples are then labeled based on the clusters obtained. Discriminative classifiers can subsequently be trained with the expanded labeled dataset. The effectiveness of the proposed method is justified analytically. Our experimental results demonstrated that CBC outperforms existing algorithms when the size of labeled dataset is very small.},
  file = {/home/michaelb/Zotero/storage/8FSJCPWF/1250951.html}
}

@article{zenk-moltgenFactorsInfluencingData2018,
  title = {Factors Influencing the Data Sharing Behavior of Researchers in Sociology and Political Science},
  author = {{Zenk-M{\"o}ltgen}, Wolfgang and Akdeniz, Esra and Katsanidou, Alexia and Na{\ss}hoven, Verena and Balaban, Ebru},
  year = {2018},
  month = jun,
  journal = {Journal of Documentation},
  volume = {74},
  number = {5},
  pages = {1053--1073},
  publisher = {Emerald Publishing Limited},
  issn = {0022-0418},
  doi = {10.1108/JD-09-2017-0126},
  urldate = {2024-12-15},
  abstract = {Open data and data sharing should improve transparency of research. The purpose of this paper is to investigate how different institutional and individual factors affect the data sharing behavior of authors of research articles in sociology and political science. Desktop research analyzed attributes of sociology and political science journals (n=262) from their websites. A second data set of articles (n=1,011; published 2012-2014) was derived from ten of the main journals (five from each discipline) and stated data sharing was examined. A survey of the authors used the Theory of Planned Behavior to examine motivations, behavioral control, and perceived norms for sharing data. Statistical tests (Spearman's {$\rho$}, {$\chi^2$}) examined correlations and associations. Although many journals have a data policy for their authors (78 percent in sociology, 44 percent in political science), only around half of the empirical articles stated that the data were available, and for only 37 percent of the articles could the data be accessed. Journals with higher impact factors, those with a stated data policy, and younger journals were more likely to offer data availability. Of the authors surveyed, 446 responded (44 percent). Statistical analysis indicated that authors' attitudes, reported past behavior, social norms, and perceived behavioral control affected their intentions to share data. Less than 50 percent of the authors contacted provided responses to the survey. Results indicate that data sharing would improve if journals had explicit data sharing policies but authors also need support from other institutions (their universities, funding councils, and professional associations) to improve data management skills and infrastructures. This paper builds on previous similar research in sociology and political science and explains some of the barriers to data sharing in social sciences by combining journal policies, published articles, and authors' responses to a survey.},
  langid = {english},
  file = {/home/michaelb/Zotero/storage/2VF37P6B/Zenk-Möltgen et al. - 2018 - Factors influencing the data sharing behavior of researchers in sociology and political science.pdf;/home/michaelb/Zotero/storage/S9XMU592/html.html}
}

@misc{zhaoAdvancingSingleMultitask2024,
  author = {Zhao, Hang and Chen, Qile P. and Zhang, Yijing Barry and Yang, Gang},
  title = {Advancing {{Single-}} and {{Multi-task Text Classification}} through {{Large Language Model Fine-tuning}}},
  year = {2024},
  month = dec,
  eprint = {2412.08587},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  number = {arXiv:2412.08587},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2412.08587},
  urldate = {2025-03-28},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
  abstract = {Both encoder-only models (e.g., BERT, RoBERTa) and large language models (LLMs, e.g., Llama3) have been widely used for text classification tasks. However, there is a lack of systematic studies comparing the performance of encoder-based models and LLMs in text classification, particularly when fine-tuning is involved. This study employed a diverse range of models and methods, varying in size and architecture, and including both fine-tuned and pre-trained approaches. We first assessed the performances of these LLMs on the 20 Newsgroups (20NG) and MASSIVE datasets, comparing them to encoder-only RoBERTa models. Additionally, we explored the multi-task capabilities of both model types by combining multiple classification tasks, including intent detection and slot-filling, into a single model using data from both datasets. Our results indicate that fully fine-tuned Llama3-70B models outperform RoBERTa-large and other decoder LLMs across various classification tasks and datasets. Moreover, the consolidated multi-task fine-tuned LLMs matched the performance of dual-model setups in both tasks across both datasets. Overall, our study provides a comprehensive benchmark of encoder-only and LLM models on text classification tasks and demonstrates a method to combine two or more fully fine-tuned decoder LLMs for reduced latency and equivalent performance.},
  file = {/home/michaelb/Zotero/storage/SB87ZRSV/Zhao et al. - 2024 - Advancing Single- and Multi-task Text Classification through Large Language Model Fine-tuning.pdf;/home/michaelb/Zotero/storage/EYAJ4ELH/2412.html}
}

@manual{zhuKableExtraConstructComplex2024,
  author = {Zhu, Hao},
  title = {{{kableExtra}}: {{Construct}} Complex Table with 'kable' and Pipe Syntax},
  type = {Manual},
  year = {2024}
}

@inproceedings{zhuRtfidfVarietyTfidf2011,
  title = {{{R-tfidf}}, a {{Variety}} of {{tf-idf Term Weighting Strategy}} in {{Document Categorization}}},
  booktitle = {2011 {{Seventh International Conference}} on {{Semantics}}, {{Knowledge}} and {{Grids}}},
  author = {Zhu, Dengya and Xiao, Jitian},
  year = {2011},
  month = oct,
  pages = {83--90},
  doi = {10.1109/SKG.2011.44},
  urldate = {2025-08-01},
  abstract = {Term weighting strategy plays an essential role in the areas related to text processing such as text categorization and information retrieval. In such systems, term frequency, inverse document frequency, and document length normalization are important factors to be considered when a term weighting strategy is developed. Term length normalization is proposed to give equal opportunities to retrieve both lengthy documents and shorter ones. However, terms in very short documents that may be useless for users, especially in the scenario of Web information retrieval, could be assigned very high weights, resulting in a situation where shorter documents are ranked higher than lengthy documents that are more relevant to users information needs. In this research, a new R-tfidf term weighting strategy is proposed to alleviate the side effects of document length normalization. Experimental results demonstrate the proposed approach can to some extent improve the performance of text categorization.},
  keywords = {Frequency estimation,Information retrieval,Probabilistic logic,Support vector machine classification,term-weighting,text categorization,Text categorization,tf-idf,Time frequency analysis,Training},
  file = {/home/michaelb/Zotero/storage/7RBWYJNX/Zhu and Xiao - 2011 - R-tfidf, a Variety of tf-idf Term Weighting Strategy in Document Categorization.pdf}
}

@inproceedings{zuiderwijkNegativeEffectsOpen2014,
  title = {The Negative Effects of Open Government Data - Investigating the Dark Side of Open Data},
  booktitle = {Proceedings of the 15th {{Annual International Conference}} on {{Digital Government Research}}},
  author = {Zuiderwijk, Anneke and Janssen, Marijn},
  year = {2014},
  month = jun,
  series = {dg.o '14},
  pages = {147--152},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  doi = {10.1145/2612733.2612761},
  urldate = {2025-08-28},
  abstract = {Reports and research appears to assume that the benefits of open data dominate open data's negative consequences. Moreover, much of the existing research discusses benefits and disadvantages on a high level without providing much detailed insight in the underlying processes. Yet many governments are reluctant to open their data, as they are afraid of possible negative consequences of opening data. The objective of this policy paper is to better understand the aspects of the dark side of open data and contributes to the literature by providing a more realistic perspective on open data. We conducted nineteen in depth interviews with public sector officials and data archivists and identified sixteen categories of negative effects. For the dark side inherent to open data efforts the research suggests that a context and dataset dependent decision-making model needs to be made weighing the benefits of open data on the one hand (e.g. creating transparency, the possibility to strengthen economic growth), and the risks and disadvantages of open data (e.g. violating privacy and possible misuse and misinterpretation of data) on the other hand.},
  isbn = {978-1-4503-2901-9},
  file = {/home/michaelb/Zotero/storage/HR6B5PXC/Zuiderwijk and Janssen - 2014 - The negative effects of open government data - investigating the dark side of open data.pdf}
}