This commit is contained in:
2025-11-29 15:11:13 +01:00
commit 9b3618478f
293 changed files with 113456 additions and 0 deletions
+11
View File
@@ -0,0 +1,11 @@
"","step","before","after","dropped","logic"
"1","deduplicate",95042,55904,39138,"~!duplicated(doi)"
"2","NA published_date",55904,55904,0,"~!is.na(published_date)"
"3","drop published_date > date_from",55904,47309,8595,"~published_date > date_from"
"4","drop published_date < date_to",47309,45922,1387,"~published_date < date_to"
"5","filter_reviews_simple",45922,45557,365,"~!str_detect(title, regex(str_c(kws, collapse = ""|""), ignore_case = TRUE))"
"6","filter_corrections",45557,45144,413,"~!str_detect(title, regex(str_c(kws, collapse = ""|""), ignore_case = TRUE))"
"7","filter_front_back",45144,42320,2824,"~!str_detect(title, regex(str_c(kws, collapse = ""|""), ignore_case = TRUE))"
"8","filter_announcements",42320,42279,41,"~!str_detect(title, regex(str_c(kws, collapse = ""|""), ignore_case = TRUE))"
"9","filter_reviews_full",42279,41908,371,"~!str_detect(title, regex(str_c(kws, collapse = ""|""), ignore_case = TRUE))"
"10","filter_other",41908,40860,1048,"~!str_detect(title, regex(str_c(kws, collapse = ""|""), ignore_case = TRUE))"
1 step before after dropped logic
2 1 deduplicate 95042 55904 39138 ~!duplicated(doi)
3 2 NA published_date 55904 55904 0 ~!is.na(published_date)
4 3 drop published_date > date_from 55904 47309 8595 ~published_date > date_from
5 4 drop published_date < date_to 47309 45922 1387 ~published_date < date_to
6 5 filter_reviews_simple 45922 45557 365 ~!str_detect(title, regex(str_c(kws, collapse = "|"), ignore_case = TRUE))
7 6 filter_corrections 45557 45144 413 ~!str_detect(title, regex(str_c(kws, collapse = "|"), ignore_case = TRUE))
8 7 filter_front_back 45144 42320 2824 ~!str_detect(title, regex(str_c(kws, collapse = "|"), ignore_case = TRUE))
9 8 filter_announcements 42320 42279 41 ~!str_detect(title, regex(str_c(kws, collapse = "|"), ignore_case = TRUE))
10 9 filter_reviews_full 42279 41908 371 ~!str_detect(title, regex(str_c(kws, collapse = "|"), ignore_case = TRUE))
11 10 filter_other 41908 40860 1048 ~!str_detect(title, regex(str_c(kws, collapse = "|"), ignore_case = TRUE))