From 90aa58239c441fd769e01b62428c716ede81a195 Mon Sep 17 00:00:00 2001
From: Michael Beck
Date: Mon, 14 Aug 2023 15:37:30 +0200
Subject: [PATCH] adds generation of model-training dataset

---
Review notes (this section is ignored when the patch is applied):
 * Removed the leftover draft cell `dfTrain = pd.dfCov(np.random.rand(1800))`.
   pandas has no attribute `dfCov`, so running the script top to bottom
   raised AttributeError before the real sampling code was reached.
   The last hunk header and the diffstat were recomputed accordingly.
 * Fixed the comment typo "Seet" -> "Seed" and dropped the stray `;`.
 * Added `.copy()` to the column selection so the following column
   assignments operate on an independent frame (no SettingWithCopyWarning).
 * NOTE(review): this patch was recovered from a whitespace-collapsed copy.
   Blank context lines were reconstructed from the hunk line counts; verify
   them against cleanTweets.py at blob deb6e7c before applying. The post-image
   blob id on the `index` line predates the edits above.

 cleanTweets.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/cleanTweets.py b/cleanTweets.py
index deb6e7c..19957db 100644
--- a/cleanTweets.py
+++ b/cleanTweets.py
@@ -11,6 +11,8 @@ import pandas as pd
 import numpy as np
 from funs.ClearDupes import deDupe
 
+# Seed for training dataset generation
+seed = 86431891
 
 ###################
 # Setup directories
@@ -34,11 +36,13 @@ senDataset = "senators-raw.csv"
 # Name of new datafile generated
 senCSVc = "SenatorsTweets-Final"
 senCSVcCov = "SenatorsTweets-OnlyCov"
+senCSVcTrain = "SenatorsTweets-Training"
 
 # don't change this one
 senCSVPath = wd + ud + senCSV
 senCSVcPath = wd + ud + senCSVc + ".csv"
 senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
+senCSVcTrainPath = wd + ud + senCSVcTrain + ".csv"
 senSAVcPath = wd + ud + senCSV + ".sav"
 senDTAcPath = wd + ud + senCSV + ".dta"
 senDatasetPath = wd + di + senDataset
@@ -188,7 +192,6 @@ dfCov = dfAll[dfAll['contains_counterKeyword']==False]
 dfCov = dfCov[dfCov['contains_keyword']==True]
 dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
 
-
 #%%
 # create column with tweet length
 
@@ -211,3 +214,10 @@ dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id')
 
 # =========================
 # %%
+# Create training dataset
+np.random.seed(seed)
+dfTrain = dfCov.loc[np.random.choice(dfCov.index, 1800, replace=False)]
+dfTrain = dfTrain[['tid', 'date', 'rawContent']].copy()
+dfTrain['topicCovid'] = True
+dfTrain['fake'] = False
+dfTrain.to_csv(senCSVcTrainPath, encoding='utf-8')