adds generation of model-training dataset

2023-08-14 15:37:30 +02:00 · 2023-08-14 15:37:30 +02:00 · 90aa58239c
commit 90aa58239c
parent 1beff96ae9
1 changed files with 15 additions and 1 deletions
--- a/cleanTweets.py
+++ b/cleanTweets.py
@ -11,6 +11,8 @@ import pandas as pd
 import numpy as np
 from funs.ClearDupes import deDupe
 # Seed for training dataset generation
 seed = 86431891
 ###################
 # Setup directories
@ -34,11 +36,13 @@ senDataset = "senators-raw.csv"
 # Name of new datafile generated
 senCSVc = "SenatorsTweets-Final"
 senCSVcCov = "SenatorsTweets-OnlyCov"
 # Base name of the training-dataset file (".csv" is appended below)
 senCSVcTrain = "SenatorsTweets-Training"
 # don't change this one
 # Full paths, assembled with `+` from the directory parts (wd + ud, or
 # wd + di) and the file names defined above.
 senCSVPath = wd + ud + senCSV
 senCSVcPath = wd + ud + senCSVc + ".csv"
 senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
 # Path the training dataset is written to at the end of the script
 senCSVcTrainPath = wd + ud + senCSVcTrain + ".csv"
 # NOTE(review): the .sav/.dta paths are built from senCSV, not senCSVc --
 # confirm this is intended.
 senSAVcPath = wd + ud + senCSV + ".sav"
 senDTAcPath = wd + ud + senCSV + ".dta"
 senDatasetPath = wd + di + senDataset
@ -188,7 +192,6 @@ dfCov = dfAll[dfAll['contains_counterKeyword']==False]
 # Keep only the rows whose 'contains_keyword' flag is True
 dfCov = dfCov[dfCov['contains_keyword']==True]
 # Drop the two counter-keyword columns from dfCov
 dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
 #%%
 # create column with tweet length
@ -211,3 +214,14 @@ dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id')
 # =========================
 # %%
 # Create training dataset
 # Draw 1800 rows from dfCov without replacement. Seeding NumPy's global RNG
 # right before the draw makes the selection reproducible across runs.
 # (A previous duplicate of this cell called the non-existent `pd.dfCov` and
 # raised AttributeError; it has been removed.)
 np.random.seed(seed)
 dfTrain = dfCov.loc[np.random.choice(dfCov.index, 1800, replace=False)]
 # Explicit copy: the label columns below are added to an independent frame
 # rather than to a column subset of another frame, which would trigger
 # pandas' SettingWithCopyWarning.
 dfTrain = dfTrain[['tid', 'date', 'rawContent']].copy()
 # Every sampled row gets the same constant labels.
 dfTrain['topicCovid'] = True
 dfTrain['fake'] = False
 dfTrain.to_csv(senCSVcTrainPath, encoding='utf-8')