adds generation of model-training dataset

This commit is contained in:
Michael Beck 2023-08-14 15:37:30 +02:00
parent 1beff96ae9
commit 90aa58239c

View File

@ -11,6 +11,8 @@ import pandas as pd
import numpy as np import numpy as np
from funs.ClearDupes import deDupe from funs.ClearDupes import deDupe
# Seed for training dataset generation.
# Passed to np.random.seed() before the training sample is drawn, so the
# same rows are selected on every run.
seed = 86431891
################### ###################
# Setup directories # Setup directories
@ -34,11 +36,13 @@ senDataset = "senators-raw.csv"
# Name of new datafile generated # Name of new datafile generated
senCSVc = "SenatorsTweets-Final" senCSVc = "SenatorsTweets-Final"
senCSVcCov = "SenatorsTweets-OnlyCov" senCSVcCov = "SenatorsTweets-OnlyCov"
# Base name (without extension) of the training dataset file
senCSVcTrain = "SenatorsTweets-Training"
# don't change this one # don't change this one
senCSVPath = wd + ud + senCSV senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc + ".csv" senCSVcPath = wd + ud + senCSVc + ".csv"
senCSVcCovPath = wd + ud + senCSVcCov + ".csv" senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
# Full path of the training dataset CSV (target of dfTrain.to_csv)
senCSVcTrainPath = wd + ud + senCSVcTrain + ".csv"
senSAVcPath = wd + ud + senCSV + ".sav" senSAVcPath = wd + ud + senCSV + ".sav"
senDTAcPath = wd + ud + senCSV + ".dta" senDTAcPath = wd + ud + senCSV + ".dta"
senDatasetPath = wd + di + senDataset senDatasetPath = wd + di + senDataset
@ -188,7 +192,6 @@ dfCov = dfAll[dfAll['contains_counterKeyword']==False]
dfCov = dfCov[dfCov['contains_keyword']==True] dfCov = dfCov[dfCov['contains_keyword']==True]
dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords']) dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
#%% #%%
# create column with tweet length # create column with tweet length
@ -211,3 +214,14 @@ dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id')
# ========================= # =========================
# %% # %%
# Create training dataset
# Draws a reproducible random sample of rows from dfCov, keeps only the
# columns needed for labelling, adds the two label columns and writes the
# result to senCSVcTrainPath.
nTrain = 1800  # number of rows in the training sample

# Fixed seed so that the same rows are drawn on every run.
np.random.seed(seed)
# replace=False: every index label is drawn at most once.
# Raises ValueError if dfCov has fewer than nTrain rows.
trainIdx = np.random.choice(dfCov.index, nTrain, replace=False)

# .copy() yields an independent frame, so the column assignments below do not
# act on a slice of dfCov (avoids pandas' SettingWithCopyWarning).
dfTrain = dfCov.loc[trainIdx, ['tid', 'date', 'rawContent']].copy()

# Label columns: every sampled row gets topicCovid=True and fake=False.
dfTrain['topicCovid'] = True
dfTrain['fake'] = False

dfTrain.to_csv(senCSVcTrainPath, encoding='utf-8')