adds generation of model-training dataset
This commit is contained in:
parent
1beff96ae9
commit
90aa58239c
@ -11,6 +11,8 @@ import pandas as pd
|
||||
import numpy as np
|
||||
from funs.ClearDupes import deDupe
|
||||
|
||||
# Seed for training dataset generation
|
||||
seed = 86431891
|
||||
|
||||
###################
|
||||
# Setup directories
|
||||
@ -34,11 +36,13 @@ senDataset = "senators-raw.csv"
|
||||
# Name of new datafile generated
|
||||
senCSVc = "SenatorsTweets-Final"
|
||||
senCSVcCov = "SenatorsTweets-OnlyCov"
|
||||
senCSVcTrain = "SenatorsTweets-Training"
|
||||
|
||||
# don't change this one
|
||||
senCSVPath = wd + ud + senCSV
|
||||
senCSVcPath = wd + ud + senCSVc + ".csv"
|
||||
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
|
||||
senCSVcTrainPath = wd + ud + senCSVcTrain + ".csv"
|
||||
senSAVcPath = wd + ud + senCSV + ".sav"
|
||||
senDTAcPath = wd + ud + senCSV + ".dta"
|
||||
senDatasetPath = wd + di + senDataset
|
||||
@ -188,7 +192,6 @@ dfCov = dfAll[dfAll['contains_counterKeyword']==False]
|
||||
dfCov = dfCov[dfCov['contains_keyword']==True]
|
||||
dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
|
||||
|
||||
|
||||
#%%
|
||||
# create column with tweet length
|
||||
|
||||
@ -211,3 +214,14 @@ dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id')
|
||||
# =========================
|
||||
|
||||
# %%
|
||||
# Create training dataset
|
||||
np.random.seed(seed);
|
||||
# Draw 1800 index labels from dfCov without replacement and select those rows.
# (The pandas module has no `dfCov` attribute, so the data frame itself has to
# be indexed; np.random.seed(seed) makes the draw reproducible.)
dfTrain = dfCov.loc[np.random.choice(dfCov.index, 1800, replace=False)]
|
||||
# %%
|
||||
# Create training dataset
|
||||
np.random.seed(seed);
|
||||
dfTrain = dfCov.loc[np.random.choice(dfCov.index, 1800, replace=False)]
|
||||
# Keep only the columns used in the training file; take an explicit copy so
# that columns can be assigned to dfTrain afterwards without pandas raising
# SettingWithCopyWarning.
dfTrain = dfTrain[['tid', 'date', 'rawContent']].copy()
|
||||
dfTrain['topicCovid'] = True
|
||||
dfTrain['fake'] = False
|
||||
dfTrain.to_csv(senCSVcTrainPath, encoding='utf-8')
|
||||
|
Loading…
x
Reference in New Issue
Block a user