adds generation of model-training dataset
This commit is contained in:
parent
1beff96ae9
commit
90aa58239c
@ -11,6 +11,8 @@ import pandas as pd
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from funs.ClearDupes import deDupe
|
from funs.ClearDupes import deDupe
|
||||||
|
|
||||||
|
# Seed for training dataset generation
|
||||||
|
seed = 86431891
|
||||||
|
|
||||||
###################
|
###################
|
||||||
# Setup directories
|
# Setup directories
|
||||||
@ -34,11 +36,13 @@ senDataset = "senators-raw.csv"
|
|||||||
# Name of new datafile generated
|
# Name of new datafile generated
|
||||||
senCSVc = "SenatorsTweets-Final"
|
senCSVc = "SenatorsTweets-Final"
|
||||||
senCSVcCov = "SenatorsTweets-OnlyCov"
|
senCSVcCov = "SenatorsTweets-OnlyCov"
|
||||||
|
senCSVcTrain = "SenatorsTweets-Training"
|
||||||
|
|
||||||
# don't change this one
|
# don't change this one
|
||||||
senCSVPath = wd + ud + senCSV
|
senCSVPath = wd + ud + senCSV
|
||||||
senCSVcPath = wd + ud + senCSVc + ".csv"
|
senCSVcPath = wd + ud + senCSVc + ".csv"
|
||||||
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
|
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
|
||||||
|
senCSVcTrainPath = wd + ud + senCSVcTrain + ".csv"
|
||||||
senSAVcPath = wd + ud + senCSV + ".sav"
|
senSAVcPath = wd + ud + senCSV + ".sav"
|
||||||
senDTAcPath = wd + ud + senCSV + ".dta"
|
senDTAcPath = wd + ud + senCSV + ".dta"
|
||||||
senDatasetPath = wd + di + senDataset
|
senDatasetPath = wd + di + senDataset
|
||||||
@ -188,7 +192,6 @@ dfCov = dfAll[dfAll['contains_counterKeyword']==False]
|
|||||||
dfCov = dfCov[dfCov['contains_keyword']==True]
|
dfCov = dfCov[dfCov['contains_keyword']==True]
|
||||||
dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
|
dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
|
||||||
|
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
# create column with tweet length
|
# create column with tweet length
|
||||||
|
|
||||||
@ -211,3 +214,14 @@ dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id')
|
|||||||
# =========================
|
# =========================
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
# Create training dataset
|
||||||
|
np.random.seed(seed);
|
||||||
|
# Draw 1800 distinct rows of dfCov by index label; the draw is reproducible
# because the NumPy global RNG was seeded on the preceding statement.
# (`pd.dfCov` is not a pandas attribute and raised AttributeError.)
dfTrain = dfCov.loc[np.random.choice(dfCov.index, 1800, replace=False)]
|
||||||
|
# %%
|
||||||
|
# Create training dataset
|
||||||
|
np.random.seed(seed);
|
||||||
|
dfTrain = dfCov.loc[np.random.choice(dfCov.index, 1800, replace=False)]
|
||||||
|
dfTrain = dfTrain[['tid', 'date', 'rawContent']]
|
||||||
|
dfTrain['topicCovid'] = True
|
||||||
|
dfTrain['fake'] = False
|
||||||
|
# Write the training sample; label the index column 'id' so the file has the
# same index header as the dfCov export written earlier in this script.
dfTrain.to_csv(senCSVcTrainPath, encoding='utf-8', index_label='id')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user