adds CleanTweets functions, creates Graphs

This commit is contained in:
Michael Beck
2023-07-07 18:18:51 +02:00
parent 817ec48478
commit 899a99ba72
4 changed files with 201 additions and 42 deletions

View File

@@ -8,7 +8,7 @@ Created on Mon Jun 26 20:36:43 2023
import pandas as pd
# import pyreadstat
# import numpy as np
import numpy as np
from funs.ClearDupes import deDupe
@@ -32,11 +32,13 @@ senCSV = "ALL-SENATORS-TWEETS.csv"
senDataset = "senators-raw.csv"
# Name of new datafile generated
senCSVc = "Tweets-Cleaned"
senCSVc = "SenatorsTweets-Final"
senCSVcCov = "SenatorsTweets-OnlyCov"
# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSV + ".csv"
senCSVcPath = wd + ud + senCSVc + ".csv"
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
senSAVcPath = wd + ud + senCSV + ".sav"
senDTAcPath = wd + ud + senCSV + ".dta"
senDatasetPath = wd + di + senDataset
@@ -90,21 +92,17 @@ with open(f"{di}keywords.txt", "w") as file:
#%%
# overwrite keyword column
df['contains_keyword'] = ''
df['contains_keyword'] = (
df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', 'none')
df['keywords'] = np.nan
df['keywords'] = (
df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', np.nan)
)
mask = (df['contains_keyword'] != 'none') # select all values in contains_keyword == 'none'
df.loc[mask,'keywords'] = df['contains_keyword'] # set keywords = contains_keyword under the condition of mask
#%%
# create bool contains_keyword
df['contains_keyword'] = ~pd.isnull(df['keywords']) # create boolean column
#%%
# recode contains keyword to bool
mask = (df['contains_keyword'] != 'none')
df.loc[mask,'contains_keyword'] = True
df.loc[~mask,'contains_keyword'] = False # ~ negates mask, selecting all values that do not contain keywords
# create boolean contains_keyword column:
# start with True everywhere, then flip to False where 'keywords' is NaN
df['contains_keyword'] = True
mask = (df['keywords'].isna()) # select all rows whose 'keywords' value is NaN
df.loc[mask,'contains_keyword'] = False # set contains_keyword = False for the rows selected by mask
#%%
pd.Series(df["user.id"]).is_unique
#%%
@@ -157,10 +155,14 @@ dfAll = df.merge(dfSenAll, how='left',on='user.username')
unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique()
print(unique_usernames)
# senatorisakson was dropped, is ok
#%%
# select only the tweets that contain a keyword (written to the covidtweets csv below)
dfCov = dfAll[dfAll['contains_keyword']==True]
#%%
# Export to csv, sav and dta
dfAll.to_csv(senCSVcPath, encoding='utf-8')
dfCov.to_csv(senCSVcCovPath, encoding='utf-8')
# pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb
# =============================================================================
# dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)