adds CleanTweets functions, creates Graphs
@@ -8,7 +8,7 @@ Created on Mon Jun 26 20:36:43 2023
import pandas as pd
# import pyreadstat
# import numpy as np
import numpy as np
from funs.ClearDupes import deDupe
@@ -32,11 +32,13 @@ senCSV = "ALL-SENATORS-TWEETS.csv"
senDataset = "senators-raw.csv"

# Name of new datafile generated
senCSVc = "Tweets-Cleaned"
senCSVc = "SenatorsTweets-Final"
senCSVcCov = "SenatorsTweets-OnlyCov"

# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSV + ".csv"
senCSVcPath = wd + ud + senCSVc + ".csv"
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
senSAVcPath = wd + ud + senCSV + ".sav"
senDTAcPath = wd + ud + senCSV + ".dta"
senDatasetPath = wd + di + senDataset
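As an aside, not part of the commit: the output paths are plain string concatenations, so with assumed values for wd and ud (both are defined earlier in the script), the cleaned-tweets path resolves like this:

# illustrative sketch; wd and ud are assumed values, not the ones in the repository
wd = "/home/user/tweets/"
ud = "data/"
senCSVc = "SenatorsTweets-Final"
senCSVcPath = wd + ud + senCSVc + ".csv"
print(senCSVcPath)  # /home/user/tweets/data/SenatorsTweets-Final.csv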
@@ -90,21 +92,17 @@ with open(f"{di}keywords.txt", "w") as file:
#%%
# overwrite keyword column
df['contains_keyword'] = ''
df['contains_keyword'] = (
    df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', 'none')
df['keywords'] = np.nan
df['keywords'] = (
    df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', np.nan)
)
mask = (df['contains_keyword'] != 'none') # select all rows where contains_keyword != 'none'
df.loc[mask,'keywords'] = df['contains_keyword'] # set keywords = contains_keyword under the condition of mask
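For illustration only, not part of the diff: a minimal sketch of what the findall/join/replace chain produces, assuming a toy keyword list in place of the one read from keywords.txt:

import numpy as np
import pandas as pd

keywords = ["covid", "vaccine"]  # assumed toy list; the real list comes from keywords.txt
toy = pd.DataFrame({"rawContent": ["Get your covid vaccine", "Tax reform now"]})
toy["keywords"] = (
    toy["rawContent"].str.findall("|".join(keywords)).str.join(",").replace("", np.nan)
)
print(toy["keywords"].tolist())  # ['covid,vaccine', nan]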
#%%
# create bool contains_keyword
df['contains_keyword'] = ~pd.isnull(df['keywords']) # create boolean column
#%%
# recode contains keyword to bool
mask = (df['contains_keyword'] != 'none')
df.loc[mask,'contains_keyword'] = True
df.loc[~mask,'contains_keyword'] = False # ~ negates mask, selecting all values that do not contain keywords
# create boolean contains_keyword column
df['contains_keyword'] = True
mask = (df['keywords'].isna()) # select all rows where keywords is NaN (no keyword found)
df.loc[mask,'contains_keyword'] = False # set contains_keyword = False for those rows
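A side note rather than a change in the commit: because keywords is NaN exactly when no keyword matched, the same boolean column can be built in one step:

# equivalent to the True/False masking above (assuming df['keywords'] is NaN when nothing matched)
df['contains_keyword'] = df['keywords'].notna()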
#%%
pd.Series(df["user.id"]).is_unique

#%%
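Not part of the commit, just an observation: the pd.Series() wrapper is redundant here, since a DataFrame column is already a Series:

# equivalent uniqueness check on the user id column
df["user.id"].is_unique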
@@ -157,10 +155,14 @@ dfAll = df.merge(dfSenAll, how='left',on='user.username')
unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique()
print(unique_usernames)
# senatorisakson was dropped, is ok
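To illustrate why this check works (toy data, not from the repository): after a left merge, usernames without a match in the senator table keep NaN in the merged columns, so filtering on isnull() surfaces exactly the unmatched accounts:

import pandas as pd

tweets = pd.DataFrame({"user.username": ["SenSanders", "senatorisakson"]})               # stand-in for df
senators = pd.DataFrame({"user.username": ["SenSanders"], "name": ["Bernie Sanders"]})   # stand-in for dfSenAll
merged = tweets.merge(senators, how="left", on="user.username")
print(merged.loc[merged["name"].isnull(), "user.username"].unique())  # ['senatorisakson']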
#%%
# create covidtweets csv
dfCov = dfAll[dfAll['contains_keyword']==True]
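A small observation, not a change: since contains_keyword is boolean at this point, the explicit comparison with True is redundant:

# equivalent covid-only subset (assuming contains_keyword is a boolean column)
dfCov = dfAll[dfAll['contains_keyword']]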
#%%
# Export to csv, sav and dta
dfAll.to_csv(senCSVcPath, encoding='utf-8')
dfCov.to_csv(senCSVcCovPath, encoding='utf-8')
# pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb
# =============================================================================
# dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
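The commented-out rename is presumably there because formats such as Stata's .dta do not allow dots in variable names; purely as an illustration of what it would do to the dotted columns used in this script:

# illustration only: effect of the rename on a few of the dotted column names seen above
cols = ["user.id", "user.username", "rawContent"]
print([c.replace(".", "_") for c in cols])  # ['user_id', 'user_username', 'rawContent']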