adds CleanTweets functions, creates Graphs

2023-07-07 18:18:51 +02:00
parent 817ec48478
commit 899a99ba72
4 changed files with 201 additions and 42 deletions
--- a/cleanTweets.py
+++ b/cleanTweets.py
@@ -8,7 +8,7 @@ Created on Mon Jun 26 20:36:43 2023

 import pandas as pd
 # import pyreadstat
-# import numpy as np
+import numpy as np
 from funs.ClearDupes import deDupe


@@ -32,11 +32,13 @@ senCSV = "ALL-SENATORS-TWEETS.csv"
 senDataset = "senators-raw.csv"

 # Name of new datafile generated
-senCSVc = "Tweets-Cleaned"
+senCSVc = "SenatorsTweets-Final"
+senCSVcCov = "SenatorsTweets-OnlyCov"

 # don't change this one
 senCSVPath = wd + ud + senCSV
-senCSVcPath = wd + ud + senCSV + ".csv"
+senCSVcPath = wd + ud + senCSVc + ".csv"
+senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
 senSAVcPath = wd + ud + senCSV + ".sav"
 senDTAcPath = wd + ud + senCSV + ".dta"
 senDatasetPath = wd + di + senDataset
@@ -90,21 +92,17 @@ with open(f"{di}keywords.txt", "w") as file:

 #%%
 # overwrite keyword column
-df['contains_keyword'] = ''
-df['contains_keyword'] = (
-    df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', 'none')
+df['keywords'] = np.nan
+df['keywords'] = (
+    df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', np.nan)
 )
-mask = (df['contains_keyword'] != 'none') # select all values in contains_keyword == 'none'
-df.loc[mask,'keywords'] = df['contains_keyword'] # set keywords = contains_keyword under the condition of mask
 #%%
-# create bool contains_keyword
-df['contains_keyword'] = ~pd.isnull(df['keywords']) # create boolean column
-#%%
-# recode contains keyword to bool
-mask = (df['contains_keyword'] != 'none')
-df.loc[mask,'contains_keyword'] = True
-df.loc[~mask,'contains_keyword'] = False # ~ negates mask, selecting all values that do not contain keywords
+# create boolean contains_keyword column
+df['contains_keyword'] = True
+mask = (df['keywords'].isna()) # select all rows where no keyword was found (keywords is NaN)
+df.loc[mask,'contains_keyword'] = False # set contains_keyword to False for the rows selected by mask

+#%%
 pd.Series(df["user.id"]).is_unique

 #%%
@@ -157,10 +155,14 @@ dfAll = df.merge(dfSenAll, how='left',on='user.username')
 unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique()
 print(unique_usernames)
 # senatorisakson was dropped, is ok
+#%%
+# keep only the tweets that contain a keyword (written to the covid-tweets csv in the export cell below)
+dfCov = dfAll[dfAll['contains_keyword']==True]

 #%%
 # Export to csv, sav and dta
 dfAll.to_csv(senCSVcPath, encoding='utf-8')
+dfCov.to_csv(senCSVcCovPath, encoding='utf-8')
 # pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb 
 # =============================================================================
 # dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)