diff --git a/analyze.py b/analyze.py
index 05677a5..5896d0c 100644
--- a/analyze.py
+++ b/analyze.py
@@ -5,6 +5,7 @@ import pandas as pd
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 from datasets import load_dataset
 from transformers.pipelines.pt_utils import KeyDataset
+from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
 
 #%%
@@ -82,33 +83,6 @@ model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-
 tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
 
 # Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
 
-def remove_URL(text):
-    url = re.compile(r'https?://\S+|www\.\S+')
-    return url.sub(r'', text)
-
-
-def remove_emoji(text):
-    emoji_pattern = re.compile(
-        '['
-        u'\U0001F600-\U0001F64F'  # emoticons
-        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
-        u'\U0001F680-\U0001F6FF'  # transport & map symbols
-        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
-        u'\U00002702-\U000027B0'
-        u'\U000024C2-\U0001F251'
-        ']+',
-        flags=re.UNICODE)
-    return emoji_pattern.sub(r'', text)
-
-
-def remove_html(text):
-    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
-    return re.sub(html, '', text)
-
-
-def remove_punct(text):
-    table = str.maketrans('', '', string.punctuation)
-    return text.translate(table)
-
 dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
 dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
diff --git a/cleanTweets.py b/cleanTweets.py
index b2351fa..3f3b316 100644
--- a/cleanTweets.py
+++ b/cleanTweets.py
@@ -8,7 +8,7 @@ Created on Mon Jun 26 20:36:43 2023
 
 import pandas as pd
 # import pyreadstat
-# import numpy as np
+import numpy as np
 
 from funs.ClearDupes import deDupe
@@ -32,11 +32,13 @@ senCSV = "ALL-SENATORS-TWEETS.csv"
 senDataset = "senators-raw.csv"
 
 # Name of new datafile generated
-senCSVc = "Tweets-Cleaned"
+senCSVc = "SenatorsTweets-Final"
+senCSVcCov = "SenatorsTweets-OnlyCov"
 
 # don't change this one
 senCSVPath = wd + ud + senCSV
-senCSVcPath = wd + ud + senCSV + ".csv"
+senCSVcPath = wd + ud + senCSVc + ".csv"
+senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
 senSAVcPath = wd + ud + senCSV + ".sav"
 senDTAcPath = wd + ud + senCSV + ".dta"
 senDatasetPath = wd + di + senDataset
@@ -90,21 +92,17 @@ with open(f"{di}keywords.txt", "w") as file:
 #%%
 # overwrite keyword column
-df['contains_keyword'] = ''
-df['contains_keyword'] = (
-    df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', 'none')
+df['keywords'] = np.nan
+df['keywords'] = (
+    df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', np.nan)
 )
-mask = (df['contains_keyword'] != 'none') # select all values in contains_keyword == 'none'
-df.loc[mask,'keywords'] = df['contains_keyword'] # set keywords = contains_keyword under the condition of mask
 #%%
-# create bool contains_keyword
-df['contains_keyword'] = ~pd.isnull(df['keywords']) # create boolean column
-#%%
-# recode contains keyword to bool
-mask = (df['contains_keyword'] != 'none')
-df.loc[mask,'contains_keyword'] = True
-df.loc[~mask,'contains_keyword'] = False # ~ negates mask, selecting all values that do not contain keywords
+# create boolean contains_keyword column
+df['contains_keyword'] = True
+mask = (df['keywords'].isna()) # select all rows for which no keyword was found
+df.loc[mask,'contains_keyword'] = False # and mark those rows as not containing a keyword
+#%%
 
 pd.Series(df["user.id"]).is_unique
 
 #%%
@@ -157,10 +155,14 @@ dfAll = df.merge(dfSenAll, how='left',on='user.username')
 unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique()
 print(unique_usernames)
 # senatorisakson was dropped, is ok
+#%%
+# create covidtweets csv
+dfCov = dfAll[dfAll['contains_keyword']==True]
 
 #%%
 # Export to csv, sav and dta
 dfAll.to_csv(senCSVcPath, encoding='utf-8')
+dfCov.to_csv(senCSVcCovPath, encoding='utf-8')
 # pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb
 # =============================================================================
 # dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
diff --git a/createGraphs.py b/createGraphs.py
new file mode 100644
index 0000000..5cb71dc
--- /dev/null
+++ b/createGraphs.py
@@ -0,0 +1,144 @@
+#%%
+#!/usr/bin/env python3
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
+import string
+#%%
+
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 26 20:36:43 2023
+
+@author: michael
+"""
+
+import pandas as pd
+# import pyreadstat
+# import numpy as np
+
+###################
+# Setup directories
+# WD Michael
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# datafile input directory
+di = "data/IN/"
+
+# Tweet-datafile output directory
+ud = "data/OUT/"
+
+# Name of file that all senator data will be written to
+senCSV = "SenatorsTweets-OnlyCov.csv" # SenatorsTweets-Final.csv SenatorsTweets-OnlyCov.csv
+
+# Name of file containing the raw senator dataset
+senDataset = "senators-raw.csv"
+
+# Name of new datafile generated
+senCSVc = "SenatorsTweets-Final.csv"
+senCSVcCov = "SenatorsTweets-OnlyCov.csv"
+
+# Outfiles
+wcAllTweetsF = "graphs/Wordcloud-All.png"
+wcCovTweetsF = "graphs/Wordcloud-Cov.png"
+TwCovTimeline = "graphs/Timeline.png"
+
+# don't change this one
+senCSVcPath = wd + ud + senCSVc
+senCSVcCovPath = wd + ud + senCSVcCov
+wcAllTweetsFPath = wd + ud + wcAllTweetsF
+wcCovTweetsFPath = wd + ud + wcCovTweetsF
+TwCovTimelinePath = wd + ud + TwCovTimeline
+
+#%%
+df = pd.read_csv(senCSVcPath, dtype=(object))
+dfCov = pd.read_csv(senCSVcCovPath, dtype=(object))
+#%%
+df['cleanContent'] = df['rawContent'].apply(remove_URL)
+df['cleanContent'] = df['cleanContent'].apply(remove_emoji)
+df['cleanContent'] = df['cleanContent'].apply(remove_html)
+df['cleanContent'] = df['cleanContent'].apply(remove_punct)
+
+# create string with all cleaned tweets as text
+str_alltweets = df['cleanContent'].astype(str).str.cat(sep=' ').casefold()
+#%%
+dfCov['cleanContent'] = dfCov['rawContent'].apply(remove_URL)
+dfCov['cleanContent'] = dfCov['cleanContent'].apply(remove_emoji)
+dfCov['cleanContent'] = dfCov['cleanContent'].apply(remove_html)
+dfCov['cleanContent'] = dfCov['cleanContent'].apply(remove_punct)
+
+# create string with all cleaned covid tweets as text
+str_covtweets = dfCov['cleanContent'].astype(str).str.cat(sep=' ').casefold()
+#%%
+# drop single u and s tokens left over after punctuation removal
+str_covtweets = str_covtweets.replace(' u ', ' ')
+str_covtweets = str_covtweets.replace(' s ', ' ')
+str_alltweets = str_alltweets.replace(' u ', ' ')
+str_alltweets = str_alltweets.replace(' s ', ' ')
+
+
+# %%
+# create wordcloud alltweets
+wcA = WordCloud(background_color="white", width=1000, height=1000, repeat=True)
+wcA.generate(str_alltweets)
+
+#%%
+# draw
+plt.figure(figsize=(20,20))
+plt.axis("off") +plt.imshow(wcA, interpolation="bilinear") +fig1 = plt.gcf() +plt.show() +fig1.savefig(wcAllTweetsFPath) + +# %% +# create wordcloud covtweets +wcC = WordCloud(background_color="white", width=1000, height=1000, repeat=True) +wcC.generate(str_covtweets) +#%% +# draw +plt.figure( figsize=(20,20)) +plt.axis("off") +plt.imshow(wcC, interpolation="bilinear") +fig2 = plt.gcf() +plt.show() +fig2.savefig(wcCovTweetsFPath) +# %% +# with open('test.txt', 'w') as f: +# f.write(str_covtweets) +# %% +dfT = pd.DataFrame() +dfT['date'] = df['date'].copy() +dfT['count'] = 1 + +dfCovT = pd.DataFrame() +dfCovT['date'] = dfCov['date'].copy() +dfCovT['count'] = 1 +#%% +dfT['date'] = pd.to_datetime(dfT['date']).dt.strftime('%Y-%m-%d') +dfCovT['date'] = pd.to_datetime(dfCovT['date']).dt.strftime('%Y-%m-%d') + +#%% +dfT = dfT.groupby('date').count().reset_index() +dfCovT = dfCovT.groupby('date').count().reset_index() + +#%% +import matplotlib.dates as mdates +# n of tweets overall +my_dpi=300 +plt.figure(figsize=(1000/my_dpi, 1500/my_dpi), dpi=my_dpi) +plt.style.use('seaborn-darkgrid') +fig, ax = plt.subplots(figsize=(8, 6)) +ax.plot(dfCovT['date'], dfCovT['count'], marker='', color='tab:blue', linewidth=1, alpha=0.4) +ax.plot(dfT['date'], dfT['count'], marker='', color='tab:blue', linewidth=1, alpha=1) +ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3)) +ax.xaxis.set_minor_locator(mdates.MonthLocator()) +fig.autofmt_xdate() +fig.savefig(TwCovTimelinePath) + + +# %% diff --git a/funs/CleanTweets.py b/funs/CleanTweets.py new file mode 100644 index 0000000..bf87e03 --- /dev/null +++ b/funs/CleanTweets.py @@ -0,0 +1,39 @@ +import re +import string + +def remove_URL(text): + url = re.compile(r'https?://\S+|www\.\S+') + return url.sub(r'', text) + + +def remove_emoji(text): + emoji_pattern = re.compile( + '[' + u'\U0001F600-\U0001F64F' # emoticons + u'\U0001F300-\U0001F5FF' # symbols & pictographs + u'\U0001F680-\U0001F6FF' # transport & map symbols + u'\U0001F1E0-\U0001F1FF' # flags (iOS) + u'\U00002702-\U000027B0' + u'\U000024C2-\U0001F251' + ']+', + flags=re.UNICODE) + return emoji_pattern.sub(r'', text) + + +def remove_html(text): + html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});') + return re.sub(html, '', text) + + +def remove_punct(text): + table = str.maketrans('', '', string.punctuation) + return text.translate(table) + +def clean_all(text): + if not isinstance(text, str): + text = str(text) # Convert non-string values to string + text = remove_URL(text) + text = remove_emoji(text) + text = remove_html(text) + text = remove_punct(text) + return text