#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Word clouds and a daily tweet-count timeline for the senator tweet datasets.

Reads the two CSV files produced earlier in the pipeline (all tweets and the
COVID-only subset), cleans the ``rawContent`` column, renders one word cloud
per dataset and plots the number of tweets per day for both datasets.

Created on Mon Jun 26 20:36:43 2023

@author: michael
"""
#%%
import string

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from wordcloud import WordCloud

from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct

#%%
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
# (alternatives: SenatorsTweets-Final.csv, SenatorsTweets-OnlyCov.csv).
# Not read by this script.
senCSV = "SenatorsTweets-OnlyCov.csv"

# Name of the raw senator dataset file. Not read by this script.
senDataset = "senators-raw.csv"

# Names of the tweet datafiles read below (relative to wd + ud)
senCSVc = "SenatorsTweets-Final.csv"
senCSVcCov = "SenatorsTweets-OnlyCov.csv"

# Outfiles (relative to wd + ud)
wcAllTweetsF = "graphs/Wordcloud-All.png"
wcCovTweetsF = "graphs/Wordcloud-Cov.png"
TwCovTimeline = "graphs/Timeline.png"

# don't change this one
senCSVcPath = wd + ud + senCSVc
senCSVcCovPath = wd + ud + senCSVcCov
wcAllTweetsFPath = wd + ud + wcAllTweetsF
wcCovTweetsFPath = wd + ud + wcCovTweetsF
TwCovTimelinePath = wd + ud + TwCovTimeline


#%%
def _clean_tweets(raw):
    """Return *raw* after applying the URL, emoji, HTML and punctuation removers in order."""
    cleaned = raw
    for step in (remove_URL, remove_emoji, remove_html, remove_punct):
        cleaned = cleaned.apply(step)
    return cleaned


def _build_corpus(cleaned):
    """Join all cleaned tweets into one case-folded string for the word cloud."""
    text = cleaned.astype(str).str.cat(sep=' ').casefold()
    # replace single U and S characters
    for token in (' u ', ' s '):
        text = text.replace(token, ' ')
    return text


def _draw_wordcloud(cloud, out_path):
    """Draw *cloud* on a new 20x20 inch figure, save it to *out_path*, show it and return the figure."""
    plt.figure(figsize=(20, 20))
    plt.axis("off")
    plt.imshow(cloud, interpolation="bilinear")
    figure = plt.gcf()
    # Save before show() so the written file never depends on what the
    # backend does with the figure once its window has been handled.
    figure.savefig(out_path)
    plt.show()
    return figure


def _daily_counts(dates):
    """Return a frame with one row per calendar day ('date') and the number of tweets ('count').

    The 'date' column holds real datetimes (time of day set to midnight), not
    formatted strings: string dates would be plotted as unordered categories,
    which breaks both the ordering of the x axis and the month locators.
    """
    frame = pd.DataFrame()
    frame['date'] = pd.to_datetime(dates).dt.normalize()
    frame['count'] = 1
    return frame.groupby('date').count().reset_index()


#%%
df = pd.read_csv(senCSVcPath, dtype=object)
dfCov = pd.read_csv(senCSVcCovPath, dtype=object)

#%%
df['cleanContent'] = _clean_tweets(df['rawContent'])
# create string with all cleaned tweets as text
str_alltweets = _build_corpus(df['cleanContent'])

#%%
dfCov['cleanContent'] = _clean_tweets(dfCov['rawContent'])
# create string with all cleaned tweets as text
str_covtweets = _build_corpus(dfCov['cleanContent'])

# %%
# create wordcloud alltweets
wcA = WordCloud(background_color="white", width=1000, height=1000, repeat=True)
wcA.generate(str_alltweets)

#%%
# draw
fig1 = _draw_wordcloud(wcA, wcAllTweetsFPath)

# %%
# create wordcloud covtweets
wcC = WordCloud(background_color="white", width=1000, height=1000, repeat=True)
wcC.generate(str_covtweets)

#%%
# draw
fig2 = _draw_wordcloud(wcC, wcCovTweetsFPath)

# %%
# with open('test.txt', 'w') as f:
#     f.write(str_covtweets)

# %%
# tweets per day, for all tweets and for the COVID-only subset
dfT = _daily_counts(df['date'])
dfCovT = _daily_counts(dfCov['date'])

#%%
# n of tweets overall
# 'seaborn-darkgrid' was renamed to 'seaborn-v0_8-darkgrid' in Matplotlib 3.6
# and the old name was removed in 3.8; use whichever this installation has.
timeline_style = 'seaborn-v0_8-darkgrid'
if timeline_style not in plt.style.available:
    timeline_style = 'seaborn-darkgrid'
plt.style.use(timeline_style)

fig, ax = plt.subplots(figsize=(8, 6))
# COVID-only counts are drawn faded, overall counts fully opaque
ax.plot(dfCovT['date'], dfCovT['count'], marker='', color='tab:blue', linewidth=1, alpha=0.4)
ax.plot(dfT['date'], dfT['count'], marker='', color='tab:blue', linewidth=1, alpha=1)
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.xaxis.set_minor_locator(mdates.MonthLocator())
# Explicit formatter: the default one is tied to the locator that was just replaced.
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
fig.autofmt_xdate()
fig.savefig(TwCovTimelinePath)
# %%