145 lines
4.0 KiB
Python
145 lines
4.0 KiB
Python
#%%
|
|
#!/usr/bin/env python3
|
|
import numpy as np
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
from wordcloud import WordCloud
|
|
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
|
|
import string
|
|
#%%
|
|
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Mon Jun 26 20:36:43 2023
|
|
|
|
@author: michael
|
|
"""
|
|
|
|
import pandas as pd
|
|
# import pyreadstat
|
|
# import numpy as np
|
|
|
|
###################
# Setup directories
#
# All locations used by this script are defined here. The *Path names at the
# bottom are derived from the pieces above them and are what the rest of the
# script reads from / writes to.
import os

# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory (relative to wd)
di = "data/IN/"

# Tweet-datafile output directory (relative to wd)
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv"  # SenatorsTweets-Final.csv SenatorsTweets-OnlyCov.csv

# Name of the senator dataset file.
# NOTE(review): the previous comment here was a copy of the one on senCSV;
# presumably this is the raw senator input table - confirm. It is not used
# in the visible part of this script.
senDataset = "senators-raw.csv"

# Names of the tweet datafiles read below (all tweets / covid-only tweets)
senCSVc = "SenatorsTweets-Final.csv"
senCSVcCov = "SenatorsTweets-OnlyCov.csv"

# Outfiles (relative to the output directory)
wcAllTweetsF = "graphs/Wordcloud-All.png"
wcCovTweetsF = "graphs/Wordcloud-Cov.png"
TwCovTimeline = "graphs/Timeline.png"

# don't change this one
# os.path.join inserts a separator only where one is missing, so the results
# are the same strings as plain concatenation today, but stay correct if a
# trailing slash is ever dropped from wd or ud.
senCSVcPath = os.path.join(wd, ud, senCSVc)
senCSVcCovPath = os.path.join(wd, ud, senCSVcCov)
wcAllTweetsFPath = os.path.join(wd, ud, wcAllTweetsF)
wcCovTweetsFPath = os.path.join(wd, ud, wcCovTweetsF)
TwCovTimelinePath = os.path.join(wd, ud, TwCovTimeline)
|
|
|
|
#%%
|
|
# Load both tweet tables. dtype=object keeps every column exactly as read,
# without pandas inferring numeric or other types.
df = pd.read_csv(senCSVcPath, dtype=object)        # all tweets
dfCov = pd.read_csv(senCSVcCovPath, dtype=object)  # "OnlyCov" subset
|
|
#%%
|
|
# Build df['cleanContent'] from df['rawContent'] by applying the cleaners
# imported from funs.CleanTweets one after another, in this order:
# remove_URL, remove_emoji, remove_html, remove_punct.
_cleaned_all = df['rawContent']
for _cleaner in (remove_URL, remove_emoji, remove_html, remove_punct):
    _cleaned_all = _cleaned_all.apply(_cleaner)
df['cleanContent'] = _cleaned_all

# create string with all cleaned tweets as text:
# values are cast to str, joined with single spaces, then case-folded
_all_as_text = df['cleanContent'].astype(str)
str_alltweets = _all_as_text.str.cat(sep=' ').casefold()
|
|
#%%
|
|
# Build dfCov['cleanContent'] from dfCov['rawContent'] by applying the
# cleaners imported from funs.CleanTweets one after another, in this order:
# remove_URL, remove_emoji, remove_html, remove_punct.
_cleaned_cov = dfCov['rawContent']
for _cleaner in (remove_URL, remove_emoji, remove_html, remove_punct):
    _cleaned_cov = _cleaned_cov.apply(_cleaner)
dfCov['cleanContent'] = _cleaned_cov

# create string with all cleaned tweets as text:
# values are cast to str, joined with single spaces, then case-folded
_cov_as_text = dfCov['cleanContent'].astype(str)
str_covtweets = _cov_as_text.str.cat(sep=' ').casefold()
|
|
#%%
|
|
# replace single U and S characters
def _drop_single_letters(text, letters=('u', 's')):
    """Return *text* without space-delimited tokens that are in *letters*.

    Chained ``text.replace(' u ', ' ')`` only matches non-overlapping
    occurrences, so in "a u u b" one "u" survives, and a token at the very
    start or end of the text is never matched. Filtering tokens handles all
    of these. Splitting on a literal space (not on any whitespace) keeps
    every other character, including runs of spaces, untouched.
    """
    return ' '.join(tok for tok in text.split(' ') if tok not in letters)


# The texts were case-folded when they were built, so lower-case letters
# are sufficient here.
str_covtweets = _drop_single_letters(str_covtweets)
str_alltweets = _drop_single_letters(str_alltweets)
|
|
|
|
|
|
# %%
|
|
# create wordcloud alltweets
# 1000x1000 cloud on a white background; repeat=True is passed through to
# WordCloud unchanged.
_wc_all_options = {
    "background_color": "white",
    "width": 1000,
    "height": 1000,
    "repeat": True,
}
wcA = WordCloud(**_wc_all_options)
wcA.generate(str_alltweets)

#%%
# draw
# Keep the handle returned by plt.figure so the same figure can still be
# saved after plt.show().
fig1 = plt.figure(figsize=(20, 20))
plt.axis("off")
plt.imshow(wcA, interpolation="bilinear")
plt.show()
fig1.savefig(wcAllTweetsFPath)
|
|
|
|
# %%
|
|
# create wordcloud covtweets
# 1000x1000 cloud on a white background; repeat=True is passed through to
# WordCloud unchanged.
_wc_cov_options = {
    "background_color": "white",
    "width": 1000,
    "height": 1000,
    "repeat": True,
}
wcC = WordCloud(**_wc_cov_options)
wcC.generate(str_covtweets)

#%%
# draw
# Keep the handle returned by plt.figure so the same figure can still be
# saved after plt.show().
fig2 = plt.figure(figsize=(20, 20))
plt.axis("off")
plt.imshow(wcC, interpolation="bilinear")
plt.show()
fig2.savefig(wcCovTweetsFPath)
|
|
# %%
|
|
# with open('test.txt', 'w') as f:
|
|
# f.write(str_covtweets)
|
|
# %%
|
|
def _tweets_per_day(tweets):
    """Return a frame with one row per calendar day and its tweet count.

    Takes the 'date' column of *tweets*, reduces each timestamp to a
    'YYYY-MM-DD' string, and counts rows per day. The result has the
    columns 'date' (string) and 'count'.
    """
    per_tweet = pd.DataFrame()
    per_tweet['date'] = tweets['date'].copy()
    # helper column of ones; counting it per group gives the rows per day
    per_tweet['count'] = 1
    per_tweet['date'] = pd.to_datetime(per_tweet['date']).dt.strftime('%Y-%m-%d')
    return per_tweet.groupby('date').count().reset_index()


# daily counts for all tweets and for the "OnlyCov" subset
dfT = _tweets_per_day(df)
dfCovT = _tweets_per_day(dfCov)
|
|
|
|
#%%
|
|
import matplotlib.dates as mdates

# n of tweets overall
# Timeline of tweets per day: the "OnlyCov" series is drawn faint
# (alpha=0.4), the series for all tweets opaque, both in the same colour.

# Resolution of the saved image. This used to be passed only to an extra,
# empty plt.figure() that was never drawn on, so it had no effect on the
# output; it is now applied when the figure is written.
my_dpi = 300

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later removed the old names; use whichever this installation provides.
_darkgrid = 'seaborn-v0_8-darkgrid'
if _darkgrid not in plt.style.available:
    _darkgrid = 'seaborn-darkgrid'
plt.style.use(_darkgrid)

fig, ax = plt.subplots(figsize=(8, 6))

# The 'date' columns hold 'YYYY-MM-DD' strings. Plotted as strings they
# would be treated as categories: the month locators below would not refer
# to real dates, and the second series would be placed by first appearance
# instead of chronologically. Parse them so the x axis is a true date axis.
ax.plot(pd.to_datetime(dfCovT['date']), dfCovT['count'],
        marker='', color='tab:blue', linewidth=1, alpha=0.4)
ax.plot(pd.to_datetime(dfT['date']), dfT['count'],
        marker='', color='tab:blue', linewidth=1, alpha=1)

# major tick every third month, minor tick every month
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.xaxis.set_minor_locator(mdates.MonthLocator())
fig.autofmt_xdate()
fig.savefig(TwCovTimelinePath, dpi=my_dpi)
|
|
|
|
|
|
# %%
|