adds CleanTweets functions, creates Graphs
parent 817ec48478
commit 899a99ba72
analyze.py (28 changes)
@@ -5,6 +5,7 @@ import pandas as pd
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 from datasets import load_dataset
 from transformers.pipelines.pt_utils import KeyDataset
+from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
 
 
 #%%
@@ -82,33 +83,6 @@ model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-
 tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
 
 # Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
-def remove_URL(text):
-    url = re.compile(r'https?://\S+|www\.\S+')
-    return url.sub(r'', text)
-
-
-def remove_emoji(text):
-    emoji_pattern = re.compile(
-        '['
-        u'\U0001F600-\U0001F64F'  # emoticons
-        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
-        u'\U0001F680-\U0001F6FF'  # transport & map symbols
-        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
-        u'\U00002702-\U000027B0'
-        u'\U000024C2-\U0001F251'
-        ']+',
-        flags=re.UNICODE)
-    return emoji_pattern.sub(r'', text)
-
-
-def remove_html(text):
-    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
-    return re.sub(html, '', text)
-
-
-def remove_punct(text):
-    table = str.maketrans('', '', string.punctuation)
-    return text.translate(table)
-
 dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
 dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
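With the cleaning helpers moved out of analyze.py into funs/CleanTweets.py, the script keeps only the import and the .apply() calls. A minimal sketch of the resulting call pattern; the one-row dfPreTest here is illustrative, not from the commit:

    import pandas as pd
    from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct

    dfPreTest = pd.DataFrame({'rawContent': ['Stay home! https://t.co/x &amp; <b>mask up</b>']})
    dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
    dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
    # remove_html and remove_punct chain the same way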
(second modified file; filename missing from this view)

@@ -8,7 +8,7 @@ Created on Mon Jun 26 20:36:43 2023
 
 import pandas as pd
 # import pyreadstat
-# import numpy as np
+import numpy as np
 from funs.ClearDupes import deDupe
 
 
@@ -32,11 +32,13 @@ senCSV = "ALL-SENATORS-TWEETS.csv"
 senDataset = "senators-raw.csv"
 
 # Name of new datafile generated
-senCSVc = "Tweets-Cleaned"
+senCSVc = "SenatorsTweets-Final"
+senCSVcCov = "SenatorsTweets-OnlyCov"
 
 # don't change this one
 senCSVPath = wd + ud + senCSV
-senCSVcPath = wd + ud + senCSV + ".csv"
+senCSVcPath = wd + ud + senCSVc + ".csv"
+senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
 senSAVcPath = wd + ud + senCSV + ".sav"
 senDTAcPath = wd + ud + senCSV + ".dta"
 senDatasetPath = wd + di + senDataset
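The senCSVcPath change also fixes a path bug: the old line appended ".csv" to the input file name instead of the new name, so the "cleaned" path collided with the raw file. A worked expansion of both versions (paths taken from the diff):

    wd, ud = "/home/michael/Documents/PS/Data/collectTweets/", "data/OUT/"
    senCSV, senCSVc = "ALL-SENATORS-TWEETS.csv", "SenatorsTweets-Final"
    print(wd + ud + senCSV + ".csv")   # old: .../data/OUT/ALL-SENATORS-TWEETS.csv.csv
    print(wd + ud + senCSVc + ".csv")  # new: .../data/OUT/SenatorsTweets-Final.csv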
@@ -90,21 +92,17 @@ with open(f"{di}keywords.txt", "w") as file:
 
 #%%
 # overwrite keyword column
-df['contains_keyword'] = ''
-df['contains_keyword'] = (
-    df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', 'none')
+df['keywords'] = np.nan
+df['keywords'] = (
+    df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', np.nan)
 )
-mask = (df['contains_keyword'] != 'none') # select all values in contains_keyword == 'none'
-df.loc[mask,'keywords'] = df['contains_keyword'] # set keywords = contains_keyword under the condition of mask
 #%%
-# create bool contains_keyword
-df['contains_keyword'] = ~pd.isnull(df['keywords']) # create boolean column
-#%%
-# recode contains keyword to bool
-mask = (df['contains_keyword'] != 'none')
-df.loc[mask,'contains_keyword'] = True
-df.loc[~mask,'contains_keyword'] = False # ~ negates mask, selecting all values that do not contain keywords
+# create boolean contains_keyword column
+df['contains_keyword'] = True
+mask = (df['keywords'].isna()) # select rows where no keyword was found
+df.loc[mask,'contains_keyword'] = False # mark rows without a keyword match
 
+#%%
 pd.Series(df["user.id"]).is_unique
 
 #%%
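The reworked block stores the matched keywords themselves (comma-joined, NaN when nothing matched) and derives the boolean flag from that, instead of round-tripping through a 'none' sentinel. A small sketch of the same logic on toy data; the keywords list here is assumed, the real one comes from keywords.txt:

    import numpy as np
    import pandas as pd

    keywords = ['covid', 'vaccine']  # illustrative
    df = pd.DataFrame({'rawContent': ['covid vaccine rollout', 'unrelated tweet']})
    df['keywords'] = (
        df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', np.nan)
    )
    df['contains_keyword'] = True
    df.loc[df['keywords'].isna(), 'contains_keyword'] = False
    # row 0: keywords='covid,vaccine', contains_keyword=True
    # row 1: keywords=NaN,             contains_keyword=False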
@@ -157,10 +155,14 @@ dfAll = df.merge(dfSenAll, how='left',on='user.username')
 unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique()
 print(unique_usernames)
 # senatorisakson was dropped, is ok
+#%%
+# create covidtweets csv
+dfCov = dfAll[dfAll['contains_keyword']==True]
 
 #%%
 # Export to csv, sav and dta
 dfAll.to_csv(senCSVcPath, encoding='utf-8')
+dfCov.to_csv(senCSVcCovPath, encoding='utf-8')
 # pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb
 # =============================================================================
 # dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
createGraphs.py (new file, 144 lines)

@@ -0,0 +1,144 @@
+#%%
+#!/usr/bin/env python3
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
+import string
+#%%
+
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 26 20:36:43 2023
+
+@author: michael
+"""
+
+import pandas as pd
+# import pyreadstat
+# import numpy as np
+
+###################
+# Setup directories
+# WD Michael
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# datafile input directory
+di = "data/IN/"
+
+# Tweet-datafile output directory
+ud = "data/OUT/"
+
+# Name of file that all senator data will be written to
+senCSV = "SenatorsTweets-OnlyCov.csv" # SenatorsTweets-Final.csv SenatorsTweets-OnlyCov.csv
+
+# Name of the raw senator dataset
+senDataset = "senators-raw.csv"
+
+# Name of new datafile generated
+senCSVc = "SenatorsTweets-Final.csv"
+senCSVcCov = "SenatorsTweets-OnlyCov.csv"
+
+# Outfiles
+wcAllTweetsF = "graphs/Wordcloud-All.png"
+wcCovTweetsF = "graphs/Wordcloud-Cov.png"
+TwCovTimeline = "graphs/Timeline.png"
+
+# don't change this one
+senCSVcPath = wd + ud + senCSVc
+senCSVcCovPath = wd + ud + senCSVcCov
+wcAllTweetsFPath = wd + ud + wcAllTweetsF
+wcCovTweetsFPath = wd + ud + wcCovTweetsF
+TwCovTimelinePath = wd + ud + TwCovTimeline
+
+#%%
+df = pd.read_csv(senCSVcPath, dtype=object)
+dfCov = pd.read_csv(senCSVcCovPath, dtype=object)
+#%%
+df['cleanContent'] = df['rawContent'].apply(remove_URL)
+df['cleanContent'] = df['cleanContent'].apply(remove_emoji)
+df['cleanContent'] = df['cleanContent'].apply(remove_html)
+df['cleanContent'] = df['cleanContent'].apply(remove_punct)
+
+# create string with all cleaned tweets as text
+str_alltweets = df['cleanContent'].astype(str).str.cat(sep=' ').casefold()
+#%%
+dfCov['cleanContent'] = dfCov['rawContent'].apply(remove_URL)
+dfCov['cleanContent'] = dfCov['cleanContent'].apply(remove_emoji)
+dfCov['cleanContent'] = dfCov['cleanContent'].apply(remove_html)
+dfCov['cleanContent'] = dfCov['cleanContent'].apply(remove_punct)
+
+# create string with all cleaned covid tweets as text
+str_covtweets = dfCov['cleanContent'].astype(str).str.cat(sep=' ').casefold()
+#%%
+# remove single 'u' and 's' tokens left over after punctuation removal
+str_covtweets = str_covtweets.replace(' u ', ' ')
+str_covtweets = str_covtweets.replace(' s ', ' ')
+str_alltweets = str_alltweets.replace(' u ', ' ')
+str_alltweets = str_alltweets.replace(' s ', ' ')
+
+
+# %%
+# create wordcloud alltweets
+wcA = WordCloud(background_color="white", width=1000, height=1000, repeat=True)
+wcA.generate(str_alltweets)
+
+#%%
+# draw
+plt.figure(figsize=(20, 20))
+plt.axis("off")
+plt.imshow(wcA, interpolation="bilinear")
+fig1 = plt.gcf()
+plt.show()
+fig1.savefig(wcAllTweetsFPath)
+
+# %%
+# create wordcloud covtweets
+wcC = WordCloud(background_color="white", width=1000, height=1000, repeat=True)
+wcC.generate(str_covtweets)
+#%%
+# draw
+plt.figure(figsize=(20, 20))
+plt.axis("off")
+plt.imshow(wcC, interpolation="bilinear")
+fig2 = plt.gcf()
+plt.show()
+fig2.savefig(wcCovTweetsFPath)
+# %%
+# with open('test.txt', 'w') as f:
+#     f.write(str_covtweets)
+# %%
+dfT = pd.DataFrame()
+dfT['date'] = df['date'].copy()
+dfT['count'] = 1
+
+dfCovT = pd.DataFrame()
+dfCovT['date'] = dfCov['date'].copy()
+dfCovT['count'] = 1
+#%%
+dfT['date'] = pd.to_datetime(dfT['date']).dt.strftime('%Y-%m-%d')
+dfCovT['date'] = pd.to_datetime(dfCovT['date']).dt.strftime('%Y-%m-%d')
+
+#%%
+dfT = dfT.groupby('date').count().reset_index()
+dfCovT = dfCovT.groupby('date').count().reset_index()
+
+#%%
+import matplotlib.dates as mdates
+# n of tweets overall
+my_dpi = 300
+plt.figure(figsize=(1000/my_dpi, 1500/my_dpi), dpi=my_dpi)
+plt.style.use('seaborn-darkgrid')
+fig, ax = plt.subplots(figsize=(8, 6))
+ax.plot(dfCovT['date'], dfCovT['count'], marker='', color='tab:blue', linewidth=1, alpha=0.4)
+ax.plot(dfT['date'], dfT['count'], marker='', color='tab:blue', linewidth=1, alpha=1)
+ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
+ax.xaxis.set_minor_locator(mdates.MonthLocator())
+fig.autofmt_xdate()
+fig.savefig(TwCovTimelinePath)
+
+
+# %%
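createGraphs.py builds the timeline by stamping every tweet with a count of 1, truncating timestamps to days, and collapsing to daily totals via groupby().count(). A condensed sketch of that aggregation with toy dates:

    import pandas as pd

    dfT = pd.DataFrame({'date': ['2020-03-01 10:00', '2020-03-01 12:30', '2020-03-02 09:15']})
    dfT['count'] = 1
    dfT['date'] = pd.to_datetime(dfT['date']).dt.strftime('%Y-%m-%d')
    dfT = dfT.groupby('date').count().reset_index()
    # -> 2020-03-01: 2, 2020-03-02: 1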
funs/CleanTweets.py (new file, 39 lines)

@@ -0,0 +1,39 @@
+import re
+import string
+
+def remove_URL(text):
+    url = re.compile(r'https?://\S+|www\.\S+')
+    return url.sub(r'', text)
+
+
+def remove_emoji(text):
+    emoji_pattern = re.compile(
+        '['
+        u'\U0001F600-\U0001F64F'  # emoticons
+        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
+        u'\U0001F680-\U0001F6FF'  # transport & map symbols
+        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
+        u'\U00002702-\U000027B0'
+        u'\U000024C2-\U0001F251'
+        ']+',
+        flags=re.UNICODE)
+    return emoji_pattern.sub(r'', text)
+
+
+def remove_html(text):
+    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
+    return re.sub(html, '', text)
+
+
+def remove_punct(text):
+    table = str.maketrans('', '', string.punctuation)
+    return text.translate(table)
+
+def clean_all(text):
+    if not isinstance(text, str):
+        text = str(text)  # convert non-string values (e.g. NaN) to string
+    text = remove_URL(text)
+    text = remove_emoji(text)
+    text = remove_html(text)
+    text = remove_punct(text)
+    return text
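funs/CleanTweets.py also bundles the four helpers into clean_all, coercing non-string input first, so a whole column can be cleaned in a single .apply(). A quick usage sketch (input string is illustrative):

    from funs.CleanTweets import clean_all

    raw = 'Masks help! https://t.co/x &amp; see <b>CDC</b>'
    print(clean_all(raw))  # -> 'Masks help    see CDC' (removed tokens leave extra spaces)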