adds CleanTweets functions, creates Graphs
This commit is contained in:
		
							
								
								
									
										28
									
								
								analyze.py
									
									
									
									
									
								
							
							
						
						
									
										28
									
								
								analyze.py
									
									
									
									
									
								
							| @@ -5,6 +5,7 @@ import pandas as pd | ||||
| from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline | ||||
| from datasets import load_dataset | ||||
| from transformers.pipelines.pt_utils import KeyDataset | ||||
| from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct | ||||
|  | ||||
|  | ||||
| #%% | ||||
| @@ -82,33 +83,6 @@ model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter- | ||||
| tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth") | ||||
|  | ||||
| # Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert | ||||
| def remove_URL(text): | ||||
|     url = re.compile(r'https?://\S+|www\.\S+') | ||||
|     return url.sub(r'', text) | ||||
|  | ||||
|  | ||||
| def remove_emoji(text): | ||||
|     emoji_pattern = re.compile( | ||||
|         '[' | ||||
|         u'\U0001F600-\U0001F64F'  # emoticons | ||||
|         u'\U0001F300-\U0001F5FF'  # symbols & pictographs | ||||
|         u'\U0001F680-\U0001F6FF'  # transport & map symbols | ||||
|         u'\U0001F1E0-\U0001F1FF'  # flags (iOS) | ||||
|         u'\U00002702-\U000027B0' | ||||
|         u'\U000024C2-\U0001F251' | ||||
|         ']+', | ||||
|         flags=re.UNICODE) | ||||
|     return emoji_pattern.sub(r'', text) | ||||
|  | ||||
|  | ||||
| def remove_html(text): | ||||
|     html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});') | ||||
|     return re.sub(html, '', text) | ||||
|  | ||||
|  | ||||
| def remove_punct(text): | ||||
|     table = str.maketrans('', '', string.punctuation) | ||||
|     return text.translate(table) | ||||
|  | ||||
| dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL) | ||||
| dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji) | ||||
|   | ||||
| @@ -8,7 +8,7 @@ Created on Mon Jun 26 20:36:43 2023 | ||||
|  | ||||
| import pandas as pd | ||||
| # import pyreadstat | ||||
| # import numpy as np | ||||
| import numpy as np | ||||
| from funs.ClearDupes import deDupe | ||||
|  | ||||
|  | ||||
| @@ -32,11 +32,13 @@ senCSV = "ALL-SENATORS-TWEETS.csv" | ||||
| senDataset = "senators-raw.csv" | ||||
|  | ||||
| # Name of new datafile generated | ||||
| senCSVc = "Tweets-Cleaned" | ||||
| senCSVc = "SenatorsTweets-Final" | ||||
| senCSVcCov = "SenatorsTweets-OnlyCov" | ||||
|  | ||||
| # don't change this one | ||||
| senCSVPath = wd + ud + senCSV | ||||
| senCSVcPath = wd + ud + senCSV + ".csv" | ||||
| senCSVcPath = wd + ud + senCSVc + ".csv" | ||||
| senCSVcCovPath = wd + ud + senCSVcCov + ".csv" | ||||
| senSAVcPath = wd + ud + senCSV + ".sav" | ||||
| senDTAcPath = wd + ud + senCSV + ".dta" | ||||
| senDatasetPath = wd + di + senDataset | ||||
| @@ -90,21 +92,17 @@ with open(f"{di}keywords.txt", "w") as file: | ||||
|  | ||||
| #%% | ||||
| # overwrite keyword column | ||||
| df['contains_keyword'] = '' | ||||
| df['contains_keyword'] = ( | ||||
|     df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', 'none') | ||||
| df['keywords'] = np.nan | ||||
| df['keywords'] = ( | ||||
|     df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) | ||||
| ) | ||||
| mask = (df['contains_keyword'] != 'none') # select all values in contains_keyword == 'none' | ||||
| df.loc[mask,'keywords'] = df['contains_keyword'] # set keywords = contains_keyword under the condition of mask | ||||
| #%% | ||||
| # create bool contains_keyword | ||||
| df['contains_keyword'] = ~pd.isnull(df['keywords']) # create boolean column | ||||
| #%% | ||||
| # recode contains keyword to bool | ||||
| mask = (df['contains_keyword'] != 'none') | ||||
| df.loc[mask,'contains_keyword'] = True | ||||
| df.loc[~mask,'contains_keyword'] = False # ~ negates mask, selecting all values that do not contain keywords | ||||
| # create boolean contains_keyword column | ||||
| df['contains_keyword'] = True | ||||
| mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none' | ||||
| df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask | ||||
|  | ||||
| #%% | ||||
| pd.Series(df["user.id"]).is_unique | ||||
|  | ||||
| #%% | ||||
| @@ -157,10 +155,14 @@ dfAll = df.merge(dfSenAll, how='left',on='user.username') | ||||
| unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique() | ||||
| print(unique_usernames) | ||||
| # senatorisakson was dropped, is ok | ||||
| #%% | ||||
| # create covidtweets csv | ||||
| dfCov = dfAll[dfAll['contains_keyword']==True] | ||||
|  | ||||
| #%% | ||||
| # Export to csv, sav and dta | ||||
| dfAll.to_csv(senCSVcPath, encoding='utf-8') | ||||
| dfCov.to_csv(senCSVcCovPath, encoding='utf-8') | ||||
| # pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb  | ||||
| # ============================================================================= | ||||
| # dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True) | ||||
|   | ||||
							
								
								
									
										144
									
								
								createGraphs.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										144
									
								
								createGraphs.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,144 @@ | ||||
#%%
#!/usr/bin/env python3
# NOTE(review): the shebang is not on line 1 of the file, so it has no effect here.
# Purpose: load the cleaned senator-tweet CSVs, build word clouds for all tweets
# and COVID-related tweets, and plot a daily tweet-count timeline to PNG files.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
import string
#%%

# -*- coding: utf-8 -*-
"""
Created on Mon Jun 26 20:36:43 2023

@author: michael
"""

# NOTE(review): pandas is already imported above; this duplicate import is
# harmless but redundant.
import pandas as pd
# import pyreadstat
# import numpy as np

###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv" # SenatorsTweets-Final.csv SenatorsTweets-OnlyCov.csv

# Name of file that all senator data will be written to
senDataset = "senators-raw.csv"

# Name of new datafile generated
senCSVc = "SenatorsTweets-Final.csv"
senCSVcCov = "SenatorsTweets-OnlyCov.csv"

# Outfiles (relative to the output directory)
wcAllTweetsF = "graphs/Wordcloud-All.png"
wcCovTweetsF = "graphs/Wordcloud-Cov.png"
TwCovTimeline = "graphs/Timeline.png"

# don't change this one
senCSVcPath = wd + ud + senCSVc
senCSVcCovPath = wd + ud + senCSVcCov
wcAllTweetsFPath = wd + ud + wcAllTweetsF
wcCovTweetsFPath = wd + ud + wcCovTweetsF
TwCovTimelinePath = wd + ud + TwCovTimeline

#%%
# Read everything as object dtype so no pandas type inference is applied.
df = pd.read_csv(senCSVcPath, dtype=(object))
dfCov = pd.read_csv(senCSVcCovPath, dtype=(object))
#%%
# Clean the full tweet set: strip URLs, emoji, HTML, then punctuation.
df['cleanContent'] = df['rawContent'].apply(remove_URL)
df['cleanContent'] = df['cleanContent'].apply(remove_emoji)
df['cleanContent'] = df['cleanContent'].apply(remove_html)
df['cleanContent'] = df['cleanContent'].apply(remove_punct)

# create string with all cleaned tweets as text
str_alltweets = df['cleanContent'].astype(str).str.cat(sep=' ').casefold()
#%%
# Same cleaning pipeline for the COVID-only subset.
dfCov['cleanContent'] = dfCov['rawContent'].apply(remove_URL)
dfCov['cleanContent'] = dfCov['cleanContent'].apply(remove_emoji)
dfCov['cleanContent'] = dfCov['cleanContent'].apply(remove_html)
dfCov['cleanContent'] = dfCov['cleanContent'].apply(remove_punct)

# create string with all cleaned tweets as text
str_covtweets = dfCov['cleanContent'].astype(str).str.cat(sep=' ').casefold()
#%%
# replace single U and S characters (leftovers from punctuation stripping,
# e.g. "U.S." -> " u s ")
str_covtweets = str_covtweets.replace(' u ', ' ')
str_covtweets = str_covtweets.replace(' s ', ' ')
str_alltweets = str_alltweets.replace(' u ', ' ')
str_alltweets = str_alltweets.replace(' s ', ' ')


# %%
# create wordcloud alltweets
wcA = WordCloud(background_color="white", width=1000, height=1000, repeat=True)
wcA.generate(str_alltweets)

#%%
# draw
plt.figure( figsize=(20,20))
plt.axis("off")
plt.imshow(wcA, interpolation="bilinear")
fig1 = plt.gcf()
plt.show()
fig1.savefig(wcAllTweetsFPath)

# %%
# create wordcloud covtweets
wcC = WordCloud(background_color="white", width=1000, height=1000, repeat=True)
wcC.generate(str_covtweets)
#%%
# draw
plt.figure( figsize=(20,20))
plt.axis("off")
plt.imshow(wcC, interpolation="bilinear")
fig2 = plt.gcf()
plt.show()
fig2.savefig(wcCovTweetsFPath)
# %%
# with open('test.txt', 'w') as f:
#    f.write(str_covtweets)
# %%
# Build per-day tweet counts: one row per tweet with count=1, then group by day.
dfT = pd.DataFrame()
dfT['date'] = df['date'].copy()
dfT['count'] = 1

dfCovT = pd.DataFrame()
dfCovT['date'] = dfCov['date'].copy()
dfCovT['count'] = 1
#%%
# Normalize timestamps to day granularity (string 'YYYY-MM-DD').
dfT['date'] = pd.to_datetime(dfT['date']).dt.strftime('%Y-%m-%d')
dfCovT['date'] = pd.to_datetime(dfCovT['date']).dt.strftime('%Y-%m-%d')

#%%
dfT = dfT.groupby('date').count().reset_index()
dfCovT = dfCovT.groupby('date').count().reset_index()

#%%
import matplotlib.dates as mdates
# n of tweets overall
my_dpi=300
# NOTE(review): this plt.figure(...) is never referenced again —
# plt.subplots() below creates its own figure, so the sizing here is unused.
plt.figure(figsize=(1000/my_dpi, 1500/my_dpi), dpi=my_dpi)
# NOTE(review): 'seaborn-darkgrid' was removed in matplotlib >= 3.6
# (renamed 'seaborn-v0_8-darkgrid') — confirm the matplotlib version in use.
plt.style.use('seaborn-darkgrid')
fig, ax = plt.subplots(figsize=(8, 6))
# COVID tweets drawn faint (alpha=0.4), all tweets drawn solid, same color.
ax.plot(dfCovT['date'], dfCovT['count'], marker='', color='tab:blue', linewidth=1, alpha=0.4)
ax.plot(dfT['date'], dfT['count'], marker='', color='tab:blue', linewidth=1, alpha=1)
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.xaxis.set_minor_locator(mdates.MonthLocator())
fig.autofmt_xdate()
fig.savefig(TwCovTimelinePath)


# %%
							
								
								
									
										39
									
								
								funs/CleanTweets.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										39
									
								
								funs/CleanTweets.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,39 @@ | ||||
| import re | ||||
| import string | ||||
|  | ||||
def remove_URL(text):
    """Return *text* with http(s):// and www. URLs stripped out."""
    return re.sub(r'https?://\S+|www\.\S+', '', text)
|  | ||||
|  | ||||
def remove_emoji(text):
    """Return *text* with common emoji/pictograph code points removed."""
    pattern = (
        '['
        '\U0001F600-\U0001F64F'  # emoticons
        '\U0001F300-\U0001F5FF'  # symbols & pictographs
        '\U0001F680-\U0001F6FF'  # transport & map symbols
        '\U0001F1E0-\U0001F1FF'  # flags (iOS)
        '\U00002702-\U000027B0'
        '\U000024C2-\U0001F251'
        ']+'
    )
    return re.sub(pattern, '', text, flags=re.UNICODE)
|  | ||||
|  | ||||
def remove_html(text):
    """Return *text* with HTML tags and character entities deleted."""
    pattern = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return pattern.sub('', text)
|  | ||||
|  | ||||
def remove_punct(text):
    """Return *text* with every ASCII punctuation character removed."""
    return text.translate(str.maketrans('', '', string.punctuation))
|  | ||||
def clean_all(text):
    """Run the full cleaning pipeline: URLs, emoji, HTML, then punctuation.

    Non-string input (e.g. NaN from pandas) is coerced to str first.
    """
    cleaned = text if isinstance(text, str) else str(text)
    for step in (remove_URL, remove_emoji, remove_html, remove_punct):
        cleaned = step(cleaned)
    return cleaned
		Reference in New Issue
	
	Block a user
	 Michael Beck
					Michael Beck