adds collect script, keywords and senators csv

2023-06-07 18:02:27 +02:00
parent 08ea3b3f7f
commit a0c8df6a36
3 changed files with 437 additions and 0 deletions
--- a/collect.py
+++ b/collect.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jun  6 11:40:07 2023
+
+@author: michael
+"""
+
+import glob
+import os
+import re
+
+import numpy as np
+import pandas as pd
+import tweepy
+
+## Setup directories
+# The working directory defaults to Michael's local checkout. To run the
+# script somewhere else, export COLLECT_TWEETS_WD instead of editing this file.
+# Locations listed so far:
+#   Michael (default): /home/michael/Documents/PS/Data/collectTweets/
+#   Server:            /home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection
+#   Josie, Sam:        /home/michael/Documents/PS/Data/
+#
+# os.path.join(..., "") guarantees a trailing separator, so plain string
+# concatenation such as `wd + td` keeps producing a valid path even when the
+# override is given without a trailing slash.
+wd = os.path.join(
+    os.environ.get("COLLECT_TWEETS_WD",
+                   "/home/michael/Documents/PS/Data/collectTweets/"),
+    "",
+)
+
+# Tweet-datafile directory (relative to wd)
+td = "data/tweets/"
+
+# All relative paths used below (data/keywords.txt, data/senators-raw.csv,
+# data/tweets/...) are resolved against wd.
+os.chdir(wd)
+
+## Setup Api-connection
+# The bearer token is a secret and must not live in the repository; it is read
+# from the TWITTER_BEARER_TOKEN environment variable.
+# NOTE(review): the token that was previously hard-coded on this line is part
+# of the commit history and should be revoked and replaced.
+bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
+if not bearer_token:
+    raise SystemExit(
+        "TWITTER_BEARER_TOKEN is not set; export the Twitter API bearer token "
+        "before running collect.py"
+    )
+
+# return_type=dict makes the client hand back plain dicts (the tweets are put
+# straight into a DataFrame later); wait_on_rate_limit lets tweepy wait when
+# the rate limit is reached instead of failing.
+client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)
+
+# Define time period of interest
+start_time = '2020-01-01T00:00:00Z'
+end_time = '2023-01-03T00:00:00Z'
+
+# gather keywords @chenTrackingSocialMedia2020
+# line80 ff:  lamsalCoronavirusCOVID19Tweets2020
+# Read the keywords from a file, one keyword per line. Surrounding whitespace
+# (including the newline) is stripped, and blank lines are skipped: an empty
+# keyword would add an empty alternative to the '|'-joined search pattern
+# built from this list, and an empty alternative matches everywhere.
+with open("data/keywords.txt", "r", encoding="utf-8") as file:
+    stripped_lines = (line.strip() for line in file)
+    keywords = [keyword for keyword in stripped_lines if keyword]
+
+# Tweet attributes requested from the API for every tweet (passed as the
+# tweet_fields argument of the search call).
+tweet_fields = [
+    "id", "text", "attachments", "author_id",
+    "context_annotations", "conversation_id", "created_at", "entities",
+    "geo", "lang", "possibly_sensitive", "public_metrics",
+    "referenced_tweets", "reply_settings", "source", "withheld",
+]
+
+# Get accounts & alt-accounts from Senators-Datafile
+# The file is parsed once. Empty cells are loaded by pandas as NaN (a float),
+# which cannot be concatenated into a query string, so they are dropped.
+# NOTE(review): alt_accounts is only printed here; the collection loop below
+# iterates over `accounts` alone - confirm whether alt handles should be
+# collected as well.
+senators = pd.read_csv("data/senators-raw.csv")
+accounts = senators["twitter_handle"].dropna().tolist()
+alt_accounts = senators["alt_handle"].dropna().tolist()
+print(accounts)
+print(alt_accounts)
+
+## Collect the tweets of every account and store them as CSV
+# Keywords are escaped so they are matched literally, and empty entries are
+# ignored: an empty alternative would match at every position of every tweet.
+# NOTE(review): this assumes keywords.txt holds plain terms, not hand-written
+# regular expressions - confirm.
+keyword_pattern = '|'.join(re.escape(keyword) for keyword in keywords if keyword)
+
+# Make sure the output directory exists before the first CSV is written.
+os.makedirs(td, exist_ok=True)
+
+for handle in accounts:
+    # An empty cell in the senators file arrives here as NaN (a float);
+    # there is nothing to query for such an entry.
+    if not isinstance(handle, str) or not handle.strip():
+        continue
+
+    query = "from:"+ handle +" -is:retweet"
+
+    # NOTE(review): flatten(50) stops after 50 tweets per account although
+    # pages of 100 are requested - confirm this limit is intended.
+    tweetlist = list(tweepy.Paginator(client.search_all_tweets,
+                                      query=query,
+                                      tweet_fields=tweet_fields,
+                                      start_time=start_time,
+                                      end_time=end_time,
+                                      max_results=100).flatten(50))
+
+    # Without any tweet the DataFrame has no "public_metrics" column to
+    # unpack, so skip the account instead of failing with a KeyError.
+    if not tweetlist:
+        print("No tweets found for:")
+        print(handle)
+        continue
+
+    all_tweets = pd.DataFrame(tweetlist)
+    # Spread the public_metrics dict of each tweet over one column per metric.
+    all_tweets = all_tweets.join(pd.DataFrame(all_tweets.pop("public_metrics").tolist()))
+
+    all_tweets['handle'] = handle
+
+    ## Extract referenced_tweet info from column
+    # The column only exists when at least one fetched tweet references
+    # another tweet; rows without a reference hold a non-list value (NaN).
+    if 'referenced_tweets' in all_tweets.columns:
+        referenced = all_tweets['referenced_tweets'].tolist()
+    else:
+        referenced = [None] * len(all_tweets)
+    # Only the first referenced tweet of each row is kept.
+    first_refs = [refs[0] if isinstance(refs, list) and refs else None
+                  for refs in referenced]
+    all_tweets['referenced_tweet_type'] = [ref['type'] if ref else None
+                                           for ref in first_refs]
+    all_tweets['referenced_tweet_id'] = [ref['id'] if ref else None
+                                         for ref in first_refs]
+
+    ## Check if tweet contains keyword
+    # Comma-separated list of every keyword occurrence found in the text,
+    # or 'none' when nothing matched (or no keywords are configured).
+    if keyword_pattern:
+        all_tweets['contains_keyword'] = (all_tweets['text'].str.findall(keyword_pattern)
+                                          .str.join(',')
+                                          .replace('', 'none'))
+    else:
+        all_tweets['contains_keyword'] = 'none'
+
+    ## Save two versions of the dataset, one with all fields, one without dict fields
+    csv_path = td + handle + ".csv"
+    csv_path2 = td + handle + "-LONG.csv"
+    all_tweets.to_csv(csv_path2)
+    # errors="ignore": these optional columns are missing when no fetched
+    # tweet carried them, and a plain drop would then raise a KeyError.
+    all_tweets = all_tweets.drop(columns=["context_annotations","entities","referenced_tweets"],
+                                 errors="ignore")
+    all_tweets.to_csv(csv_path)
+    print("Fetched tweets for:")
+    print(handle)
+
+# Merge CSV-Files
+# (it would also have been a possibility to build a dataframe with all senators' tweets but i found the other way around more useful)
+# Paths are built explicitly instead of changing into the tweet directory:
+# writing to td + "..." after a chdir into wd + td pointed at a non-existent
+# nested data/tweets/data/tweets/ directory.
+path_to_tweetdfs = os.path.join(wd, td)
+
+# Merged outputs of an earlier run live in the same directory; they are left
+# out so they are not merged into themselves. Sorting makes the row order of
+# the merged files reproducible.
+tweetfiles = sorted(
+    os.path.basename(path)
+    for path in glob.glob(os.path.join(glob.escape(path_to_tweetdfs), "*.csv"))
+    if not os.path.basename(path).startswith("ALL-SENATORS")
+)
+
+print(tweetfiles)
+
+# save merged csv as two files
+# Frames are collected first and concatenated once, instead of growing a
+# DataFrame inside the loop.
+frames = []
+frames_long = []
+for file in tweetfiles:
+    df = pd.read_csv(os.path.join(path_to_tweetdfs, file))
+    # Per-handle files with all fields end in "-LONG.csv"; testing the suffix
+    # avoids misclassifying a handle that merely contains "LONG".
+    if file.endswith("-LONG.csv"):
+        frames_long.append(df)
+    else:
+        frames.append(df)
+
+# pd.concat raises on an empty list, so fall back to an empty frame.
+df_all_senators = pd.concat(frames) if frames else pd.DataFrame()
+df_all_senators_long = pd.concat(frames_long) if frames_long else pd.DataFrame()
+
+csv_path = os.path.join(path_to_tweetdfs, "ALL-SENATORS.csv")
+# NOTE(review): the doubled "-LONG-LONG" suffix is kept as it was; confirm
+# whether "ALL-SENATORS-LONG.csv" was intended.
+csv_path2 = os.path.join(path_to_tweetdfs, "ALL-SENATORS-LONG-LONG.csv")
+df_all_senators.to_csv(csv_path)
+df_all_senators_long.to_csv(csv_path2)
+
+# Leave the process in the working directory, as before.
+os.chdir(wd)