#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 6 11:40:07 2023

@author: michael
"""

import os
import glob

import pandas as pd
import tweepy

## Setup directories

# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"

# WD Server
# wd = "/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection"

# WD Josie
# wd = "/home/michael/Documents/PS/Data/"

# WD Sam
# wd = "/home/michael/Documents/PS/Data/"

# Tweet-datafile directory
td = "data/tweets/"

os.chdir(wd)

## Setup API connection
bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc"
client = tweepy.Client(bearer_token, return_type=dict, wait_on_rate_limit=True)
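
# Note: client.search_all_tweets (used below) needs full-archive search
# access (the Academic Research track), and wait_on_rate_limit=True makes
# tweepy sleep through rate limits instead of raising. Reading the bearer
# token from an environment variable, e.g.
# os.environ.get("TWITTER_BEARER_TOKEN"), would be safer than hard-coding it.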

# Define time period of interest
start_time = '2020-01-01T00:00:00Z'
end_time = '2023-01-03T00:00:00Z'

# Gather keywords, cf. @chenTrackingSocialMedia2020
# (keywords from line 80 ff. follow @lamsalCoronavirusCOVID19Tweets2020)

# Initialize the keywords list
keywords = []

# Read the keywords from a file, one keyword per line
with open("data/keywords.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip()  # Remove the newline character
        keywords.append(keyword)
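
# A more compact equivalent sketch (same behaviour, assuming one keyword
# per line in the file):
# with open("data/keywords.txt") as file:
#     keywords = [line.strip() for line in file]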

tweet_fields = [
    "id",
    "text",
    "attachments",
    "author_id",
    "context_annotations",
    "conversation_id",
    "created_at",
    "entities",
    "geo",
    "lang",
    "possibly_sensitive",
    "public_metrics",
    "referenced_tweets",
    "reply_settings",
    "source",
    "withheld",
]
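
# These field names map to the v2 API's tweet.fields request parameter; the
# dict-valued ones (entities, public_metrics, referenced_tweets, ...) come
# back as nested objects and are unpacked or dropped further below.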

# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
print(accounts)
print(alt_accounts)
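
# Reading the CSV once would avoid the duplicate disk read, e.g.:
# senators = pd.read_csv("data/senators-raw.csv")
# accounts = senators["twitter_handle"].tolist()
# alt_accounts = senators["alt_handle"].tolist()
# Note: alt_accounts is only printed here; it is not queried below.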

# Fetch each senator's tweets in the time period and save them per senator
for handle in accounts:
    query = "from:" + handle + " -is:retweet"

    tweetlist = []
    # Paginate through the full-archive search; max_results=100 is the page
    # size, flatten(50) caps the total at 50 tweets per senator
    for tweet in tweepy.Paginator(client.search_all_tweets,
                                  query=query,
                                  tweet_fields=tweet_fields,
                                  start_time=start_time,
                                  end_time=end_time,
                                  max_results=100).flatten(50):
        tweetlist.append(tweet)

    all_tweets = pd.DataFrame(tweetlist)
    # Unpack the public_metrics dicts into separate columns
    all_tweets = all_tweets.join(pd.DataFrame(all_tweets.pop("public_metrics").tolist()))

    # Record which senator the tweets belong to
    all_tweets['handle'] = handle

    ## Extract referenced_tweet info from column
    # Create empty columns to store the extracted information
    all_tweets['referenced_tweet_type'] = None
    all_tweets['referenced_tweet_id'] = None

    # Iterate over each row
    for index, row in all_tweets.iterrows():
        referenced_tweets = row['referenced_tweets']

        # Check that referenced_tweets is a non-empty list
        if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
            referenced_tweet = referenced_tweets[0]
            referenced_tweet_type = referenced_tweet['type']
            referenced_tweet_id = referenced_tweet['id']

            # Assign the extracted values to the new columns
            all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
            all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id
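
    # A loop-free equivalent sketch (assuming the column holds lists of
    # dicts or NaN):
    # refs = all_tweets['referenced_tweets'].apply(
    #     lambda r: (r[0]['type'], r[0]['id'])
    #     if isinstance(r, list) and r else (None, None))
    # all_tweets['referenced_tweet_type'], all_tweets['referenced_tweet_id'] = zip(*refs)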

    ## Check if tweet contains keyword
    # Create a new column listing the matched keywords (or 'none')
    all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
                                      .str.join(',')
                                      .replace('', 'none'))
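
    # The pattern above treats each keyword as a regex fragment and matches
    # case-sensitively; a sketch of an escaped, case-insensitive variant:
    # import re
    # pattern = '|'.join(re.escape(k) for k in keywords)
    # all_tweets['contains_keyword'] = (all_tweets['text']
    #                                   .str.findall(pattern, flags=re.IGNORECASE)
    #                                   .str.join(',')
    #                                   .replace('', 'none'))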

    ## Save two versions of the dataset: one with all fields, one without dict fields
    csv_path = td + handle + ".csv"
    csv_path2 = td + handle + "-LONG.csv"
    all_tweets.to_csv(csv_path2)
    all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"], axis=1)
    all_tweets.to_csv(csv_path)
    print("Fetched tweets for:")
    print(handle)

# Merge CSV files
# (building one dataframe with all senators' tweets directly would also have
# been possible, but the per-senator files proved more useful)
path_to_tweetdfs = wd + td
os.chdir(path_to_tweetdfs)
tweetfiles = glob.glob("*.csv")

print(tweetfiles)

# Save the merged tweets as two files, a short and a long version
df_all_senators = pd.DataFrame()
df_all_senators_long = pd.DataFrame()
for file in tweetfiles:
    if file.startswith("ALL-SENATORS"):
        continue  # skip previously merged output files on re-runs
    if "LONG" in file:
        df = pd.read_csv(file)
        df_all_senators_long = pd.concat([df, df_all_senators_long])
    else:
        df = pd.read_csv(file)
        df_all_senators = pd.concat([df, df_all_senators])

# The working directory is data/tweets/ at this point, so the merged files
# are saved by name only (prefixing td again would point at a non-existent
# data/tweets/data/tweets/ subdirectory)
csv_path = "ALL-SENATORS.csv"
csv_path2 = "ALL-SENATORS-LONG.csv"
df_all_senators.to_csv(csv_path)
df_all_senators_long.to_csv(csv_path2)

os.chdir(wd)