#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 6 11:40:07 2023
@author: michael
"""
import os
import tweepy
import pandas as pd
import numpy as np
import glob
import time
from datetime import datetime  # used for timestamps in the exception handlers below
## Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = "/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection"
# WD Josie
# wd = "/home/michael/Documents/PS/Data/"
# WD Sam
# wd = "/home/michael/Documents/PS/Data/"
# Tweet-datafile directory
td = "data/tweets/"
os.chdir(wd)
## Setup Api-connection
# Read the bearer token from the environment instead of hardcoding it,
# so the credential does not end up in version control
bearer_token = os.getenv("TWITTER_BEARER_TOKEN")
client = tweepy.Client(bearer_token, return_type=dict, wait_on_rate_limit=True)
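# client.search_all_tweets queries the full-archive search endpoint, which
# requires elevated (Academic Research) API access. A quick smoke test before
# the long collection run (hypothetical query, uncomment to verify access):
# client.search_all_tweets(query="from:TwitterDev -is:retweet", max_results=10)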
# Define time periods of interest
time_slices = [
    {
        "start_time": "2020-01-01T00:00:00Z",
        "end_time": "2020-06-01T00:00:00Z",
        "suffix": "-slice1"
    },
    {
        "start_time": "2020-06-01T00:00:01Z",
        "end_time": "2021-01-01T00:00:00Z",
        "suffix": "-slice2"
    },
    {
        "start_time": "2021-01-01T00:00:01Z",
        "end_time": "2021-06-01T00:00:00Z",
        "suffix": "-slice3"
    },
    {
        "start_time": "2021-06-01T00:00:01Z",
        "end_time": "2023-01-03T00:00:00Z",
        "suffix": "-slice4"
    }
]
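# Sanity check (sketch): the ISO-8601 strings above compare lexicographically,
# so plain string comparison confirms the slices are ordered and non-overlapping.
for earlier, later in zip(time_slices, time_slices[1:]):
    assert earlier["end_time"] <= later["start_time"], "time slices overlap"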
# Keywords gathered following @chenTrackingSocialMedia2020;
# lines 80 ff. of the keyword file follow @lamsalCoronavirusCOVID19Tweets2020
# Initialize the keywords list
keywords = []
# Read the keywords from a file
with open("data/keywords.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip()  # remove the trailing newline
        keywords.append(keyword)
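# The keyword flag further below joins all keywords into one regex with "|".
# If keywords.txt ever contains regex metacharacters (e.g. "covid-19?"),
# escaping them first avoids silent mis-matches -- a sketch, assuming the
# keywords are meant as literal text:
# import re
# keyword_pattern = '|'.join(re.escape(k) for k in keywords)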
# Tweet fields to request from the API (Twitter API v2 tweet.fields)
tweet_fields = [
    "id",
    "text",
    "attachments",
    "author_id",
    "context_annotations",
    "conversation_id",
    "created_at",
    "entities",
    "geo",
    "lang",
    "possibly_sensitive",
    "public_metrics",
    "referenced_tweets",
    "reply_settings",
    "source",
    "withheld",
]
# Get accounts & alt-accounts from Senators-Datafile (read it once)
senators = pd.read_csv("data/senators-raw.csv")
accounts = senators["twitter_handle"].tolist()
alt_accounts = senators["alt_handle"].tolist()
print(accounts)
print(alt_accounts)
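# Note: alt_accounts is collected but not queried in the loop below. If the
# alt handles should be fetched too, missing cells read in as NaN and would
# need filtering first -- a sketch:
# alt_accounts = [a for a in alt_accounts if isinstance(a, str) and a]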
# Iterate over each Twitter account and time slice
for handle in accounts:
    for slice_data in time_slices:
        start_time = slice_data["start_time"]
        end_time = slice_data["end_time"]
        suffix = slice_data["suffix"]
        query = "from:" + handle + " -is:retweet"
        tweetlist = []
        # Fetch tweets using Twitter API pagination
        # (note: .flatten(50) caps the total at 50 tweets per slice; raise the
        # cap for the full collection run)
        try:
            for tweet in tweepy.Paginator(client.search_all_tweets,
                                          query=query,
                                          tweet_fields=tweet_fields,
                                          start_time=start_time,
                                          end_time=end_time,
                                          max_results=100).flatten(50):
                tweetlist.append(tweet)
            msg = f"tweets for {handle}{suffix} fetched"
            print(msg)
        except tweepy.errors.TweepyException as ex:
            timestamp = datetime.now().timestamp()
            msg = f"{timestamp} - raised exception {handle}{suffix}: {ex} - sleeping..."
            print(msg)
            time.sleep(1)
            # Retry once after a short pause
            try:
                for tweet in tweepy.Paginator(client.search_all_tweets,
                                              query=query,
                                              tweet_fields=tweet_fields,
                                              start_time=start_time,
                                              end_time=end_time,
                                              max_results=100).flatten(50):
                    tweetlist.append(tweet)
                msg = f"2nd try: tweets for {handle}{suffix} successfully fetched"
                print(msg)
            except tweepy.errors.TweepyException as ex:
                timestamp = datetime.now().timestamp()
                msg = f"{timestamp} - raised exception AGAIN {handle}{suffix}: {ex} - sleeping..."
                print(msg)
                time.sleep(1)
        all_tweets = pd.DataFrame(tweetlist)
        # Skip this slice if no tweets were fetched
        if len(tweetlist) == 0:
            msg = f"return empty in {handle}{suffix} - from {start_time} to {end_time}"
            print(msg)
            continue
        all_tweets['handle'] = handle
        # Extract type and id of the first referenced tweet (if any)
        all_tweets['referenced_tweet_type'] = None
        all_tweets['referenced_tweet_id'] = None
        if 'referenced_tweets' in all_tweets.columns:
            for index, row in all_tweets.iterrows():
                referenced_tweets = row['referenced_tweets']
                if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
                    referenced_tweet = referenced_tweets[0]
                    all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet['type']
                    all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet['id']
        # Flag tweets that contain at least one keyword
        if 'text' in all_tweets.columns:
            all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
                                              .str.join(',')
                                              .replace('', 'none'))
        # Save two versions of the dataset: one with all fields ("-LONG") and
        # one without the dict-valued fields
        csv_path = f"data/tweets/{handle}{suffix}.csv"
        csv_path2 = f"data/tweets/{handle}{suffix}-LONG.csv"
        all_tweets.to_csv(csv_path2)
        all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"],
                                     axis=1, errors="ignore")
        all_tweets.to_csv(csv_path)
        time.sleep(1)  # sleep 1 second to stay under the API rate limit
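# Optional integrity check (sketch): list which handle/slice files are missing
# before merging (slices that returned no tweets were skipped via `continue`):
# expected = {f"{h}{s['suffix']}.csv" for h in accounts for s in time_slices}
# print(expected - set(os.listdir(td)))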
# Merge CSV files
# (building one dataframe of all senators' tweets directly would also have been
# possible, but separate per-handle files proved more useful)
path_to_tweetdfs = wd + td
os.chdir(path_to_tweetdfs)
tweetfiles = glob.glob("*.csv")
print(tweetfiles)
# Merge the per-handle files into two dataframes, short and long
df_all_senators = pd.DataFrame()
df_all_senators_long = pd.DataFrame()
for file in tweetfiles:
    df = pd.read_csv(file)
    if "LONG" in file:
        df_all_senators_long = pd.concat([df, df_all_senators_long])
    else:
        df_all_senators = pd.concat([df, df_all_senators])
# Save the merged data as two files (cwd is already data/tweets/, so plain
# filenames; note that a re-run of the merge would pick these outputs up via
# the glob above, so delete them first)
csv_path = "ALL-SENATORS.csv"
csv_path2 = "ALL-SENATORS-LONG.csv"
df_all_senators.to_csv(csv_path)
df_all_senators_long.to_csv(csv_path2)
os.chdir(wd)
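# Quick post-run check (sketch, assuming the merge above succeeded):
# df = pd.read_csv("data/tweets/ALL-SENATORS.csv", index_col=0)
# print(df.groupby("handle").size())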