# CollectUSSenatorTweets/collect.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Jun  6 11:40:07 2023

@author: michael
'''

import os
import tweepy
import pandas as pd
import numpy as np
import glob
import time

## Setup directories
# Working directory of the script. All relative 'data/...' paths used in this
# file are resolved against it. Exactly one `wd = ...` line must be active;
# uncomment the one that matches the machine the script is run on.
# WD Michael
wd = '/home/michael/Documents/PS/Data/collectTweets/'

# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection'

# WD Josie
# wd = '/home/michael/Documents/PS/Data/'

# WD Sam
# wd = '/home/michael/Documents/PS/Data/'

# Tweet-datafile directory (relative to wd); the tweet CSV files are stored here
td = 'data/tweets/'

# Switch into the working directory so the relative paths resolve
os.chdir(wd)

## Setup Api-connection
# The bearer token is a credential. It is read from the TWITTER_BEARER_TOKEN
# environment variable when that variable is set, so the secret does not
# have to live in the source file.
# NOTE(review): the literal below is kept only as a fallback so that existing
# setups keep working unchanged. A token committed to source should be
# treated as leaked - rotate it and then remove the fallback.
bearer_token = os.environ.get(
    'TWITTER_BEARER_TOKEN',
    'AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc',
)

# return_type=dict: responses are handed back as plain dicts.
# wait_on_rate_limit=True: tweepy waits when a rate limit is hit
# (see the tweepy Client documentation).
client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)

# Define time periods of interest
# Every slice is fetched separately and saved under its own file suffix.
# The search endpoint treats start_time as inclusive and end_time as
# exclusive, so each slice starts exactly at the end_time of the previous
# one: no second is left out between two slices and no tweet is fetched twice.
time_slices = [
    {
        'start_time': '2020-01-01T00:00:00Z',
        'end_time': '2020-06-01T00:00:00Z',
        'suffix': '-slice1'
    },
    {
        'start_time': '2020-06-01T00:00:00Z',
        'end_time': '2021-01-01T00:00:00Z',
        'suffix': '-slice2'
    },
    {
        'start_time': '2021-01-01T00:00:00Z',
        'end_time': '2021-06-01T00:00:00Z',
        'suffix': '-slice3'
    },
    {
        'start_time': '2021-06-01T00:00:00Z',
        'end_time': '2023-01-03T00:00:00Z',
        'suffix': '-slice4'
    }
]

# gather keywords @chenTrackingSocialMedia2020
# line80 ff:  lamsalCoronavirusCOVID19Tweets2020
# Read the keywords from a file, one keyword per line, with surrounding
# whitespace (including the newline) removed.
# Blank lines are skipped: the keywords are later joined with '|' into a
# regular expression, and an empty alternative would match the empty string
# at every position of every tweet text.
with open('data/keywords.txt', 'r') as file:
    stripped_lines = (line.strip() for line in file)
    keywords = [keyword for keyword in stripped_lines if keyword]

# Tweet attributes requested from the API for every tweet
# (handed to the search call as `tweet_fields`).
tweet_fields = [
    # identity and content
    'id', 'text', 'attachments', 'author_id',
    # conversation and annotation data
    'context_annotations', 'conversation_id', 'created_at', 'entities',
    # location, language and moderation flags
    'geo', 'lang', 'possibly_sensitive',
    # metrics and relations to other tweets
    'public_metrics', 'referenced_tweets', 'reply_settings',
    # client and availability information
    'source', 'withheld',
]

# Get accounts & alt-accounts from Senators-Datafile
# The file is parsed once and both handle columns are taken from the same
# frame instead of reading the CSV a second time.
senators = pd.read_csv('data/senators-raw.csv')
accounts = senators['twitter_handle'].tolist()
alt_accounts = senators['alt_handle'].tolist()

# Print both lists so the collected handles can be checked in the log
print(accounts)
print(alt_accounts)

# Regular expression matching any of the keywords. It is the same for every
# account and slice, so it is built once before the loop.
# NOTE(review): keywords are used as raw regex fragments (not escaped) -
# confirm this is intended for entries containing characters such as '.' or '+'.
keyword_pattern = '|'.join(keywords)

# Columns holding nested dict/list data. They stay in the -LONG files and
# are removed from the short files.
nested_columns = ['context_annotations', 'entities', 'referenced_tweets']

# Iterate over each Twitter account and, per account, over each time slice.
# Every (account, slice) pair with at least one tweet produces two CSV files
# in the tweet directory: '<handle><suffix>-LONG.csv' (all columns) and
# '<handle><suffix>.csv' (nested columns removed).
for handle in accounts:
    for slice_data in time_slices:
        # define slice data variables from time_slices
        start_time = slice_data['start_time']
        end_time = slice_data['end_time']
        suffix = slice_data['suffix']

        # define tweepy query with twitter handle of current sen
        query = f'from:{handle} -is:retweet'

        # statusmsg
        print(f'trying to fetch tweets for {handle}{suffix}')

        # Fetch tweets using tweepy Twitter API v2 pagination
        # NOTE(review): max_results=20 together with flatten(20) caps every
        # slice at 20 tweets - confirm this limit is intended for a full run.
        tweets = tweepy.Paginator(client.search_all_tweets,
                                  query=query,
                                  tweet_fields=tweet_fields,
                                  start_time=start_time,
                                  end_time=end_time,
                                  max_results=20).flatten(20)

        # collect the tweets of the current sen and slice
        tweetlist = list(tweets)

        # sleep 1 second to not get over 1sec api limit.
        # The pause comes directly after fetching so that it also happens
        # for slices that return no tweets (they are skipped below).
        time.sleep(1)

        # If there are no tweets, skip to next time_slices loop iteration
        if not tweetlist:
            print(f'return empty in {handle}{suffix} - from {start_time} to {end_time}')
            continue

        # convert to dataframe
        tweet_df = pd.DataFrame(tweetlist)

        # add handle column as api only provides user-ids
        tweet_df['handle'] = handle

        ## Extract referenced_tweet info from column
        # Only the first referenced tweet of each tweet is used; rows without
        # a reference keep None in both new columns.
        tweet_df['referenced_tweet_type'] = None
        tweet_df['referenced_tweet_id'] = None

        # if cond. because in some cases column doesn't exist
        if 'referenced_tweets' in tweet_df.columns:
            for index, referenced_tweets in tweet_df['referenced_tweets'].items():
                # missing values are not lists, so they are skipped here
                if isinstance(referenced_tweets, list) and referenced_tweets:
                    first_reference = referenced_tweets[0]
                    tweet_df.at[index, 'referenced_tweet_type'] = first_reference['type']
                    tweet_df.at[index, 'referenced_tweet_id'] = first_reference['id']

        ## Check if tweet-text contains keyword
        # All matches are stored comma-separated; 'none' marks tweets without a match.
        # if cond. because in some cases column doesn't exist
        if 'text' in tweet_df.columns:
            tweet_df['contains_keyword'] = (tweet_df['text'].str.findall(keyword_pattern)
                                              .str.join(',')
                                              .replace('', 'none'))

        ## Save two versions of the dataset, one with all fields and one without dict fields
        # define filepaths
        csv_path = f'{td}{handle}{suffix}.csv'
        csv_path2 = f'{td}{handle}{suffix}-LONG.csv'
        # save LONG csv
        tweet_df.to_csv(csv_path2)
        # Remove the nested columns for the short csv file. errors='ignore'
        # drops whichever of them exist, so the short file never keeps a
        # nested column just because another one is missing in this slice.
        tweet_df = tweet_df.drop(columns=nested_columns, errors='ignore')
        # save short csv
        tweet_df.to_csv(csv_path)

# Merge CSV-Files
# (it would also have been a possibility to build a dataframe with all senators' tweets but i found the other way around more useful)
# Absolute path of the tweet directory. os.path.join also works when wd has
# no trailing slash.
path_to_tweetdfs = os.path.join(wd, td)

# Names of the two merged output files. They are written into the tweet
# directory itself, next to the per-senator files.
merged_name = 'ALL-SENATORS.csv'
merged_long_name = 'ALL-SENATORS-LONG-LONG.csv'

# Collect the per-senator files (file names only). Merged files of an earlier
# run are left out so that re-running the script does not merge old output
# into the new one. The names are sorted to get a reproducible row order.
tweetfiles = sorted(
    os.path.basename(path)
    for path in glob.glob(os.path.join(glob.escape(path_to_tweetdfs), '*.csv'))
    if os.path.basename(path) not in (merged_name, merged_long_name)
)

print(tweetfiles)

# Read every file once and sort it into the short or the long group.
# Long files are recognised by the '-LONG.csv' ending they are saved with,
# so a handle that happens to contain 'LONG' is not misclassified.
short_frames = []
long_frames = []
for file in tweetfiles:
    df = pd.read_csv(os.path.join(path_to_tweetdfs, file))
    if file.endswith('-LONG.csv'):
        long_frames.append(df)
    else:
        short_frames.append(df)

# save merged csv as two files
# Each group is concatenated in a single call. pd.concat does not accept an
# empty list, so an empty frame is used when a group has no files.
df_all_senators = pd.concat(short_frames) if short_frames else pd.DataFrame()
df_all_senators_long = pd.concat(long_frames) if long_frames else pd.DataFrame()

# Absolute output paths, so the result does not depend on the current directory
csv_path = os.path.join(path_to_tweetdfs, merged_name)
csv_path2 = os.path.join(path_to_tweetdfs, merged_long_name)
df_all_senators.to_csv(csv_path)
df_all_senators_long.to_csv(csv_path2)

# Finish in the working directory, as before
os.chdir(wd)