# Tweet Collecting
## Requirements
- tweepy-4.14.0
- pandas-2.0
- numpy-1.24.3

## Preparations & Config


In [29]:
import os
import tweepy
import pandas as pd
import numpy as np
import glob
import time

# Define time period of interest
time_slices = [
    {
        "start_time": "2020-01-01T00:00:00Z",
        "end_time": "2020-06-01T00:00:00Z",
        "suffix": "-slice1"
    },
    {
        "start_time": "2020-06-01T00:00:01Z",
        "end_time": "2021-01-01T00:00:00Z",
        "suffix": "-slice2"
    },
    {
        "start_time": "2021-01-01T00:00:01Z",
        "end_time": "2021-06-01T00:00:00Z",
        "suffix": "-slice3"
    },
    {
        "start_time": "2021-06-01T00:00:01Z",
        "end_time": "2023-01-03T00:00:00Z",
        "suffix": "-slice4"
    }
]

tweet_fields = [
	"id",
	"text",
	"attachments",
	"author_id",
	"context_annotations",
	"conversation_id",
	"created_at",
	"entities",
	"geo",
	"lang",
	"possibly_sensitive",
	"public_metrics",
	"referenced_tweets",
	"reply_settings",
	"source",
	"withheld",
	]

## Setup directories
# WD Michael
# wd = "/home/michael/Documents/PS/Data/collectTweets/"

# WD Server
wd = "/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection"

# WD Josie
# wd = "/home/michael/Documents/PS/Data/"

# WD Sam
# wd = "/home/michael/Documents/PS/Data/"

# Tweet-datafile directory
td = "data/tweets/"

# Authenticate to Twitter

In [2]:
## Setup Api-connection
bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc"
client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)

## Import Keywords
Keywords from:
* Chen, E., Lerman, K., & Ferrara, E. (2020). Tracking Social Media Discourse About the COVID-19 Pandemic: Development of a Public Coronavirus Twitter Data Set. JMIR Public Health and Surveillance, 6(2), e19273. https://doi.org/10.2196/19273
Line 80 and following:
* Lamsal, R. (2020). Coronavirus (COVID-19) Tweets Dataset [Data set]. IEEE. https://ieee-dataport.org/open-access/coronavirus-covid-19-tweets-dataset

In [3]:
keywords = []

# Read the keywords from a file
with open("data/keywords.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip()  # Remove the newline character
        keywords.append(keyword)

keywords

['Coronavirus',
 'Koronavirus',
 'Corona',
 'CDC',
 'Wuhancoronavirus',
 'Wuhanlockdown',
 'Ncov',
 'Wuhan',
 'N95',
 'Kungflu',
 'Epidemic',
 'outbreak',
 'Sinophobia',
 'China',
 'covid-19',
 'corona virus',
 'covid',
 'covid19',
 'sars-cov-2',
 'COVIDãƒ¼19',
 'COVD',
 'pandemic',
 'coronapocalypse',
 'canceleverything',
 'Coronials',
 'SocialDistancingNow',
 'Social Distancing',
 'SocialDistancing',
 'panicbuy',
 'panic buy',
 'panicbuying',
 'panic buying',
 '14DayQuarantine',
 'DuringMy14DayQuarantine',
 'panic shop',
 'panic shopping',
 'panicshop',
 'InMyQuarantineSurvivalKit',
 'panic-buy',
 'panic-shop',
 'coronakindness',
 'quarantinelife',
 'chinese virus',
 'chinesevirus',
 'stayhomechallenge',
 'stay home challenge',
 'sflockdown',
 'DontBeASpreader',
 'lockdown',
 'lock down',
 'shelteringinplace',
 'sheltering in place',
 'staysafestayhome',
 'stay safe stay home',
 'trumppandemic',
 'trump pandemic',
 'flattenthecurve',
 'flatten the curve',
 'china virus',
 'chinavirus

## Import Accounts

In [4]:
# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
print(accounts)
print(alt_accounts)

['SenAlexander', 'SenatorEnzi', 'CoryGardner', 'VP', 'SenatorIsakson', 'DougJones', 'KLoeffler', 'MarthaMcSallyAZ', 'DavidPerdueGA', 'SenPatRoberts', 'SenatorTomUdall', 'SenatorBaldwin', 'SenJohnBarrasso', 'SenatorBennet', 'MarshaBlackburn', 'SenBlumenthal', 'RoyBlunt', 'senbooker', 'JohnBoozman', 'SenatorBraun', 'SenSherrodBrown', 'SenatorBurr', 'SenatorCantwell', 'SenCapito', 'SenatorCardin', 'SenatorCarper', 'SenBobCasey', 'SenBillCassidy', 'SenatorCollins', 'ChrisCoons', 'JohnCornyn', 'SenCortezMasto', 'SenTomCotton', 'SenKevinCramer', 'MikeCrapo', 'SenTedCruz', 'SteveDaines', 'SenDuckworth', 'SenatorDurbin', 'SenJoniErnst', 'SenFettermanPA', 'SenFeinstein', 'SenatorFischer', 'SenGillibrand', 'LindseyGrahamSC', 'ChuckGrassley', 'SenatorHagerty', 'SenatorHassan', 'HawleyMO', 'MartinHeinrich', 'SenatorHick', 'maziehirono', 'SenJohnHoeven', 'SenHydeSmith', 'JimInhofe', 'SenRonJohnson', 'timkaine', 'SenMarkKelly', 'SenJohnKennedy', 'SenAngusKing', 'SenAmyKlobuchar', 'SenatorLankford', 

## Collect Tweets
Loops over accounts:
* Collects Tweets of account. 
* Then extracts columns public_metrics (likes aso) and referenced_tweets (indicates, whether tweet is a reply).
* Checks if tweet-text contains any of the keywords, if so, inserts the keyword(s) in a new column.
* Saves tweets of the account in a csv file "HANDLE.csv" and "HANDLE-LONG.csv" (LONG contains all given information such as annotations, that we might or might not need)

### Problem:
_I limited the results to 20 tweets per senator._
Twitter has the following API Limit for the [search_all_tweets](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all) method I used: 
* App rate limit (Application-only): 300 requests per 15-minute window shared among all users of your app
* App rate limit (Application-only): 1 per second shared among all users of your app

With a limit of 300, I request 20 posts per slice, just to get a better understanding of what's happening. After trying different things out, I think that the time-slices won't be needed if we get around the problem I'm having right now:
as soon, as the rate limit is reached, tweepy stops and waits for the time to run out and start again. BUT it doesn't retry the request but starts with the next request. 
I haven't found anything and my only idea to solve the problem was to generate a list of failed attempts (via try and except) and after getting all tweets letting tweepy work over that list again. 
One more thing I don't understand is that, when fetching the tweets I already sent to you, I didn't have as many problems as now and the limit exceeded after 3-4 senators, even though I used a higher `max_result` and a higher `flatten value`.

I hope that the following output speaks for itself:
```
trying to fetch tweets for SenAlexander-slice1
trying to fetch tweets for SenAlexander-slice2
trying to fetch tweets for SenAlexander-slice3
trying to fetch tweets for SenAlexander-slice4
trying to fetch tweets for SenatorEnzi-slice1
trying to fetch tweets for SenatorEnzi-slice2
trying to fetch tweets for SenatorEnzi-slice3
return empty in SenatorEnzi-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z
trying to fetch tweets for SenatorEnzi-slice4

Rate limit exceeded. Sleeping for 893 seconds.
```

Tweepy returned no tweets because of the exceeded tweet limit, then the script tried to fetch more tweets and the error message came up.
Before changing the code below, see the other version i wrote just below the next cell (and ignore the error message below the cell as i just interrupted the execution which lead to the error message).

In [28]:
# Iterate over each Twitter account
for handle in accounts:
    for slice_data in time_slices:
        # sleep 1 second to not get over 1sec api limit
        time.sleep(1) 
        # define slice data variables from time_slices
        start_time = slice_data['start_time']
        end_time = slice_data['end_time']
        suffix = slice_data['suffix']
        
        # define tweepy query with twitter handle of current sen
        query = f'from:{handle} -is:retweet'
        
        # create empty tweetlist that will be filled with tweets of current sen
        tweetlist = []
        
        # statusmsg
        msg = f'trying to fetch tweets for {handle}{suffix}'
        print(msg)
        
        # Fetch tweets using tweepy Twitter API v2 pagination
        tweets = tweepy.Paginator(client.search_all_tweets,
                                      query=query,
                                      tweet_fields=tweet_fields,
                                      start_time=start_time,
                                      end_time=end_time,
                                      max_results=20).flatten(20)
        
        # for each tweet returned...
        for tweet in tweets:
            # ... add that tweet to tweetlist
            tweetlist.append(tweet)
        
        # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration
        if len(tweetlist) == 0:
            msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'
            print(msg)
            print(tweets)
            continue
        
        # convert to dataframe
        tweet_df = pd.DataFrame(tweetlist)
        
        # add handle column as api only provides user-ids
        tweet_df['handle'] = handle
        
        ## Extract referenced_tweet info from column
        tweet_df['referenced_tweet_type'] = None
        tweet_df['referenced_tweet_id'] = None
        
        # if cond. because in some cases column doesn't exist
        if 'referenced_tweets' in tweet_df.columns:
            for index, row in tweet_df.iterrows():
                referenced_tweets = row['referenced_tweets']
                
                if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
                    referenced_tweet = referenced_tweets[0]
                    referenced_tweet_type = referenced_tweet['type']
                    referenced_tweet_id = referenced_tweet['id']
                    
                    tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type
                    tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id
        
        ## Check if tweet-text contains keyword
        # if cond. because in some cases column doesn't exist
        if 'text' in tweet_df.columns:
            tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))
                                              .str.join(',')
                                              .replace('', 'none'))
        
        ## Save two versions of the dataset, one with all fields and one without dict fields
        # define filepaths
        csv_path = f'data/tweets/{handle}{suffix}.csv'
        csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'
        # save LONG csv
        tweet_df.to_csv(csv_path2)
        # Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files
        # if cond. because in some cases column doesn't exist
        if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):
            tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)
        # save short csv
        tweet_df.to_csv(csv_path)

trying to fetch tweets for SenAlexander-slice1
trying to fetch tweets for SenAlexander-slice2
trying to fetch tweets for SenAlexander-slice3
trying to fetch tweets for SenAlexander-slice4
trying to fetch tweets for SenatorEnzi-slice1
trying to fetch tweets for SenatorEnzi-slice2
trying to fetch tweets for SenatorEnzi-slice3
return empty in SenatorEnzi-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z
<generator object Paginator.flatten at 0x7f20ebf137b0>
trying to fetch tweets for SenatorEnzi-slice4
trying to fetch tweets for CoryGardner-slice1
trying to fetch tweets for CoryGardner-slice2
trying to fetch tweets for CoryGardner-slice3
return empty in CoryGardner-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z
<generator object Paginator.flatten at 0x7f20ebf13740>
trying to fetch tweets for CoryGardner-slice4
trying to fetch tweets for VP-slice1
trying to fetch tweets for VP-slice2
trying to fetch tweets for VP-slice3
trying to fetch tweets for VP-slice4
trying to fe

KeyboardInterrupt: 

## Alternative way to fetch tweets via tweepy with retry mechanism

In [21]:
# Iterate over each Twitter account
for handle in accounts:
    for slice_data in time_slices:
        # define slice data variables from time_slices
        start_time = slice_data['start_time']
        end_time = slice_data['end_time']
        suffix = slice_data['suffix']
        
        # define tweepy query with twitter handle of current sen
        query = f'from:{handle} -is:retweet'
        
        # create empty tweetlist that will be filled with tweets of current sen
        tweetlist = []
        
        # statusmsg
        msg = f'trying to fetch tweets for {handle}{suffix}'
        print(msg)
        
        # Fetch tweets using tweepy Twitter API v2 pagination with retry mechanism
        max_attempts = 3  # maximum number of attempts to fetch tweets for a slice
        attempt = 1
        
        while attempt <= max_attempts:
            try:
                tweets = tweepy.Paginator(client.search_all_tweets,
                                          query=query,
                                          tweet_fields=tweet_fields,
                                          start_time=start_time,
                                          end_time=end_time,
                                          max_results=20).flatten(20)
                
                # for each tweet returned...
                for tweet in tweets:
                    # ... add that tweet to tweetlist
                    tweetlist.append(tweet)
                
                break  # exit the retry loop if tweets are successfully fetched
            
            except tweepy.TweepError as e:
                # handle rate limit exceeded error
                if e.response.status_code == 429:
                    # get the rate limit reset time from the response headers
                    reset_time = int(e.response.headers['x-rate-limit-reset'])
                    current_time = int(time.time())
                    
                    # calculate the sleep time until the rate limit resets
                    sleep_time = reset_time - current_time + 1  # add an extra second
                    
                    # sleep until the rate limit resets
                    time.sleep(sleep_time)
                    
                    attempt += 1  # increment the attempt counter
                    continue  # retry the API call
                
                else:
                    # handle other types of Tweepy errors
                    print(f'Error occurred: {e}')
                    break
        
        # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration
        if len(tweetlist) == 0:
            msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'
            print(msg)
            continue
        
        # convert to dataframe
        tweet_df = pd.DataFrame(tweetlist)
        
        # add handle column as api only provides user-ids
        tweet_df['handle'] = handle
        
        ## Extract referenced_tweet info from column
        tweet_df['referenced_tweet_type'] = None
        tweet_df['referenced_tweet_id'] = None
        
        # if cond. because in some cases column doesn't exist
        if 'referenced_tweets' in tweet_df.columns:
            for index, row in tweet_df.iterrows():
                referenced_tweets = row['referenced_tweets']
                
                if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
                    referenced_tweet = referenced_tweets[0]
                    referenced_tweet_type = referenced_tweet['type']
                    referenced_tweet_id = referenced_tweet['id']
                    
                    tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type
                    tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id
        
        ## Check if tweet-text contains keyword
        # if cond. because in some cases column doesn't exist
        if 'text' in tweet_df.columns:
            tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))
                                              .str.join(',')
                                              .replace('', 'none'))
        
        ## Save two versions of the dataset, one with all fields and one without dict fields
        # define filepaths
        csv_path = f'data/tweets/{handle}{suffix}.csv'
        csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'
        # save LONG csv
        tweet_df.to_csv(csv_path2)
        # Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files
        # if cond. because in some cases column doesn't exist
        if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):
            tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)
        # save short csv
        tweet_df.to_csv(csv_path)
        
        # sleep 1 second to not exceed the API rate limit
        time.sleep(1)


trying to fetch tweets for SenAlexander-slice1
trying to fetch tweets for SenAlexander-slice2
trying to fetch tweets for SenAlexander-slice3
trying to fetch tweets for SenAlexander-slice4
trying to fetch tweets for SenatorEnzi-slice1
trying to fetch tweets for SenatorEnzi-slice2
trying to fetch tweets for SenatorEnzi-slice3
return empty in SenatorEnzi-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z
trying to fetch tweets for SenatorEnzi-slice4


Rate limit exceeded. Sleeping for 437 seconds.


trying to fetch tweets for CoryGardner-slice1
trying to fetch tweets for CoryGardner-slice2
trying to fetch tweets for CoryGardner-slice3
return empty in CoryGardner-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z
trying to fetch tweets for CoryGardner-slice4


Rate limit exceeded. Sleeping for 897 seconds.


AttributeError: module 'tweepy' has no attribute 'TweepError'

In [None]:
path_to_tweetdfs = wd + td
os.chdir(path_to_tweetdfs)
tweetfiles = glob.glob('*.{}'.format("csv"))

print(tweetfiles)

# save merged csv as two files 
df_all_senators = pd.DataFrame()
df_all_senators_long = pd.DataFrame()
for file in tweetfiles:
	if "LONG" in file:
		df = pd.read_csv(file)
		df_all_senators_long = pd.concat([df, df_all_senators_long])
	else:
		df = pd.read_csv(file)
		df_all_senators = pd.concat([df, df_all_senators])
csv_path = td + "ALL-SENATORS.csv"
csv_path2 = td + "ALL-SENATORS-LONG-LONG.csv"
df_all_senators.to_csv(csv_path)    
df_all_senators_long.to_csv(csv_path2)


In [24]:
# Iterate over each Twitter account
for handle in accounts:
    for slice_data in time_slices:
        time.sleep(1.01)
        # define slice data variables from time_slices
        start_time = slice_data['start_time']
        end_time = slice_data['end_time']
        suffix = slice_data['suffix']
        
        # define tweepy query with twitter handle of current sen
        query = f'from:{handle} -is:retweet'
        
        # create empty tweetlist that will be filled with tweets of current sen
        tweetlist = []
        
        # statusmsg
        msg = f'trying to fetch tweets for {handle}{suffix}'
        print(msg)
        
        # Fetch tweets using tweepy Twitter API v2 pagination with retry mechanism
        max_attempts = 3  # maximum number of attempts to fetch tweets for a slice
        attempt = 1
        
        while attempt <= max_attempts:
            try:
                tweets = tweepy.Paginator(client.search_all_tweets,
                                          query=query,
                                          tweet_fields=tweet_fields,
                                          start_time=start_time,
                                          end_time=end_time,
                                          max_results=20).flatten(20)
                
                # for each tweet returned...
                for tweet in tweets:
                    # ... add that tweet to tweetlist
                    tweetlist.append(tweet)
                
                # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration
                if len(tweetlist) == 0:
                    msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'
                    print(msg)
                    break
                
                # convert to dataframe
                tweet_df = pd.DataFrame(tweetlist)
                
                # add handle column as API only provides user-ids
                tweet_df['handle'] = handle
                
                ## Extract referenced_tweet info from column
                tweet_df['referenced_tweet_type'] = None
                tweet_df['referenced_tweet_id'] = None
                
                # if cond. because in some cases column doesn't exist
                if 'referenced_tweets' in tweet_df.columns:
                    for index, row in tweet_df.iterrows():
                        referenced_tweets = row['referenced_tweets']
                        
                        if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
                            referenced_tweet = referenced_tweets[0]
                            referenced_tweet_type = referenced_tweet['type']
                            referenced_tweet_id = referenced_tweet['id']
                            
                            tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type
                            tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id
                
                ## Check if tweet-text contains keyword
                # if cond. because in some cases column doesn't exist
                if 'text' in tweet_df.columns:
                    tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))
                                                      .str.join(',')
                                                      .replace('', 'none'))
                
                ## Save two versions of the dataset, one with all fields and one without dict fields
                # define filepaths
                csv_path = f'data/tweets/{handle}{suffix}.csv'
                csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'
                # save LONG csv
                tweet_df.to_csv(csv_path2)
                # Remove 'context_annotations', 'entities', and 'referenced_tweets' columns for short csv files
                # if cond. because in some cases column doesn't exist
                if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):
                    tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)
                # save short csv
                tweet_df.to_csv(csv_path)
                
                # break out of the retry loop since fetching tweets was successful
                break
            
            except tweepy.TweepError as e:
                if e.response.status_code == 429:  # rate limit exceeded
                    reset_time = int(e.response.headers['x-rate-limit-reset'])
                    wait_time = reset_time - time.time() + 5  # add additional 5 seconds as buffer
                    
                    print(f"Rate limit exceeded. Sleeping for {wait_time} seconds.")
                    time.sleep(wait_time)
                    
                    attempt += 1  # increment the attempt counter
                else:
                    print(f"Error occurred: {e}")
                    break

trying to fetch tweets for SenAlexander-slice1
trying to fetch tweets for SenAlexander-slice2
trying to fetch tweets for SenAlexander-slice3
trying to fetch tweets for SenAlexander-slice4
trying to fetch tweets for SenatorEnzi-slice1
trying to fetch tweets for SenatorEnzi-slice2
trying to fetch tweets for SenatorEnzi-slice3
return empty in SenatorEnzi-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z
trying to fetch tweets for SenatorEnzi-slice4
trying to fetch tweets for CoryGardner-slice1
trying to fetch tweets for CoryGardner-slice2
trying to fetch tweets for CoryGardner-slice3
return empty in CoryGardner-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z
trying to fetch tweets for CoryGardner-slice4
trying to fetch tweets for VP-slice1
trying to fetch tweets for VP-slice2
trying to fetch tweets for VP-slice3
trying to fetch tweets for VP-slice4
trying to fetch tweets for SenatorIsakson-slice1
trying to fetch tweets for SenatorIsakson-slice2
trying to fetch tweets f

KeyboardInterrupt: 