diff --git a/collect.py b/collect.py
index 4c7af04..662da3e 100644
--- a/collect.py
+++ b/collect.py
@@ -115,18 +115,46 @@ for handle in accounts:
         msg = f'trying to fetch tweets for {handle}{suffix}'
         print(msg)
 
-        # Fetch tweets using tweepy Twitter API v2 pagination
-        tweets = tweepy.Paginator(client.search_all_tweets,
-                                  query=query,
-                                  tweet_fields=tweet_fields,
-                                  start_time=start_time,
-                                  end_time=end_time,
-                                  max_results=20).flatten(20)
+        # Fetch tweets using tweepy Twitter API v2 pagination with a retry mechanism
+        max_attempts = 3  # maximum number of attempts to fetch tweets for a slice
+        attempt = 1
 
-        # for each tweet returned...
-        for tweet in tweets:
-            # ... add that tweet to tweetlist
-            tweetlist.append(tweet)
+        while attempt <= max_attempts:
+            try:
+                tweets = tweepy.Paginator(client.search_all_tweets,
+                                          query=query,
+                                          tweet_fields=tweet_fields,
+                                          start_time=start_time,
+                                          end_time=end_time,
+                                          max_results=20).flatten(20)
+
+                # for each tweet returned...
+                for tweet in tweets:
+                    # ... add that tweet to tweetlist
+                    tweetlist.append(tweet)
+
+                break  # exit the retry loop if tweets were fetched successfully
+
+            except tweepy.HTTPException as e:
+                # handle rate limit exceeded error
+                if e.response.status_code == 429:
+                    # get the rate limit reset time from the response headers
+                    reset_time = int(e.response.headers['x-rate-limit-reset'])
+                    current_time = int(time.time())
+
+                    # calculate the sleep time until the rate limit resets
+                    sleep_time = reset_time - current_time + 1  # add an extra second
+
+                    # sleep until the rate limit resets
+                    time.sleep(sleep_time)
+
+                    attempt += 1  # increment the attempt counter
+                    continue  # retry the API call
+
+                else:
+                    # handle other HTTP errors from the API
+                    print(f'Error occurred: {e}')
+                    break
 
         # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration
         if len(tweetlist) == 0:
@@ -176,8 +204,10 @@ for handle in accounts:
     tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)
     # save short csv
     tweet_df.to_csv(csv_path)
-    # sleep 1 second to not get over 1sec api limit
-    time.sleep(1)
+
+    # sleep 1 second to not exceed the API rate limit
+    time.sleep(1)
+
 
 # Merge CSV-Files
 # (it would also have been a possibility to build a dataframe with all senators' tweets but i found the other way around more useful)
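
Note: as an alternative to the manual retry loop added above, tweepy's v2 client can handle 429 responses itself via its wait_on_rate_limit option. A minimal sketch under that assumption, reusing the query/tweet_fields/start_time/end_time variables from collect.py (bearer_token is an illustrative name, not taken from this patch):

    import tweepy

    # With wait_on_rate_limit=True, the client sleeps until the rate limit
    # resets instead of raising tweepy.TooManyRequests, so no retry loop
    # around the Paginator is needed.
    client = tweepy.Client(bearer_token=bearer_token, wait_on_rate_limit=True)

    tweets = tweepy.Paginator(client.search_all_tweets,
                              query=query,
                              tweet_fields=tweet_fields,
                              start_time=start_time,
                              end_time=end_time,
                              max_results=20).flatten(20)

The explicit loop in the patch keeps control over the number of attempts (max_attempts), whereas wait_on_rate_limit blocks for as long as the API requires.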