From 0bc42fa86221509f5c5fa8d512b3ca637d9b1f78 Mon Sep 17 00:00:00 2001
From: Michael Beck
Date: Wed, 7 Jun 2023 19:37:01 +0200
Subject: [PATCH] adds try except block for tweepy paginator

---
 collect.py | 200 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 127 insertions(+), 73 deletions(-)

diff --git a/collect.py b/collect.py
index 513a7d5..bb5db8d 100644
--- a/collect.py
+++ b/collect.py
@@ -11,6 +11,7 @@ import tweepy
 import pandas as pd
 import numpy as np
 import glob
+import time
 
 ## Setup directories
 # WD Michael
@@ -35,8 +36,29 @@ bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuT
 client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)
 
 # Define time period of interest
-start_time = '2020-01-01T00:00:00Z'
-end_time = '2023-01-03T00:00:00Z'
+# Define time periods of interest
+time_slices = [
+    {
+        "start_time": "2020-01-01T00:00:00Z",
+        "end_time": "2020-06-01T00:00:00Z",
+        "suffix": "-slice1"
+    },
+    {
+        "start_time": "2020-06-01T00:00:01Z",
+        "end_time": "2021-01-01T00:00:00Z",
+        "suffix": "-slice2"
+    },
+    {
+        "start_time": "2021-01-01T00:00:01Z",
+        "end_time": "2021-06-01T00:00:00Z",
+        "suffix": "-slice3"
+    },
+    {
+        "start_time": "2021-06-01T00:00:01Z",
+        "end_time": "2023-01-03T00:00:00Z",
+        "suffix": "-slice4"
+    }
+]
 
 # gather keywords @chenTrackingSocialMedia2020
 # line80 ff: lamsalCoronavirusCOVID19Tweets2020
@@ -51,23 +73,23 @@ with open("data/keywords.txt", "r") as file:
         keywords.append(keyword)
 
 tweet_fields = [
-  "id",
-  "text",
-  "attachments",
-  "author_id",
-  "context_annotations",
-  "conversation_id",
-  "created_at",
-  "entities",
-  "geo",
-  "lang",
-  "possibly_sensitive",
-  "public_metrics",
-  "referenced_tweets",
-  "reply_settings",
-  "source",
-  "withheld",
-  ]
+    "id",
+    "text",
+    "attachments",
+    "author_id",
+    "context_annotations",
+    "conversation_id",
+    "created_at",
+    "entities",
+    "geo",
+    "lang",
+    "possibly_sensitive",
+    "public_metrics",
+    "referenced_tweets",
+    "reply_settings",
+    "source",
+    "withheld",
+    ]
 
 # Get accounts & alt-accounts from Senators-Datafile
 accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
@@ -75,55 +97,87 @@ alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
 print(accounts)
 print(alt_accounts)
 
+# Iterate over each Twitter account
 for handle in accounts:
-    query = "from:"+ handle +" -is:retweet"
-
-    tweetlist = []
-    for tweet in tweepy.Paginator(client.search_all_tweets,
-                                  query=query,
-                                  tweet_fields = tweet_fields,
-                                  start_time=start_time,
-                                  end_time=end_time,
-                                  max_results=100).flatten(50):
-        tweetlist.append(tweet)
-    all_tweets = pd.DataFrame(tweetlist)
-    all_tweets = all_tweets.join(pd.DataFrame(all_tweets.pop("public_metrics").tolist()))
-
-    all_tweets['handle'] = handle
-
-    ## Extract referenced_tweet info from column
-    # Create empty columns to store the extracted information
-    all_tweets['referenced_tweet_type'] = None
-    all_tweets['referenced_tweet_id'] = None
-
-    # Iterate over each row
-    for index, row in all_tweets.iterrows():
-        referenced_tweets = row['referenced_tweets']
-
-        # Check if referenced_tweets is not empty (array length > 0)
-        if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
-            referenced_tweet = referenced_tweets[0]
-            referenced_tweet_type = referenced_tweet['type']
-            referenced_tweet_id = referenced_tweet['id']
-
-            # Assign the extracted values to the new columns
-            all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
-            all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id
-
-    ## Check if tweet contains keyword
-    # Create a new column to store the keyword match
-    all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
-                                      .str.join(',')
-                                      .replace('', 'none'))
-
-    ## Save to versions of the dataset, one with all fields, one without dict fields
-    csv_path = td + handle + ".csv"
-    csv_path2 = td + handle + "-LONG.csv"
-    all_tweets.to_csv(csv_path2)
-    all_tweets = all_tweets.drop(["context_annotations","entities","referenced_tweets"], axis=1)
-    all_tweets.to_csv(csv_path)
-    print("Fetched tweets for:")
-    print(handle)
+    for slice_data in time_slices:
+        start_time = slice_data["start_time"]
+        end_time = slice_data["end_time"]
+        suffix = slice_data["suffix"]
+
+        query = "from:" + handle + " -is:retweet"
+
+        tweetlist = []
+        # Fetch tweets using Twitter API pagination
+        try:
+            for tweet in tweepy.Paginator(client.search_all_tweets,
+                                          query=query,
+                                          tweet_fields=tweet_fields,
+                                          start_time=start_time,
+                                          end_time=end_time,
+                                          max_results=100).flatten(50):
+                tweetlist.append(tweet)
+            msg = f"tweets for {handle}{suffix} fetched"
+            print(msg)
+        except tweepy.errors.TweepyException as ex:
+            timestamp = time.time()
+            msg = f"{timestamp} - raised exception {handle}{suffix}: " + str(ex) + " - sleeping..."
+            print(msg)
+            time.sleep(1)
+            try:
+                for tweet in tweepy.Paginator(client.search_all_tweets,
+                                              query=query,
+                                              tweet_fields=tweet_fields,
+                                              start_time=start_time,
+                                              end_time=end_time,
+                                              max_results=100).flatten(50):
+                    tweetlist.append(tweet)
+                msg = f"2nd try: tweets for {handle}{suffix} successfully fetched"
+                print(msg)
+            except tweepy.errors.TweepyException as ex:
+                timestamp = time.time()
+                msg = f"{timestamp} - raised exception AGAIN {handle}{suffix}: " + str(ex) + " - sleeping..."
+                print(msg)
+                time.sleep(1)
+
+        all_tweets = pd.DataFrame(tweetlist)
+
+        # Check if no tweets fetched for the current time slice
+        if len(tweetlist) == 0:
+            msg = f"return empty in {handle}{suffix} - from {start_time} to {end_time}"
+            print(msg)
+            continue
+
+        all_tweets['handle'] = handle
+
+        # Extract referenced_tweet info from column
+        all_tweets['referenced_tweet_type'] = None
+        all_tweets['referenced_tweet_id'] = None
+
+        if 'referenced_tweets' in all_tweets.columns:
+            for index, row in all_tweets.iterrows():
+                referenced_tweets = row['referenced_tweets']
+
+                if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
+                    referenced_tweet = referenced_tweets[0]
+                    referenced_tweet_type = referenced_tweet['type']
+                    referenced_tweet_id = referenced_tweet['id']
+
+                    all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
+                    all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id
+
+        # Check if tweet contains keyword
+        if 'text' in all_tweets.columns:
+            all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
+                                              .str.join(',')
+                                              .replace('', 'none'))
+
+        # Save two versions of the dataset, one with all fields and one without dict fields
+        csv_path = f"data/tweets/{handle}{suffix}.csv"
+        csv_path2 = f"data/tweets/{handle}{suffix}-LONG.csv"
+        all_tweets.to_csv(csv_path2)
+        all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"], axis=1)
+        all_tweets.to_csv(csv_path)
+        time.sleep(1) # sleep 1 second to not get over api limit
 
 # Merge CSV-Files
 # (it would also have been a possibility to build a dataframe with all senators' tweets but i found the other way around more useful)
@@ -137,12 +191,12 @@ print(tweetfiles)
 df_all_senators = pd.DataFrame()
 df_all_senators_long = pd.DataFrame()
 for file in tweetfiles:
-  if "LONG" in file:
-    df = pd.read_csv(file)
-    df_all_senators_long = pd.concat([df, df_all_senators_long])
-  else:
-    df = pd.read_csv(file)
-    df_all_senators = pd.concat([df, df_all_senators])
+    if "LONG" in file:
+        df = pd.read_csv(file)
+        df_all_senators_long = pd.concat([df, df_all_senators_long])
+    else:
+        df = pd.read_csv(file)
+        df_all_senators = pd.concat([df, df_all_senators])
 csv_path = td + "ALL-SENATORS.csv"
 csv_path2 = td + "ALL-SENATORS-LONG-LONG.csv"
 df_all_senators.to_csv(csv_path)