From 0bc42fa86221509f5c5fa8d512b3ca637d9b1f78 Mon Sep 17 00:00:00 2001
From: Michael Beck
Date: Wed, 7 Jun 2023 19:37:01 +0200
Subject: [PATCH] adds try except block for tweepy paginator

---
 collect.py | 200 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 127 insertions(+), 73 deletions(-)

diff --git a/collect.py b/collect.py
index 513a7d5..bb5db8d 100644
--- a/collect.py
+++ b/collect.py
@@ -11,6 +11,7 @@ import tweepy
 import pandas as pd
 import numpy as np
 import glob
+import time
 
 ## Setup directories
 # WD Michael
@@ -35,8 +36,29 @@ bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuT
 client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)
 
 # Define time period of interest
-start_time = '2020-01-01T00:00:00Z'
-end_time = '2023-01-03T00:00:00Z'
+# Define time periods of interest
+time_slices = [
+    {
+        "start_time": "2020-01-01T00:00:00Z",
+        "end_time": "2020-06-01T00:00:00Z",
+        "suffix": "-slice1"
+    },
+    {
+        "start_time": "2020-06-01T00:00:01Z",
+        "end_time": "2021-01-01T00:00:00Z",
+        "suffix": "-slice2"
+    },
+    {
+        "start_time": "2021-01-01T00:00:01Z",
+        "end_time": "2021-06-01T00:00:00Z",
+        "suffix": "-slice3"
+    },
+    {
+        "start_time": "2021-06-01T00:00:01Z",
+        "end_time": "2023-01-03T00:00:00Z",
+        "suffix": "-slice4"
+    }
+]
 
 # gather keywords @chenTrackingSocialMedia2020
 # line80 ff: lamsalCoronavirusCOVID19Tweets2020
@@ -51,23 +73,23 @@ with open("data/keywords.txt", "r") as file:
         keywords.append(keyword)
 
 tweet_fields = [
-  "id",
-  "text",
-  "attachments",
-  "author_id",
-  "context_annotations",
-  "conversation_id",
-  "created_at",
-  "entities",
-  "geo",
-  "lang",
-  "possibly_sensitive",
-  "public_metrics",
-  "referenced_tweets",
-  "reply_settings",
-  "source",
-  "withheld",
-  ]
+    "id",
+    "text",
+    "attachments",
+    "author_id",
+    "context_annotations",
+    "conversation_id",
+    "created_at",
+    "entities",
+    "geo",
+    "lang",
+    "possibly_sensitive",
+    "public_metrics",
+    "referenced_tweets",
+    "reply_settings",
+    "source",
+    "withheld",
+    ]
 
 # Get accounts & alt-accounts from Senators-Datafile
 accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
@@ -75,55 +97,87 @@ alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
 print(accounts)
 print(alt_accounts)
 
+# Iterate over each Twitter account
 for handle in accounts:
-    query = "from:"+ handle +" -is:retweet"
-
-    tweetlist = []
-    for tweet in tweepy.Paginator(client.search_all_tweets,
-                                  query=query,
-                                  tweet_fields = tweet_fields,
-                                  start_time=start_time,
-                                  end_time=end_time,
-                                  max_results=100).flatten(50):
-        tweetlist.append(tweet)
-    all_tweets = pd.DataFrame(tweetlist)
-    all_tweets = all_tweets.join(pd.DataFrame(all_tweets.pop("public_metrics").tolist()))
-
-    all_tweets['handle'] = handle
-
-    ## Extract referenced_tweet info from column
-    # Create empty columns to store the extracted information
-    all_tweets['referenced_tweet_type'] = None
-    all_tweets['referenced_tweet_id'] = None
-
-    # Iterate over each row
-    for index, row in all_tweets.iterrows():
-        referenced_tweets = row['referenced_tweets']
-
-        # Check if referenced_tweets is not empty (array length > 0)
-        if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
-            referenced_tweet = referenced_tweets[0]
-            referenced_tweet_type = referenced_tweet['type']
-            referenced_tweet_id = referenced_tweet['id']
-
-            # Assign the extracted values to the new columns
-            all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
-            all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id
-
-    ## Check if tweet contains keyword
-    # Create a new column to store the keyword match
-    all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
-                                      .str.join(',')
-                                      .replace('', 'none'))
-
-    ## Save to versions of the dataset, one with all fields, one without dict fields
-    csv_path = td + handle + ".csv"
-    csv_path2 = td + handle + "-LONG.csv"
-    all_tweets.to_csv(csv_path2)
-    all_tweets = all_tweets.drop(["context_annotations","entities","referenced_tweets"], axis=1)
-    all_tweets.to_csv(csv_path)
-    print("Fetched tweets for:")
-    print(handle)
+    for slice_data in time_slices:
+        start_time = slice_data["start_time"]
+        end_time = slice_data["end_time"]
+        suffix = slice_data["suffix"]
+
+        query = "from:" + handle + " -is:retweet"
+
+        tweetlist = []
+        # Fetch tweets using Twitter API pagination
+        try:
+            for tweet in tweepy.Paginator(client.search_all_tweets,
+                                          query=query,
+                                          tweet_fields=tweet_fields,
+                                          start_time=start_time,
+                                          end_time=end_time,
+                                          max_results=100).flatten(50):
+                tweetlist.append(tweet)
+            msg = f"tweets for {handle}{suffix} fetched"
+            print(msg)
+        except tweepy.errors.TweepyException as ex:
+            timestamp = time.time()
+            msg = f"{timestamp} - raised exception {handle}{suffix}: " + str(ex) + " - sleeping..."
+            print(msg)
+            time.sleep(1)
+            try:
+                for tweet in tweepy.Paginator(client.search_all_tweets,
+                                              query=query,
+                                              tweet_fields=tweet_fields,
+                                              start_time=start_time,
+                                              end_time=end_time,
+                                              max_results=100).flatten(50):
+                    tweetlist.append(tweet)
+                msg = f"2nd try: tweets for {handle}{suffix} successfully fetched"
+                print(msg)
+            except tweepy.errors.TweepyException as ex:
+                timestamp = time.time()
+                msg = f"{timestamp} - raised exception AGAIN {handle}{suffix}: " + str(ex) + " - sleeping..."
+                print(msg)
+                time.sleep(1)
+
+        all_tweets = pd.DataFrame(tweetlist)
+
+        # Check if no tweets fetched for the current time slice
+        if len(tweetlist) == 0:
+            msg = f"return empty in {handle}{suffix} - from {start_time} to {end_time}"
+            print(msg)
+            continue
+
+        all_tweets['handle'] = handle
+
+        # Extract referenced_tweet info from column
+        all_tweets['referenced_tweet_type'] = None
+        all_tweets['referenced_tweet_id'] = None
+
+        if 'referenced_tweets' in all_tweets.columns:
+            for index, row in all_tweets.iterrows():
+                referenced_tweets = row['referenced_tweets']
+
+                if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
+                    referenced_tweet = referenced_tweets[0]
+                    referenced_tweet_type = referenced_tweet['type']
+                    referenced_tweet_id = referenced_tweet['id']
+
+                    all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
+                    all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id
+
+        # Check if tweet contains keyword
+        if 'text' in all_tweets.columns:
+            all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
+                                              .str.join(',')
+                                              .replace('', 'none'))
+
+        # Save two versions of the dataset, one with all fields and one without dict fields
+        csv_path = f"data/tweets/{handle}{suffix}.csv"
+        csv_path2 = f"data/tweets/{handle}{suffix}-LONG.csv"
+        all_tweets.to_csv(csv_path2)
+        all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"], axis=1)
+        all_tweets.to_csv(csv_path)
+        time.sleep(1) # sleep 1 second to not get over api limit
 
 # Merge CSV-Files
 # (it would also have been a possibility to build a dataframe with all senators' tweets but i found the other way around more useful)
@@ -137,12 +191,12 @@ print(tweetfiles)
 df_all_senators = pd.DataFrame()
 df_all_senators_long = pd.DataFrame()
 for file in tweetfiles:
-  if "LONG" in file:
-    df = pd.read_csv(file)
-    df_all_senators_long = pd.concat([df, df_all_senators_long])
-  else:
-    df = pd.read_csv(file)
-    df_all_senators = pd.concat([df, df_all_senators])
+    if "LONG" in file:
+        df = pd.read_csv(file)
+        df_all_senators_long = pd.concat([df, df_all_senators_long])
+    else:
+        df = pd.read_csv(file)
+        df_all_senators = pd.concat([df, df_all_senators])
 csv_path = td + "ALL-SENATORS.csv"
 csv_path2 = td + "ALL-SENATORS-LONG-LONG.csv"
 df_all_senators.to_csv(csv_path)