adds try except block for tweepy paginator

Michael Beck 2023-06-07 19:37:01 +02:00
parent 632f504cc4
commit 0bc42fa862

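For context, a minimal sketch of the pattern this commit introduces: each tweepy.Paginator call over search_all_tweets is wrapped in try/except and retried once after a short sleep. This is only an illustration, not the repository's code: it assumes tweepy v4, where the exception class is tweepy.errors.TweepyException (the committed code catches tweepy.error.TweepError, the tweepy v3 name, and calls datetime.now(), which only works if datetime is imported elsewhere in the script); fetch_tweets, the handle, and the placeholder bearer token are made up here.

import time
from datetime import datetime

import tweepy

# Placeholder credentials; the real script defines its own bearer token.
client = tweepy.Client("YOUR_BEARER_TOKEN", wait_on_rate_limit=True)

def fetch_tweets(query, start_time, end_time, label, retries=1):
    """Collect up to 50 tweets for one query/time slice, retrying once on a tweepy error."""
    tweets = []
    for _ in range(retries + 1):
        try:
            tweets = list(tweepy.Paginator(client.search_all_tweets,
                                           query=query,
                                           start_time=start_time,
                                           end_time=end_time,
                                           max_results=100).flatten(limit=50))
            print(f"fetched {len(tweets)} tweets for {label}")
            break
        except tweepy.errors.TweepyException as ex:
            # Log the failure with a timestamp, wait briefly, then try once more.
            print(f"{datetime.now().timestamp()} - exception for {label}: {ex} - sleeping...")
            time.sleep(1)
    return tweets

fetch_tweets("from:somehandle -is:retweet", "2020-01-01T00:00:00Z", "2020-06-01T00:00:00Z", "somehandle-slice1")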

@@ -11,6 +11,7 @@ import tweepy
import pandas as pd
import numpy as np
import glob
import time

## Setup directories
# WD Michael
@@ -35,8 +36,29 @@ bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuT
client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)

# Define time periods of interest
time_slices = [
    {
        "start_time": "2020-01-01T00:00:00Z",
        "end_time": "2020-06-01T00:00:00Z",
        "suffix": "-slice1"
    },
    {
        "start_time": "2020-06-01T00:00:01Z",
        "end_time": "2021-01-01T00:00:00Z",
        "suffix": "-slice2"
    },
    {
        "start_time": "2021-01-01T00:00:01Z",
        "end_time": "2021-06-01T00:00:00Z",
        "suffix": "-slice3"
    },
    {
        "start_time": "2021-06-01T00:00:01Z",
        "end_time": "2023-01-03T00:00:00Z",
        "suffix": "-slice4"
    }
]

# gather keywords @chenTrackingSocialMedia2020
# line80 ff: lamsalCoronavirusCOVID19Tweets2020
@@ -51,23 +73,23 @@ with open("data/keywords.txt", "r") as file:
        keywords.append(keyword)

tweet_fields = [
    "id",
    "text",
    "attachments",
    "author_id",
    "context_annotations",
    "conversation_id",
    "created_at",
    "entities",
    "geo",
    "lang",
    "possibly_sensitive",
    "public_metrics",
    "referenced_tweets",
    "reply_settings",
    "source",
    "withheld",
]

# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
@@ -75,55 +97,87 @@ alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
print(accounts)
print(alt_accounts)

# Iterate over each Twitter account
for handle in accounts:
    for slice_data in time_slices:
        start_time = slice_data["start_time"]
        end_time = slice_data["end_time"]
        suffix = slice_data["suffix"]

        query = "from:" + handle + " -is:retweet"
        tweetlist = []

        # Fetch tweets using Twitter API pagination
        try:
            for tweet in tweepy.Paginator(client.search_all_tweets,
                                          query=query,
                                          tweet_fields=tweet_fields,
                                          start_time=start_time,
                                          end_time=end_time,
                                          max_results=100).flatten(50):
                tweetlist.append(tweet)
            msg = f"trying to fetch tweets for {handle}{suffix} fetched"
            print(msg)
        except tweepy.error.TweepError as ex:
            timestamp = datetime.now().timestamp()
            msg = f"{timestamp} - raised exception {handle}{suffix}: " + str(ex) + " - sleeping..."
            print(msg)
            time.sleep(1)
            try:
                for tweet in tweepy.Paginator(client.search_all_tweets,
                                              query=query,
                                              tweet_fields=tweet_fields,
                                              start_time=start_time,
                                              end_time=end_time,
                                              max_results=100).flatten(50):
                    tweetlist.append(tweet)
                msg = f"2nd try: tweets for {handle}{suffix} successfully fetched"
                print(msg)
            except tweepy.error.TweepError as ex:
                timestamp = datetime.now().timestamp()
                msg = f"{timestamp} - raised exception AGAIN {handle}{suffix}: " + str(ex) + " - sleeping..."
                print(msg)
                time.sleep(1)

        all_tweets = pd.DataFrame(tweetlist)

        # Check if no tweets fetched for the current time slice
        if len(tweetlist) == 0:
            msg = f"return empty in {handle}{suffix} - from {start_time} to {end_time}"
            print(msg)
            continue

        all_tweets['handle'] = handle

        # Extract referenced_tweet info from column
        all_tweets['referenced_tweet_type'] = None
        all_tweets['referenced_tweet_id'] = None
        if 'referenced_tweets' in all_tweets.columns:
            for index, row in all_tweets.iterrows():
                referenced_tweets = row['referenced_tweets']

                if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
                    referenced_tweet = referenced_tweets[0]
                    referenced_tweet_type = referenced_tweet['type']
                    referenced_tweet_id = referenced_tweet['id']

                    all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
                    all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id

        # Check if tweet contains keyword
        if 'text' in all_tweets.columns:
            all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
                                              .str.join(',')
                                              .replace('', 'none'))

        # Save two versions of the dataset, one with all fields and one without dict fields
        csv_path = f"data/tweets/{handle}{suffix}.csv"
        csv_path2 = f"data/tweets/{handle}{suffix}-LONG.csv"
        all_tweets.to_csv(csv_path2)
        all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"], axis=1)
        all_tweets.to_csv(csv_path)
        time.sleep(1)  # sleep 1 second to not get over api limit
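The referenced_tweets handling just above flattens the first entry of the API's list-of-dicts into two plain columns. A quick illustration with a fabricated row (column names mirror the script, the data is invented):

import pandas as pd

df = pd.DataFrame({"referenced_tweets": [[{"type": "replied_to", "id": "123"}], None]})
df['referenced_tweet_type'] = None
df['referenced_tweet_id'] = None
for index, row in df.iterrows():
    refs = row['referenced_tweets']
    # Only rows whose cell is a non-empty list carry a referenced tweet
    if isinstance(refs, list) and len(refs) > 0:
        df.at[index, 'referenced_tweet_type'] = refs[0]['type']
        df.at[index, 'referenced_tweet_id'] = refs[0]['id']
print(df[['referenced_tweet_type', 'referenced_tweet_id']].values.tolist())
# -> [['replied_to', '123'], [None, None]]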
# Merge CSV-Files
# (it would also have been a possibility to build a dataframe with all senators' tweets but i found the other way around more useful)
@@ -137,12 +191,12 @@ print(tweetfiles)
df_all_senators = pd.DataFrame()
df_all_senators_long = pd.DataFrame()
for file in tweetfiles:
    if "LONG" in file:
        df = pd.read_csv(file)
        df_all_senators_long = pd.concat([df, df_all_senators_long])
    else:
        df = pd.read_csv(file)
        df_all_senators = pd.concat([df, df_all_senators])
csv_path = td + "ALL-SENATORS.csv"
csv_path2 = td + "ALL-SENATORS-LONG-LONG.csv"
df_all_senators.to_csv(csv_path)
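As a side note on the contains_keyword step earlier in the diff, the chained findall/join/replace can be checked in isolation. A small sketch with made-up keywords and tweet texts (pandas only; column and variable names mirror the script, the data is invented):

import pandas as pd

keywords = ["pandemic", "vaccine"]
sample = pd.DataFrame({"text": ["New vaccine data during the pandemic", "Budget hearing today"]})

# findall collects every keyword occurring in a tweet, join flattens the list to a
# comma-separated string, and rows with no match (empty string) become 'none'
sample['contains_keyword'] = (sample['text'].str.findall('|'.join(keywords))
                              .str.join(',')
                              .replace('', 'none'))

print(sample['contains_keyword'].tolist())  # ['vaccine,pandemic', 'none']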