adds try/except block for tweepy Paginator
parent 632f504cc4
commit 0bc42fa862

collect.py (84 lines changed)
@@ -11,6 +11,7 @@ import tweepy
 import pandas as pd
 import numpy as np
 import glob
+import time
 
 ## Setup directories
 # WD Michael
@@ -35,8 +36,29 @@ bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuT
 client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)
 
 # Define time period of interest
-start_time = '2020-01-01T00:00:00Z'
-end_time = '2023-01-03T00:00:00Z'
+# Define time periods of interest
+time_slices = [
+    {
+        "start_time": "2020-01-01T00:00:00Z",
+        "end_time": "2020-06-01T00:00:00Z",
+        "suffix": "-slice1"
+    },
+    {
+        "start_time": "2020-06-01T00:00:01Z",
+        "end_time": "2021-01-01T00:00:00Z",
+        "suffix": "-slice2"
+    },
+    {
+        "start_time": "2021-01-01T00:00:01Z",
+        "end_time": "2021-06-01T00:00:00Z",
+        "suffix": "-slice3"
+    },
+    {
+        "start_time": "2021-06-01T00:00:01Z",
+        "end_time": "2023-01-03T00:00:00Z",
+        "suffix": "-slice4"
+    }
+]
 
 # gather keywords @chenTrackingSocialMedia2020
 # line80 ff: lamsalCoronavirusCOVID19Tweets2020
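Note on the slice boundaries: if the Twitter API v2 treats start_time as inclusive and end_time as exclusive (as the v2 docs describe), starting each slice one second after the previous end leaves a one-second gap between slices. A quick sanity check of the slices and the per-slice file names they produce further down (ExampleSenator is a made-up handle; the path prefix mirrors the csv_path lines in the last hunk):

for slice_data in time_slices:
    # e.g. data/tweets/ExampleSenator-slice1.csv covers 2020-01-01T00:00:00Z to 2020-06-01T00:00:00Z
    print(f"data/tweets/ExampleSenator{slice_data['suffix']}.csv:",
          slice_data["start_time"], "to", slice_data["end_time"])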
@@ -75,10 +97,18 @@ alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
 print(accounts)
 print(alt_accounts)
 
+# Iterate over each Twitter account
 for handle in accounts:
+    for slice_data in time_slices:
+        start_time = slice_data["start_time"]
+        end_time = slice_data["end_time"]
+        suffix = slice_data["suffix"]
+
         query = "from:" + handle + " -is:retweet"
 
         tweetlist = []
+        # Fetch tweets using Twitter API pagination
+        try:
             for tweet in tweepy.Paginator(client.search_all_tweets,
                                           query=query,
                                           tweet_fields=tweet_fields,
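Note on the pagination call: Paginator repeatedly invokes client.search_all_tweets, fetching up to 100 tweets per request (max_results=100), while .flatten(50) yields individual tweets across pages and stops after 50 in total. Because the client is built with return_type=dict, each raw response is a dict; a rough hand-rolled equivalent of the flattening (simplified, for illustration only) would be:

tweets, next_token = [], None
while len(tweets) < 50:
    # One page of up to 100 results; next_token advances the pagination.
    resp = client.search_all_tweets(query=query,
                                    tweet_fields=tweet_fields,
                                    start_time=start_time,
                                    end_time=end_time,
                                    max_results=100,
                                    next_token=next_token)
    tweets.extend(resp.get("data", []))
    next_token = resp.get("meta", {}).get("next_token")
    if next_token is None:
        break
tweets = tweets[:50]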
@@ -86,44 +116,68 @@ for handle in accounts:
                                           end_time=end_time,
                                           max_results=100).flatten(50):
                 tweetlist.append(tweet)
+            msg = f"trying to fetch tweets for {handle}{suffix} fetched"
+            print(msg)
+        except tweepy.error.TweepError as ex:
+            timestamp = datetime.now().timestamp()
+            msg = f"{timestamp} - raised exception {handle}{suffix}: " + str(ex) + " - sleeping..."
+            print(msg)
+            time.sleep(1)
+            try:
+                for tweet in tweepy.Paginator(client.search_all_tweets,
+                                              query=query,
+                                              tweet_fields=tweet_fields,
+                                              start_time=start_time,
+                                              end_time=end_time,
+                                              max_results=100).flatten(50):
+                    tweetlist.append(tweet)
+                msg = f"2nd try: tweets for {handle}{suffix} successfully fetched"
+                print(msg)
+            except tweepy.error.TweepError as ex:
+                timestamp = datetime.now().timestamp()
+                msg = f"{timestamp} - raised exception AGAIN {handle}{suffix}: " + str(ex) + " - sleeping..."
+                print(msg)
+                time.sleep(1)
 
         all_tweets = pd.DataFrame(tweetlist)
-    all_tweets = all_tweets.join(pd.DataFrame(all_tweets.pop("public_metrics").tolist()))
+
+        # Check if no tweets fetched for the current time slice
+        if len(tweetlist) == 0:
+            msg = f"return empty in {handle}{suffix} - from {start_time} to {end_time}"
+            print(msg)
+            continue
 
         all_tweets['handle'] = handle
 
-    ## Extract referenced_tweet info from column
-    # Create empty columns to store the extracted information
+        # Extract referenced_tweet info from column
         all_tweets['referenced_tweet_type'] = None
         all_tweets['referenced_tweet_id'] = None
 
-    # Iterate over each row
+        if 'referenced_tweets' in all_tweets.columns:
             for index, row in all_tweets.iterrows():
                 referenced_tweets = row['referenced_tweets']
 
-        # Check if referenced_tweets is not empty (array length > 0)
                 if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
                     referenced_tweet = referenced_tweets[0]
                     referenced_tweet_type = referenced_tweet['type']
                     referenced_tweet_id = referenced_tweet['id']
 
-            # Assign the extracted values to the new columns
                     all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
                     all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id
 
-    ## Check if tweet contains keyword
-    # Create a new column to store the keyword match
+        # Check if tweet contains keyword
+        if 'text' in all_tweets.columns:
             all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
                                               .str.join(',')
                                               .replace('', 'none'))
 
-    ## Save to versions of the dataset, one with all fields, one without dict fields
-    csv_path = td + handle + ".csv"
-    csv_path2 = td + handle + "-LONG.csv"
+        # Save two versions of the dataset, one with all fields and one without dict fields
+        csv_path = f"data/tweets/{handle}{suffix}.csv"
+        csv_path2 = f"data/tweets/{handle}{suffix}-LONG.csv"
         all_tweets.to_csv(csv_path2)
         all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"], axis=1)
         all_tweets.to_csv(csv_path)
-    print("Fetched tweets for:")
-    print(handle)
+        time.sleep(1) # sleep 1 second to not get over api limit
 
 # Merge CSV-Files
 # (it would also have been a possibility to build a dataframe with all senators' tweets but i found the other way around more useful)
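One caveat against the tweepy docs: tweepy.error.TweepError is the tweepy 3.x exception path and no longer exists in the 4.x releases that provide Client and Paginator, so the except clauses above would raise AttributeError rather than catch anything; the 4.x base class is tweepy.errors.TweepyException. datetime.now() likewise needs from datetime import datetime, which is not among the imports this commit adds (it may already be imported in the part of collect.py not shown). A minimal sketch of the retry logic with the 4.x exception class (fetch_tweets is an illustrative helper name, not part of collect.py):

import time
from datetime import datetime

import tweepy

def fetch_tweets(client, query, tweet_fields, start_time, end_time,
                 limit=50, retries=2):
    # Paginate search_all_tweets, retrying after a short sleep on failure.
    for attempt in range(1, retries + 1):
        try:
            return list(tweepy.Paginator(client.search_all_tweets,
                                         query=query,
                                         tweet_fields=tweet_fields,
                                         start_time=start_time,
                                         end_time=end_time,
                                         max_results=100).flatten(limit))
        except tweepy.errors.TweepyException as ex:
            print(f"{datetime.now().timestamp()} - attempt {attempt} raised {ex} - sleeping...")
            time.sleep(1)
    return []

Since the client is constructed with wait_on_rate_limit=True, tweepy already waits out HTTP 429 rate limits on its own; the time.sleep(1) calls mainly space requests out.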