adds try except block for tweepy paginator

Michael Beck 2023-06-07 19:37:01 +02:00
parent 632f504cc4
commit 0bc42fa862

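For context, a minimal sketch of the pattern this commit introduces: each tweepy.Paginator call over search_all_tweets is wrapped in try/except and retried once after a short sleep. This is only an illustration, not the repository's code: it assumes tweepy v4, where the exception class is tweepy.errors.TweepyException (the committed code catches tweepy.error.TweepError, the tweepy v3 name, and calls datetime.now(), which only works if datetime is imported elsewhere in the script); fetch_tweets, the handle, and the placeholder bearer token are made up here.

import time
from datetime import datetime

import tweepy

# Placeholder credentials; the real script defines its own bearer token.
client = tweepy.Client("YOUR_BEARER_TOKEN", wait_on_rate_limit=True)

def fetch_tweets(query, start_time, end_time, label, retries=1):
    """Collect up to 50 tweets for one query/time slice, retrying once on a tweepy error."""
    tweets = []
    for _ in range(retries + 1):
        try:
            tweets = list(tweepy.Paginator(client.search_all_tweets,
                                           query=query,
                                           start_time=start_time,
                                           end_time=end_time,
                                           max_results=100).flatten(limit=50))
            print(f"fetched {len(tweets)} tweets for {label}")
            break
        except tweepy.errors.TweepyException as ex:
            # Log the failure with a timestamp, wait briefly, then try once more.
            print(f"{datetime.now().timestamp()} - exception for {label}: {ex} - sleeping...")
            time.sleep(1)
    return tweets

fetch_tweets("from:somehandle -is:retweet", "2020-01-01T00:00:00Z", "2020-06-01T00:00:00Z", "somehandle-slice1")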

@@ -11,6 +11,7 @@ import tweepy
import pandas as pd
import numpy as np
import glob
import time

## Setup directories
# WD Michael
@@ -35,8 +36,29 @@ bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuT
client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)

# Define time periods of interest
time_slices = [
    {
        "start_time": "2020-01-01T00:00:00Z",
        "end_time": "2020-06-01T00:00:00Z",
        "suffix": "-slice1"
    },
    {
        "start_time": "2020-06-01T00:00:01Z",
        "end_time": "2021-01-01T00:00:00Z",
        "suffix": "-slice2"
    },
    {
        "start_time": "2021-01-01T00:00:01Z",
        "end_time": "2021-06-01T00:00:00Z",
        "suffix": "-slice3"
    },
    {
        "start_time": "2021-06-01T00:00:01Z",
        "end_time": "2023-01-03T00:00:00Z",
        "suffix": "-slice4"
    }
]

# gather keywords @chenTrackingSocialMedia2020
# line80 ff: lamsalCoronavirusCOVID19Tweets2020
@@ -51,23 +73,23 @@ with open("data/keywords.txt", "r") as file:
        keywords.append(keyword)

tweet_fields = [
    "id",
    "text",
    "attachments",
    "author_id",
    "context_annotations",
    "conversation_id",
    "created_at",
    "entities",
    "geo",
    "lang",
    "possibly_sensitive",
    "public_metrics",
    "referenced_tweets",
    "reply_settings",
    "source",
    "withheld",
]

# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
@@ -75,55 +97,87 @@ alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
print(accounts)
print(alt_accounts)

# Iterate over each Twitter account
for handle in accounts:
    for slice_data in time_slices:
        start_time = slice_data["start_time"]
        end_time = slice_data["end_time"]
        suffix = slice_data["suffix"]

        query = "from:" + handle + " -is:retweet"
        tweetlist = []

        # Fetch tweets using Twitter API pagination
        try:
            for tweet in tweepy.Paginator(client.search_all_tweets,
                                          query=query,
                                          tweet_fields=tweet_fields,
                                          start_time=start_time,
                                          end_time=end_time,
                                          max_results=100).flatten(50):
                tweetlist.append(tweet)
            msg = f"trying to fetch tweets for {handle}{suffix} fetched"
            print(msg)
        except tweepy.error.TweepError as ex:
            timestamp = datetime.now().timestamp()
            msg = f"{timestamp} - raised exception {handle}{suffix}: " + str(ex) + " - sleeping..."
            print(msg)
            time.sleep(1)
            try:
                for tweet in tweepy.Paginator(client.search_all_tweets,
                                              query=query,
                                              tweet_fields=tweet_fields,
                                              start_time=start_time,
                                              end_time=end_time,
                                              max_results=100).flatten(50):
                    tweetlist.append(tweet)
                msg = f"2nd try: tweets for {handle}{suffix} successfully fetched"
                print(msg)
            except tweepy.error.TweepError as ex:
                timestamp = datetime.now().timestamp()
                msg = f"{timestamp} - raised exception AGAIN {handle}{suffix}: " + str(ex) + " - sleeping..."
                print(msg)
                time.sleep(1)

        all_tweets = pd.DataFrame(tweetlist)

        # Check if no tweets fetched for the current time slice
        if len(tweetlist) == 0:
            msg = f"return empty in {handle}{suffix} - from {start_time} to {end_time}"
            print(msg)
            continue

        all_tweets['handle'] = handle

        # Extract referenced_tweet info from column
        all_tweets['referenced_tweet_type'] = None
        all_tweets['referenced_tweet_id'] = None
        if 'referenced_tweets' in all_tweets.columns:
            for index, row in all_tweets.iterrows():
                referenced_tweets = row['referenced_tweets']

                if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
                    referenced_tweet = referenced_tweets[0]
                    referenced_tweet_type = referenced_tweet['type']
                    referenced_tweet_id = referenced_tweet['id']

                    all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
                    all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id

        # Check if tweet contains keyword
        if 'text' in all_tweets.columns:
            all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
                                              .str.join(',')
                                              .replace('', 'none'))

        # Save two versions of the dataset, one with all fields and one without dict fields
        csv_path = f"data/tweets/{handle}{suffix}.csv"
        csv_path2 = f"data/tweets/{handle}{suffix}-LONG.csv"
        all_tweets.to_csv(csv_path2)
        all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"], axis=1)
        all_tweets.to_csv(csv_path)
        time.sleep(1)  # sleep 1 second to not get over api limit
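The referenced_tweets handling just above flattens the first entry of the API's list-of-dicts into two plain columns. A quick illustration with a fabricated row (column names mirror the script, the data is invented):

import pandas as pd

df = pd.DataFrame({"referenced_tweets": [[{"type": "replied_to", "id": "123"}], None]})
df['referenced_tweet_type'] = None
df['referenced_tweet_id'] = None
for index, row in df.iterrows():
    refs = row['referenced_tweets']
    # Only rows whose cell is a non-empty list carry a referenced tweet
    if isinstance(refs, list) and len(refs) > 0:
        df.at[index, 'referenced_tweet_type'] = refs[0]['type']
        df.at[index, 'referenced_tweet_id'] = refs[0]['id']
print(df[['referenced_tweet_type', 'referenced_tweet_id']].values.tolist())
# -> [['replied_to', '123'], [None, None]]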
# Merge CSV-Files
# (it would also have been a possibility to build a dataframe with all senators' tweets but i found the other way around more useful)
@@ -137,12 +191,12 @@ print(tweetfiles)
df_all_senators = pd.DataFrame()
df_all_senators_long = pd.DataFrame()
for file in tweetfiles:
    if "LONG" in file:
        df = pd.read_csv(file)
        df_all_senators_long = pd.concat([df, df_all_senators_long])
    else:
        df = pd.read_csv(file)
        df_all_senators = pd.concat([df, df_all_senators])
csv_path = td + "ALL-SENATORS.csv"
csv_path2 = td + "ALL-SENATORS-LONG-LONG.csv"
df_all_senators.to_csv(csv_path)
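As a side note on the contains_keyword step earlier in the diff, the chained findall/join/replace can be checked in isolation. A small sketch with made-up keywords and tweet texts (pandas only; column and variable names mirror the script, the data is invented):

import pandas as pd

keywords = ["pandemic", "vaccine"]
sample = pd.DataFrame({"text": ["New vaccine data during the pandemic", "Budget hearing today"]})

# findall collects every keyword occurring in a tweet, join flattens the list to a
# comma-separated string, and rows with no match (empty string) become 'none'
sample['contains_keyword'] = (sample['text'].str.findall('|'.join(keywords))
                              .str.join(',')
                              .replace('', 'none'))

print(sample['contains_keyword'].tolist())  # ['vaccine,pandemic', 'none']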