adds try/except block for tweepy Paginator
parent 632f504cc4
commit 0bc42fa862

collect.py (84 lines changed)
@@ -11,6 +11,7 @@ import tweepy
 import pandas as pd
 import numpy as np
 import glob
+import time
 
 ## Setup directories
 # WD Michael
@@ -35,8 +36,29 @@ bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuT
 client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)
 
 # Define time period of interest
-start_time = '2020-01-01T00:00:00Z'
-end_time = '2023-01-03T00:00:00Z'
+# Define time periods of interest
+time_slices = [
+    {
+        "start_time": "2020-01-01T00:00:00Z",
+        "end_time": "2020-06-01T00:00:00Z",
+        "suffix": "-slice1"
+    },
+    {
+        "start_time": "2020-06-01T00:00:01Z",
+        "end_time": "2021-01-01T00:00:00Z",
+        "suffix": "-slice2"
+    },
+    {
+        "start_time": "2021-01-01T00:00:01Z",
+        "end_time": "2021-06-01T00:00:00Z",
+        "suffix": "-slice3"
+    },
+    {
+        "start_time": "2021-06-01T00:00:01Z",
+        "end_time": "2023-01-03T00:00:00Z",
+        "suffix": "-slice4"
+    }
+]
 
 # gather keywords @chenTrackingSocialMedia2020
 # line80 ff: lamsalCoronavirusCOVID19Tweets2020
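Note on the slice boundaries: if the Twitter API v2 treats start_time as inclusive and end_time as exclusive (as the v2 docs describe), starting each slice one second after the previous end leaves a one-second gap between slices. A quick sanity check of the slices and the per-slice file names they produce further down (ExampleSenator is a made-up handle; the path prefix mirrors the csv_path lines in the last hunk):

for slice_data in time_slices:
    # e.g. data/tweets/ExampleSenator-slice1.csv covers 2020-01-01T00:00:00Z to 2020-06-01T00:00:00Z
    print(f"data/tweets/ExampleSenator{slice_data['suffix']}.csv:",
          slice_data["start_time"], "to", slice_data["end_time"])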
@@ -75,10 +97,18 @@ alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
 print(accounts)
 print(alt_accounts)
 
+# Iterate over each Twitter account
 for handle in accounts:
+    for slice_data in time_slices:
+        start_time = slice_data["start_time"]
+        end_time = slice_data["end_time"]
+        suffix = slice_data["suffix"]
+
         query = "from:" + handle + " -is:retweet"
 
         tweetlist = []
+        # Fetch tweets using Twitter API pagination
+        try:
             for tweet in tweepy.Paginator(client.search_all_tweets,
                                           query=query,
                                           tweet_fields=tweet_fields,
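Note on the pagination call: Paginator repeatedly invokes client.search_all_tweets, fetching up to 100 tweets per request (max_results=100), while .flatten(50) yields individual tweets across pages and stops after 50 in total. Because the client is built with return_type=dict, each raw response is a dict; a rough hand-rolled equivalent of the flattening (simplified, for illustration only) would be:

tweets, next_token = [], None
while len(tweets) < 50:
    # One page of up to 100 results; next_token advances the pagination.
    resp = client.search_all_tweets(query=query,
                                    tweet_fields=tweet_fields,
                                    start_time=start_time,
                                    end_time=end_time,
                                    max_results=100,
                                    next_token=next_token)
    tweets.extend(resp.get("data", []))
    next_token = resp.get("meta", {}).get("next_token")
    if next_token is None:
        break
tweets = tweets[:50]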
@@ -86,44 +116,68 @@ for handle in accounts:
                                           end_time=end_time,
                                           max_results=100).flatten(50):
                 tweetlist.append(tweet)
+            msg = f"trying to fetch tweets for {handle}{suffix} fetched"
+            print(msg)
+        except tweepy.error.TweepError as ex:
+            timestamp = datetime.now().timestamp()
+            msg = f"{timestamp} - raised exception {handle}{suffix}: " + str(ex) + " - sleeping..."
+            print(msg)
+            time.sleep(1)
+            try:
+                for tweet in tweepy.Paginator(client.search_all_tweets,
+                                              query=query,
+                                              tweet_fields=tweet_fields,
+                                              start_time=start_time,
+                                              end_time=end_time,
+                                              max_results=100).flatten(50):
+                    tweetlist.append(tweet)
+                msg = f"2nd try: tweets for {handle}{suffix} successfully fetched"
+                print(msg)
+            except tweepy.error.TweepError as ex:
+                timestamp = datetime.now().timestamp()
+                msg = f"{timestamp} - raised exception AGAIN {handle}{suffix}: " + str(ex) + " - sleeping..."
+                print(msg)
+                time.sleep(1)
 
         all_tweets = pd.DataFrame(tweetlist)
-    all_tweets = all_tweets.join(pd.DataFrame(all_tweets.pop("public_metrics").tolist()))
+
+        # Check if no tweets fetched for the current time slice
+        if len(tweetlist) == 0:
+            msg = f"return empty in {handle}{suffix} - from {start_time} to {end_time}"
+            print(msg)
+            continue
 
         all_tweets['handle'] = handle
 
-    ## Extract referenced_tweet info from column
-    # Create empty columns to store the extracted information
+        # Extract referenced_tweet info from column
         all_tweets['referenced_tweet_type'] = None
         all_tweets['referenced_tweet_id'] = None
 
-    # Iterate over each row
+        if 'referenced_tweets' in all_tweets.columns:
             for index, row in all_tweets.iterrows():
                 referenced_tweets = row['referenced_tweets']
 
-        # Check if referenced_tweets is not empty (array length > 0)
                 if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
                     referenced_tweet = referenced_tweets[0]
                     referenced_tweet_type = referenced_tweet['type']
                     referenced_tweet_id = referenced_tweet['id']
 
-            # Assign the extracted values to the new columns
                     all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
                     all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id
 
-    ## Check if tweet contains keyword
-    # Create a new column to store the keyword match
+        # Check if tweet contains keyword
+        if 'text' in all_tweets.columns:
             all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
                                               .str.join(',')
                                               .replace('', 'none'))
 
-    ## Save to versions of the dataset, one with all fields, one without dict fields
-    csv_path = td + handle + ".csv"
-    csv_path2 = td + handle + "-LONG.csv"
+        # Save two versions of the dataset, one with all fields and one without dict fields
+        csv_path = f"data/tweets/{handle}{suffix}.csv"
+        csv_path2 = f"data/tweets/{handle}{suffix}-LONG.csv"
         all_tweets.to_csv(csv_path2)
         all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"], axis=1)
         all_tweets.to_csv(csv_path)
-    print("Fetched tweets for:")
-    print(handle)
+        time.sleep(1) # sleep 1 second to not get over api limit
 
 # Merge CSV-Files
 # (it would also have been a possibility to build a dataframe with all senators' tweets but i found the other way around more useful)
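One caveat against the tweepy docs: tweepy.error.TweepError is the tweepy 3.x exception path and no longer exists in the 4.x releases that provide Client and Paginator, so the except clauses above would raise AttributeError rather than catch anything; the 4.x base class is tweepy.errors.TweepyException. datetime.now() likewise needs from datetime import datetime, which is not among the imports this commit adds (it may already be imported in the part of collect.py not shown). A minimal sketch of the retry logic with the 4.x exception class (fetch_tweets is an illustrative helper name, not part of collect.py):

import time
from datetime import datetime

import tweepy

def fetch_tweets(client, query, tweet_fields, start_time, end_time,
                 limit=50, retries=2):
    # Paginate search_all_tweets, retrying after a short sleep on failure.
    for attempt in range(1, retries + 1):
        try:
            return list(tweepy.Paginator(client.search_all_tweets,
                                         query=query,
                                         tweet_fields=tweet_fields,
                                         start_time=start_time,
                                         end_time=end_time,
                                         max_results=100).flatten(limit))
        except tweepy.errors.TweepyException as ex:
            print(f"{datetime.now().timestamp()} - attempt {attempt} raised {ex} - sleeping...")
            time.sleep(1)
    return []

Since the client is constructed with wait_on_rate_limit=True, tweepy already waits out HTTP 429 rate limits on its own; the time.sleep(1) calls mainly space requests out.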