diff --git a/collect.py b/collect.py
index bb5db8d..4c7af04 100644
--- a/collect.py
+++ b/collect.py
@@ -1,10 +1,10 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""
+'''
 Created on Tue Jun 6 11:40:07 2023
 
 @author: michael
-"""
+'''
 
 import os
 import tweepy
@@ -15,48 +15,48 @@ import time
 
 ## Setup directories
 # WD Michael
-wd = "/home/michael/Documents/PS/Data/collectTweets/"
+wd = '/home/michael/Documents/PS/Data/collectTweets/'
 # WD Server
-# wd = "/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection"
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection'
 # WD Josie
-# wd = "/home/michael/Documents/PS/Data/"
+# wd = '/home/michael/Documents/PS/Data/'
 # WD Sam
-# wd = "/home/michael/Documents/PS/Data/"
+# wd = '/home/michael/Documents/PS/Data/'
 # Tweet-datafile directory
-td = "data/tweets/"
+td = 'data/tweets/'
 
 os.chdir(wd)
 
 ## Setup Api-connection
-bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc"
+bearer_token = 'AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc'
 client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)
 
 # Define time period of interest
 # Define time periods of interest
 time_slices = [
     {
-        "start_time": "2020-01-01T00:00:00Z",
-        "end_time": "2020-06-01T00:00:00Z",
-        "suffix": "-slice1"
+        'start_time': '2020-01-01T00:00:00Z',
+        'end_time': '2020-06-01T00:00:00Z',
+        'suffix': '-slice1'
     },
     {
-        "start_time": "2020-06-01T00:00:01Z",
-        "end_time": "2021-01-01T00:00:00Z",
-        "suffix": "-slice2"
+        'start_time': '2020-06-01T00:00:01Z',
+        'end_time': '2021-01-01T00:00:00Z',
+        'suffix': '-slice2'
     },
     {
-        "start_time": "2021-01-01T00:00:01Z",
-        "end_time": "2021-06-01T00:00:00Z",
-        "suffix": "-slice3"
+        'start_time': '2021-01-01T00:00:01Z',
+        'end_time': '2021-06-01T00:00:00Z',
+        'suffix': '-slice3'
     },
     {
-        "start_time": "2021-06-01T00:00:01Z",
-        "end_time": "2023-01-03T00:00:00Z",
-        "suffix": "-slice4"
+        'start_time': '2021-06-01T00:00:01Z',
+        'end_time': '2023-01-03T00:00:00Z',
+        'suffix': '-slice4'
     }
 ]
@@ -66,95 +66,87 @@ time_slices = [
 keywords = []
 
 # Read the keywords from a file
-with open("data/keywords.txt", "r") as file:
+with open('data/keywords.txt', 'r') as file:
     lines = file.readlines()
     for line in lines:
        keyword = line.strip() # Remove the newline character
        keywords.append(keyword)
 
 tweet_fields = [
-    "id",
-    "text",
-    "attachments",
-    "author_id",
-    "context_annotations",
-    "conversation_id",
-    "created_at",
-    "entities",
-    "geo",
-    "lang",
-    "possibly_sensitive",
-    "public_metrics",
-    "referenced_tweets",
-    "reply_settings",
-    "source",
-    "withheld",
+    'id',
+    'text',
+    'attachments',
+    'author_id',
+    'context_annotations',
+    'conversation_id',
+    'created_at',
+    'entities',
+    'geo',
+    'lang',
+    'possibly_sensitive',
+    'public_metrics',
+    'referenced_tweets',
+    'reply_settings',
+    'source',
+    'withheld',
 ]
 
 # Get accounts & alt-accounts from Senators-Datafile
-accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
-alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
+accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
+alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
 print(accounts)
 print(alt_accounts)
 
 # Iterate over each Twitter account
 for handle in accounts:
     for slice_data in time_slices:
-        start_time = slice_data["start_time"]
-        end_time = slice_data["end_time"]
-        suffix = slice_data["suffix"]
+        # define slice data variables from time_slices
+        start_time = slice_data['start_time']
+        end_time = slice_data['end_time']
+        suffix = slice_data['suffix']
 
-        query = "from:" + handle + " -is:retweet"
+        # define tweepy query with twitter handle of current sen
+        query = f'from:{handle} -is:retweet'
 
+        # create empty tweetlist that will be filled with tweets of current sen
         tweetlist = []
 
-        # Fetch tweets using Twitter API pagination
-        try:
-            for tweet in tweepy.Paginator(client.search_all_tweets,
-                                          query=query,
-                                          tweet_fields=tweet_fields,
-                                          start_time=start_time,
-                                          end_time=end_time,
-                                          max_results=100).flatten(50):
-                tweetlist.append(tweet)
-            msg = f"trying to fetch tweets for {handle}{suffix} fetched"
-            print(msg)
-        except tweepy.error.TweepError as ex:
-            timestamp = datetime.now().timestamp()
-            msg = f"{timestamp} - raised exception {handle}{suffix}: " + str(ex) + " - sleeping..."
-            print(msg)
-            time.sleep(1)
-            try:
-                for tweet in tweepy.Paginator(client.search_all_tweets,
-                                              query=query,
-                                              tweet_fields=tweet_fields,
-                                              start_time=start_time,
-                                              end_time=end_time,
-                                              max_results=100).flatten(50):
-                    tweetlist.append(tweet)
-                msg = f"2nd try: tweets for {handle}{suffix} successfully fetched"
-                print(msg)
-            except tweepy.error.TweepError as ex:
-                timestamp = datetime.now().timestamp()
-                msg = f"{timestamp} - raised exception AGAIN {handle}{suffix}: " + str(ex) + " - sleeping..."
-                print(msg)
-                time.sleep(1)
-        all_tweets = pd.DataFrame(tweetlist)
+        # statusmsg
+        msg = f'trying to fetch tweets for {handle}{suffix}'
+        print(msg)
 
-        # Check if no tweets fetched for the current time slice
+        # Fetch tweets using tweepy Twitter API v2 pagination
+        tweets = tweepy.Paginator(client.search_all_tweets,
+                                  query=query,
+                                  tweet_fields=tweet_fields,
+                                  start_time=start_time,
+                                  end_time=end_time,
+                                  max_results=20).flatten(20)
+
+        # for each tweet returned...
+        for tweet in tweets:
+            # ... add that tweet to tweetlist
+            tweetlist.append(tweet)
+
+        # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration
         if len(tweetlist) == 0:
-            msg = f"return empty in {handle}{suffix} - from {start_time} to {end_time}"
+            msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'
             print(msg)
             continue
 
-        all_tweets['handle'] = handle
+        # convert to dataframe
+        tweet_df = pd.DataFrame(tweetlist)
 
-        # Extract referenced_tweet info from column
-        all_tweets['referenced_tweet_type'] = None
-        all_tweets['referenced_tweet_id'] = None
+        # add handle column as api only provides user-ids
+        tweet_df['handle'] = handle
 
-        if 'referenced_tweets' in all_tweets.columns:
-            for index, row in all_tweets.iterrows():
+        ## Extract referenced_tweet info from column
+        tweet_df['referenced_tweet_type'] = None
+        tweet_df['referenced_tweet_id'] = None
+
+        # if cond. because in some cases column doesn't exist
+        if 'referenced_tweets' in tweet_df.columns:
+            for index, row in tweet_df.iterrows():
                 referenced_tweets = row['referenced_tweets']
 
                 if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
@@ -162,28 +154,36 @@ for handle in accounts:
                     referenced_tweet_type = referenced_tweet['type']
                     referenced_tweet_id = referenced_tweet['id']
 
-                    all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
-                    all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id
+                    tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type
+                    tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id
 
-        # Check if tweet contains keyword
-        if 'text' in all_tweets.columns:
-            all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
+        ## Check if tweet-text contains keyword
+        # if cond. because in some cases column doesn't exist
+        if 'text' in tweet_df.columns:
+            tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))
                                             .str.join(',')
                                             .replace('', 'none'))
 
-        # Save two versions of the dataset, one with all fields and one without dict fields
-        csv_path = f"data/tweets/{handle}{suffix}.csv"
-        csv_path2 = f"data/tweets/{handle}{suffix}-LONG.csv"
-        all_tweets.to_csv(csv_path2)
-        all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"], axis=1)
-        all_tweets.to_csv(csv_path)
-        time.sleep(1) # sleep 1 second to not get over api limit
+        ## Save two versions of the dataset, one with all fields and one without dict fields
+        # define filepaths
+        csv_path = f'data/tweets/{handle}{suffix}.csv'
+        csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'
+        # save LONG csv
+        tweet_df.to_csv(csv_path2)
+        # Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files
+        # if cond. because in some cases column doesn't exist
+        if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):
+            tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)
+        # save short csv
+        tweet_df.to_csv(csv_path)
+        # sleep 1 second to not get over 1sec api limit
+        time.sleep(1)
 
 # Merge CSV-Files
 # (it would also have been a possibility to build a dataframe with all senators' tweets but i found the other way around more useful)
 path_to_tweetdfs = wd + td
 os.chdir(path_to_tweetdfs)
-tweetfiles = glob.glob('*.{}'.format("csv"))
+tweetfiles = glob.glob('*.{}'.format('csv'))
 
 print(tweetfiles)
@@ -191,14 +191,14 @@ print(tweetfiles)
 df_all_senators = pd.DataFrame()
 df_all_senators_long = pd.DataFrame()
 for file in tweetfiles:
-    if "LONG" in file:
+    if 'LONG' in file:
         df = pd.read_csv(file)
         df_all_senators_long = pd.concat([df, df_all_senators_long])
     else:
         df = pd.read_csv(file)
         df_all_senators = pd.concat([df, df_all_senators])
 
-csv_path = td + "ALL-SENATORS.csv"
-csv_path2 = td + "ALL-SENATORS-LONG-LONG.csv"
+csv_path = td + 'ALL-SENATORS.csv'
+csv_path2 = td + 'ALL-SENATORS-LONG-LONG.csv'
 df_all_senators.to_csv(csv_path)
 df_all_senators_long.to_csv(csv_path2)
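Note on the fetch refactor above: the new code drops the old try/except retry around the Paginator call. The removed handler caught `tweepy.error.TweepError`, which does not exist in tweepy v4 (the v4 exception classes live in `tweepy.errors`). If retry behaviour is still wanted, a minimal sketch along these lines could be reinstated. This assumes tweepy v4; the `fetch_tweets` helper name, the retry count, and the sleep interval are illustrative and not part of collect.py.

```python
import time
import tweepy

def fetch_tweets(client, query, tweet_fields, start_time, end_time, retries=1):
    # Illustrative helper (not part of collect.py): fetch one handle/time-slice
    # query via the v2 full-archive search, retrying once on a tweepy error.
    for attempt in range(retries + 1):
        try:
            paginator = tweepy.Paginator(client.search_all_tweets,
                                         query=query,
                                         tweet_fields=tweet_fields,
                                         start_time=start_time,
                                         end_time=end_time,
                                         max_results=20)
            return list(paginator.flatten(20))
        except tweepy.errors.TweepyException as ex:  # tweepy v4 base exception
            print(f'attempt {attempt + 1} failed for query {query}: {ex} - sleeping...')
            time.sleep(1)
    return []  # all attempts failed; caller's empty-list check will skip this slice
```

Inside the slice loop this would replace the Paginator call and the append loop with `tweetlist = fetch_tweets(client, query, tweet_fields, start_time, end_time)`, keeping the existing empty-list check unchanged.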