#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Jun 6 11:40:07 2023

@author: michael
'''

import os
import tweepy
import pandas as pd
import numpy as np
import glob
import time

## Setup directories
# WD Michael
wd = '/home/michael/Documents/PS/Data/collectTweets/'
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection'
# WD Josie
# wd = '/home/michael/Documents/PS/Data/'
# WD Sam
# wd = '/home/michael/Documents/PS/Data/'

# Tweet-datafile directory
td = 'data/tweets/'

os.chdir(wd)

## Setup API connection
bearer_token = 'AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc'
client = tweepy.Client(bearer_token, return_type=dict, wait_on_rate_limit=True)

# Define time periods of interest
time_slices = [
    {
        'start_time': '2020-01-01T00:00:00Z',
        'end_time': '2020-06-01T00:00:00Z',
        'suffix': '-slice1'
    },
    {
        'start_time': '2020-06-01T00:00:01Z',
        'end_time': '2021-01-01T00:00:00Z',
        'suffix': '-slice2'
    },
    {
        'start_time': '2021-01-01T00:00:01Z',
        'end_time': '2021-06-01T00:00:00Z',
        'suffix': '-slice3'
    },
    {
        'start_time': '2021-06-01T00:00:01Z',
        'end_time': '2023-01-03T00:00:00Z',
        'suffix': '-slice4'
    }
]

# Gather keywords @chenTrackingSocialMedia2020
# line 80 ff: lamsalCoronavirusCOVID19Tweets2020

# Initialize the keywords list
keywords = []

# Read the keywords from a file (one keyword per line)
with open('data/keywords.txt', 'r') as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip()  # Remove the newline character
        keywords.append(keyword)

tweet_fields = [
    'id',
    'text',
    'attachments',
    'author_id',
    'context_annotations',
    'conversation_id',
    'created_at',
    'entities',
    'geo',
    'lang',
    'possibly_sensitive',
    'public_metrics',
    'referenced_tweets',
    'reply_settings',
    'source',
    'withheld',
]

# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
print(accounts)
print(alt_accounts)

# Iterate over each Twitter account
for handle in accounts:
    for slice_data in time_slices:
        # define slice data variables from time_slices
        start_time = slice_data['start_time']
        end_time = slice_data['end_time']
        suffix = slice_data['suffix']

        # define tweepy query with twitter handle of current sen
        query = f'from:{handle} -is:retweet'

        # create empty tweetlist that will be filled with tweets of current sen
        tweetlist = []

        # statusmsg
        msg = f'trying to fetch tweets for {handle}{suffix}'
        print(msg)

        # Fetch tweets using tweepy Twitter API v2 pagination with retry mechanism
        max_attempts = 3  # maximum number of attempts to fetch tweets for a slice
        attempt = 1

        while attempt <= max_attempts:
            try:
                tweets = tweepy.Paginator(client.search_all_tweets,
                                          query=query,
                                          tweet_fields=tweet_fields,
                                          start_time=start_time,
                                          end_time=end_time,
                                          max_results=20).flatten(20)

                # for each tweet returned...
                for tweet in tweets:
                    # ... add that tweet to tweetlist
                    tweetlist.append(tweet)

                break  # exit the retry loop if tweets are successfully fetched

            except tweepy.HTTPException as e:
                # handle rate limit exceeded error
                if e.response.status_code == 429:
                    # get the rate limit reset time from the response headers
                    reset_time = int(e.response.headers['x-rate-limit-reset'])
                    current_time = int(time.time())

                    # calculate the sleep time until the rate limit resets
                    sleep_time = reset_time - current_time + 1  # add an extra second

                    # sleep until the rate limit resets
                    time.sleep(sleep_time)

                    attempt += 1  # increment the attempt counter
                    continue  # retry the API call
                else:
                    # handle other types of Tweepy errors
                    print(f'Error occurred: {e}')
                    break

        # Check if no tweets were fetched for the current time slice.
        # If there are none, skip to the next time_slices loop iteration
        if len(tweetlist) == 0:
            msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'
            print(msg)
            continue

        # convert to dataframe
        tweet_df = pd.DataFrame(tweetlist)

        # add handle column as api only provides user-ids
        tweet_df['handle'] = handle

        ## Extract referenced_tweet info from column
        tweet_df['referenced_tweet_type'] = None
        tweet_df['referenced_tweet_id'] = None

        # if cond. because in some cases column doesn't exist
        if 'referenced_tweets' in tweet_df.columns:
            for index, row in tweet_df.iterrows():
                referenced_tweets = row['referenced_tweets']

                if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
                    referenced_tweet = referenced_tweets[0]
                    referenced_tweet_type = referenced_tweet['type']
                    referenced_tweet_id = referenced_tweet['id']

                    tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type
                    tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id

        ## Check if tweet-text contains keyword
        # if cond. because in some cases column doesn't exist
        if 'text' in tweet_df.columns:
            tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))
                                            .str.join(',')
                                            .replace('', 'none'))

        ## Save two versions of the dataset, one with all fields and one without dict fields
        # define filepaths
        csv_path = f'data/tweets/{handle}{suffix}.csv'
        csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'

        # save LONG csv
        tweet_df.to_csv(csv_path2)

        # Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files
        # if cond. because in some cases column doesn't exist
        if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):
            tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)

        # save short csv
        tweet_df.to_csv(csv_path)

        # sleep 1 second to not exceed the API rate limit
        time.sleep(1)

# Merge CSV-Files
# (it would also have been possible to build one dataframe with all senators' tweets,
# but merging the per-senator files afterwards turned out to be more useful)
path_to_tweetdfs = wd + td
os.chdir(path_to_tweetdfs)
tweetfiles = glob.glob('*.csv')
print(tweetfiles)

# save merged csv as two files
df_all_senators = pd.DataFrame()
df_all_senators_long = pd.DataFrame()
for file in tweetfiles:
    if 'LONG' in file:
        df = pd.read_csv(file)
        df_all_senators_long = pd.concat([df, df_all_senators_long])
    else:
        df = pd.read_csv(file)
        df_all_senators = pd.concat([df, df_all_senators])

# cwd is already the tweet directory, so write the merged files here
csv_path = 'ALL-SENATORS.csv'
csv_path2 = 'ALL-SENATORS-LONG.csv'
df_all_senators.to_csv(csv_path)
df_all_senators_long.to_csv(csv_path2)

os.chdir(wd)