#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 6 11:40:07 2023

@author: michael
"""
import os
import glob
import time
from datetime import datetime

import tweepy
import pandas as pd
import numpy as np

## Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = "/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection"
# WD Josie
# wd = "/home/michael/Documents/PS/Data/"
# WD Sam
# wd = "/home/michael/Documents/PS/Data/"

# Tweet-datafile directory
td = "data/tweets/"

os.chdir(wd)

## Setup API connection
bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc"
client = tweepy.Client(bearer_token, return_type=dict, wait_on_rate_limit=True)

# Define time periods of interest
time_slices = [
    {
        "start_time": "2020-01-01T00:00:00Z",
        "end_time": "2020-06-01T00:00:00Z",
        "suffix": "-slice1",
    },
    {
        "start_time": "2020-06-01T00:00:01Z",
        "end_time": "2021-01-01T00:00:00Z",
        "suffix": "-slice2",
    },
    {
        "start_time": "2021-01-01T00:00:01Z",
        "end_time": "2021-06-01T00:00:00Z",
        "suffix": "-slice3",
    },
    {
        "start_time": "2021-06-01T00:00:01Z",
        "end_time": "2023-01-03T00:00:00Z",
        "suffix": "-slice4",
    },
]

# gather keywords @chenTrackingSocialMedia2020
# line 80 ff.: lamsalCoronavirusCOVID19Tweets2020

# Initialize the keywords list
keywords = []

# Read the keywords from a file, one keyword per line
with open("data/keywords.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip()  # strip the trailing newline
        keywords.append(keyword)

tweet_fields = [
    "id",
    "text",
    "attachments",
    "author_id",
    "context_annotations",
    "conversation_id",
    "created_at",
    "entities",
    "geo",
    "lang",
    "possibly_sensitive",
    "public_metrics",
    "referenced_tweets",
    "reply_settings",
    "source",
    "withheld",
]

# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
print(accounts)
print(alt_accounts)

# Iterate over each Twitter account and time slice
for handle in accounts:
    for slice_data in time_slices:
        start_time = slice_data["start_time"]
        end_time = slice_data["end_time"]
        suffix = slice_data["suffix"]

        query = "from:" + handle + " -is:retweet"

        tweetlist = []
        # Fetch tweets using Twitter API pagination
        # (flatten(50) caps the result at 50 tweets per handle and slice)
        try:
            for tweet in tweepy.Paginator(client.search_all_tweets,
                                          query=query,
                                          tweet_fields=tweet_fields,
                                          start_time=start_time,
                                          end_time=end_time,
                                          max_results=100).flatten(50):
                tweetlist.append(tweet)
            msg = f"tweets for {handle}{suffix} fetched"
            print(msg)
        except tweepy.errors.TweepyException as ex:
            timestamp = datetime.now().timestamp()
            msg = f"{timestamp} - raised exception {handle}{suffix}: {ex} - sleeping..."
            print(msg)
            time.sleep(1)
            # Retry once after a short pause
            try:
                for tweet in tweepy.Paginator(client.search_all_tweets,
                                              query=query,
                                              tweet_fields=tweet_fields,
                                              start_time=start_time,
                                              end_time=end_time,
                                              max_results=100).flatten(50):
                    tweetlist.append(tweet)
                msg = f"2nd try: tweets for {handle}{suffix} successfully fetched"
                print(msg)
            except tweepy.errors.TweepyException as ex:
                timestamp = datetime.now().timestamp()
                msg = f"{timestamp} - raised exception AGAIN {handle}{suffix}: {ex} - sleeping..."
                print(msg)
                time.sleep(1)

        # Skip this time slice if no tweets were fetched
        if len(tweetlist) == 0:
            msg = f"no tweets returned for {handle}{suffix} - from {start_time} to {end_time}"
            print(msg)
            continue

        all_tweets = pd.DataFrame(tweetlist)
        all_tweets['handle'] = handle

        # Extract referenced_tweet info from the referenced_tweets column
        all_tweets['referenced_tweet_type'] = None
        all_tweets['referenced_tweet_id'] = None
        if 'referenced_tweets' in all_tweets.columns:
            for index, row in all_tweets.iterrows():
                referenced_tweets = row['referenced_tweets']
                if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
                    referenced_tweet = referenced_tweets[0]
                    all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet['type']
                    all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet['id']

        # Check whether the tweet text contains any of the keywords
        if 'text' in all_tweets.columns:
            all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
                                              .str.join(',')
                                              .replace('', 'none'))

        # Save two versions of the dataset: one with all fields ("LONG") and one without the dict fields
        csv_path = f"data/tweets/{handle}{suffix}.csv"
        csv_path2 = f"data/tweets/{handle}{suffix}-LONG.csv"
        all_tweets.to_csv(csv_path2)
        all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"], axis=1)
        all_tweets.to_csv(csv_path)

        time.sleep(1)  # sleep 1 second to stay under the API limit

# Merge CSV files
# (building one dataframe with all senators' tweets directly would also have been possible,
# but the per-senator files proved more useful)
path_to_tweetdfs = wd + td
os.chdir(path_to_tweetdfs)
tweetfiles = glob.glob("*.csv")
print(tweetfiles)

# save merged csv as two files
df_all_senators = pd.DataFrame()
df_all_senators_long = pd.DataFrame()
for file in tweetfiles:
    if "LONG" in file:
        df = pd.read_csv(file)
        df_all_senators_long = pd.concat([df, df_all_senators_long])
    else:
        df = pd.read_csv(file)
        df_all_senators = pd.concat([df, df_all_senators])

# change back to the working directory so the merged files land in data/tweets/
os.chdir(wd)

csv_path = td + "ALL-SENATORS.csv"
csv_path2 = td + "ALL-SENATORS-LONG.csv"
df_all_senators.to_csv(csv_path)
df_all_senators_long.to_csv(csv_path2)