#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 6 11:40:07 2023

@author: michael

Collect tweets of US senators via the Twitter API v2 full-archive search,
write one CSV per senator (plus a "-LONG" variant that keeps the raw dict
columns), then merge every per-senator file into two combined DataFrames
(`df_all_senators`, `df_all_senators_long`) for the final save step.
"""

import os
import tweepy
import pandas as pd
import numpy as np
import glob

## Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = "/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection"
# WD Josie
# wd = "/home/michael/Documents/PS/Data/"
# WD Sam
# wd = "/home/michael/Documents/PS/Data/"

# Tweet-datafile directory (relative to wd)
td = "data/tweets/"

os.chdir(wd)

## Setup Api-connection
# SECURITY: a live bearer token was committed to this file. Read it from the
# environment instead; the literal is kept only as a fallback for existing
# setups and should be revoked/rotated.
bearer_token = os.environ.get(
    "TWITTER_BEARER_TOKEN",
    "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc",
)
client = tweepy.Client(bearer_token, return_type=dict, wait_on_rate_limit=True)

# Define time period of interest
start_time = '2020-01-01T00:00:00Z'
end_time = '2023-01-03T00:00:00Z'

# gather keywords @chenTrackingSocialMedia2020
# line80 ff: lamsalCoronavirusCOVID19Tweets2020
# Read the keywords from a file, one keyword per line (newline stripped).
with open("data/keywords.txt", "r") as file:
    keywords = [line.strip() for line in file]

# Tweet object fields requested from the API.
tweet_fields = [
    "id",
    "text",
    "attachments",
    "author_id",
    "context_annotations",
    "conversation_id",
    "created_at",
    "entities",
    "geo",
    "lang",
    "possibly_sensitive",
    "public_metrics",
    "referenced_tweets",
    "reply_settings",
    "source",
    "withheld",
]

# Get accounts & alt-accounts from Senators-Datafile (read the CSV once).
senators_df = pd.read_csv("data/senators-raw.csv")
accounts = senators_df["twitter_handle"].tolist()
alt_accounts = senators_df["alt_handle"].tolist()
print(accounts)
print(alt_accounts)

for handle in accounts:
    # Exclude retweets; only original tweets/replies/quotes by the senator.
    query = "from:" + handle + " -is:retweet"
    tweetlist = []
    for tweet in tweepy.Paginator(
        client.search_all_tweets,
        query=query,
        tweet_fields=tweet_fields,
        start_time=start_time,
        end_time=end_time,
        max_results=100,
    ).flatten(50):
        tweetlist.append(tweet)

    # Robustness: an account with no tweets in the window yields an empty
    # DataFrame without a "public_metrics" column, which would raise KeyError
    # below — skip it instead of crashing the whole collection run.
    if not tweetlist:
        print("No tweets found for:")
        print(handle)
        continue

    all_tweets = pd.DataFrame(tweetlist)
    # Flatten the public_metrics dict into its own top-level columns.
    all_tweets = all_tweets.join(pd.DataFrame(all_tweets.pop("public_metrics").tolist()))
    all_tweets['handle'] = handle

    ## Extract referenced_tweet info from column
    # Create empty columns to store the extracted information
    all_tweets['referenced_tweet_type'] = None
    all_tweets['referenced_tweet_id'] = None

    # Robustness: the column only exists if at least one tweet referenced
    # another tweet; guard before iterating.
    if 'referenced_tweets' in all_tweets.columns:
        for index, row in all_tweets.iterrows():
            referenced_tweets = row['referenced_tweets']
            # Only the first referenced tweet is extracted (as in the original
            # design); rows without references keep None in both columns.
            if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
                referenced_tweet = referenced_tweets[0]
                all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet['type']
                all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet['id']

    ## Check if tweet contains keyword
    # NOTE: keywords are joined into a regex alternation WITHOUT escaping, so
    # any regex metacharacters in keywords.txt are interpreted as regex.
    # Matches are comma-joined; rows with no match get the literal 'none'.
    all_tweets['contains_keyword'] = (
        all_tweets['text']
        .str.findall('|'.join(keywords))
        .str.join(',')
        .replace('', 'none')
    )

    ## Save two versions of the dataset, one with all fields, one without dict fields
    csv_path = td + handle + ".csv"
    csv_path2 = td + handle + "-LONG.csv"
    all_tweets.to_csv(csv_path2)
    all_tweets = all_tweets.drop(
        ["context_annotations", "entities", "referenced_tweets"],
        axis=1,
        errors="ignore",  # columns may be absent if the API returned none
    )
    all_tweets.to_csv(csv_path)
    print("Fetched tweets for:")
    print(handle)

# Merge CSV-Files
# (it would also have been a possibility to build a dataframe with all
# senators' tweets but i found the other way around more useful)
path_to_tweetdfs = wd + td
os.chdir(path_to_tweetdfs)
tweetfiles = glob.glob('*.{}'.format("csv"))
print(tweetfiles)

# save merged csv as two files
df_all_senators = pd.DataFrame()
df_all_senators_long = pd.DataFrame()
for file in tweetfiles:
    # Skip previously merged outputs so a re-run does not fold the combined
    # files back into themselves.
    if file.startswith("ALL-SENATORS"):
        continue
    df = pd.read_csv(file)
    if "LONG" in file:
        df_all_senators_long = pd.concat([df, df_all_senators_long])
    else:
        df_all_senators = pd.concat([df, df_all_senators])
# Save the merged datasets.
# BUGFIX: at this point the working directory is still wd+td (changed before
# the merge loop), so prepending td again pointed at the non-existent nested
# directory data/tweets/data/tweets/. Return to the project root first so the
# td-relative paths resolve as intended; this also leaves the script in wd,
# matching the original final chdir.
os.chdir(wd)
csv_path = td + "ALL-SENATORS.csv"
# BUGFIX: duplicated suffix "ALL-SENATORS-LONG-LONG.csv" — per-senator files
# use a single "-LONG", so the merged file now does too.
csv_path2 = td + "ALL-SENATORS-LONG.csv"
df_all_senators.to_csv(csv_path)
df_all_senators_long.to_csv(csv_path2)