#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 6 11:40:07 2023

@author: michael
"""

import os
import glob

import pandas as pd
import tweepy

## Setup directories

# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"

# WD Server
# wd = "/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection"

# WD Josie
# wd = "/home/michael/Documents/PS/Data/"

# WD Sam
# wd = "/home/michael/Documents/PS/Data/"

# Tweet-datafile directory
td = "data/tweets/"

os.chdir(wd)

## Setup API connection
bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc"
client = tweepy.Client(bearer_token, return_type=dict, wait_on_rate_limit=True)
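
# Note: client.search_all_tweets (used below) needs full-archive search
# access (the Academic Research track), and wait_on_rate_limit=True makes
# tweepy sleep through rate limits instead of raising. Reading the bearer
# token from an environment variable, e.g.
# os.environ.get("TWITTER_BEARER_TOKEN"), would be safer than hard-coding it.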

# Define time period of interest
start_time = '2020-01-01T00:00:00Z'
end_time = '2023-01-03T00:00:00Z'

# Gather keywords, cf. @chenTrackingSocialMedia2020
# (keywords from line 80 ff. follow @lamsalCoronavirusCOVID19Tweets2020)

# Initialize the keywords list
keywords = []

# Read the keywords from a file, one keyword per line
with open("data/keywords.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip()  # Remove the newline character
        keywords.append(keyword)
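
# A more compact equivalent sketch (same behaviour, assuming one keyword
# per line in the file):
# with open("data/keywords.txt") as file:
#     keywords = [line.strip() for line in file]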

tweet_fields = [
    "id",
    "text",
    "attachments",
    "author_id",
    "context_annotations",
    "conversation_id",
    "created_at",
    "entities",
    "geo",
    "lang",
    "possibly_sensitive",
    "public_metrics",
    "referenced_tweets",
    "reply_settings",
    "source",
    "withheld",
]
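
# These field names map to the v2 API's tweet.fields request parameter; the
# dict-valued ones (entities, public_metrics, referenced_tweets, ...) come
# back as nested objects and are unpacked or dropped further below.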

# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
print(accounts)
print(alt_accounts)
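
# Reading the CSV once would avoid the duplicate disk read, e.g.:
# senators = pd.read_csv("data/senators-raw.csv")
# accounts = senators["twitter_handle"].tolist()
# alt_accounts = senators["alt_handle"].tolist()
# Note: alt_accounts is only printed here; it is not queried below.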

# Fetch each senator's tweets in the time period and save them per senator
for handle in accounts:
    query = "from:" + handle + " -is:retweet"

    tweetlist = []
    # Paginate through the full-archive search; max_results=100 is the page
    # size, flatten(50) caps the total at 50 tweets per senator
    for tweet in tweepy.Paginator(client.search_all_tweets,
                                  query=query,
                                  tweet_fields=tweet_fields,
                                  start_time=start_time,
                                  end_time=end_time,
                                  max_results=100).flatten(50):
        tweetlist.append(tweet)

    all_tweets = pd.DataFrame(tweetlist)
    # Unpack the public_metrics dicts into separate columns
    all_tweets = all_tweets.join(pd.DataFrame(all_tweets.pop("public_metrics").tolist()))

    # Record which senator the tweets belong to
    all_tweets['handle'] = handle

    ## Extract referenced_tweet info from column
    # Create empty columns to store the extracted information
    all_tweets['referenced_tweet_type'] = None
    all_tweets['referenced_tweet_id'] = None

    # Iterate over each row
    for index, row in all_tweets.iterrows():
        referenced_tweets = row['referenced_tweets']

        # Check that referenced_tweets is a non-empty list
        if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
            referenced_tweet = referenced_tweets[0]
            referenced_tweet_type = referenced_tweet['type']
            referenced_tweet_id = referenced_tweet['id']

            # Assign the extracted values to the new columns
            all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
            all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id
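
    # A loop-free equivalent sketch (assuming the column holds lists of
    # dicts or NaN):
    # refs = all_tweets['referenced_tweets'].apply(
    #     lambda r: (r[0]['type'], r[0]['id'])
    #     if isinstance(r, list) and r else (None, None))
    # all_tweets['referenced_tweet_type'], all_tweets['referenced_tweet_id'] = zip(*refs)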

    ## Check if tweet contains keyword
    # Create a new column listing the matched keywords (or 'none')
    all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
                                      .str.join(',')
                                      .replace('', 'none'))
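
    # The pattern above treats each keyword as a regex fragment and matches
    # case-sensitively; a sketch of an escaped, case-insensitive variant:
    # import re
    # pattern = '|'.join(re.escape(k) for k in keywords)
    # all_tweets['contains_keyword'] = (all_tweets['text']
    #                                   .str.findall(pattern, flags=re.IGNORECASE)
    #                                   .str.join(',')
    #                                   .replace('', 'none'))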

    ## Save two versions of the dataset: one with all fields, one without dict fields
    csv_path = td + handle + ".csv"
    csv_path2 = td + handle + "-LONG.csv"
    all_tweets.to_csv(csv_path2)
    all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"], axis=1)
    all_tweets.to_csv(csv_path)
    print("Fetched tweets for:")
    print(handle)

# Merge CSV files
# (building one dataframe with all senators' tweets directly would also have
# been possible, but the per-senator files proved more useful)
path_to_tweetdfs = wd + td
os.chdir(path_to_tweetdfs)
tweetfiles = glob.glob("*.csv")

print(tweetfiles)

# Save the merged tweets as two files, a short and a long version
df_all_senators = pd.DataFrame()
df_all_senators_long = pd.DataFrame()
for file in tweetfiles:
    if file.startswith("ALL-SENATORS"):
        continue  # skip previously merged output files on re-runs
    if "LONG" in file:
        df = pd.read_csv(file)
        df_all_senators_long = pd.concat([df, df_all_senators_long])
    else:
        df = pd.read_csv(file)
        df_all_senators = pd.concat([df, df_all_senators])

# The working directory is data/tweets/ at this point, so the merged files
# are saved by name only (prefixing td again would point at a non-existent
# data/tweets/data/tweets/ subdirectory)
csv_path = "ALL-SENATORS.csv"
csv_path2 = "ALL-SENATORS-LONG.csv"
df_all_senators.to_csv(csv_path)
df_all_senators_long.to_csv(csv_path2)

os.chdir(wd)