comments and reorders

Michael Beck 2023-06-07 20:36:35 +02:00
parent 0bc42fa862
commit 81db25a8b8


@@ -1,10 +1,10 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""
+'''
 Created on Tue Jun 6 11:40:07 2023
 @author: michael
-"""
+'''
 import os
 import tweepy
@@ -15,48 +15,48 @@ import time
 ## Setup directories
 # WD Michael
-wd = "/home/michael/Documents/PS/Data/collectTweets/"
+wd = '/home/michael/Documents/PS/Data/collectTweets/'
 # WD Server
-# wd = "/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection"
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection'
 # WD Josie
-# wd = "/home/michael/Documents/PS/Data/"
+# wd = '/home/michael/Documents/PS/Data/'
 # WD Sam
-# wd = "/home/michael/Documents/PS/Data/"
+# wd = '/home/michael/Documents/PS/Data/'
 # Tweet-datafile directory
-td = "data/tweets/"
+td = 'data/tweets/'
 os.chdir(wd)
 ## Setup Api-connection
-bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc"
+bearer_token = 'AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc'
 client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)
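Note: the bearer token is committed here in plain text. A minimal sketch of reading it from the environment instead; the variable name BEARER_TOKEN is an assumption, not part of this commit:

    import os
    import tweepy

    # hypothetical env variable; export BEARER_TOKEN=... before running
    bearer_token = os.environ['BEARER_TOKEN']
    client = tweepy.Client(bearer_token, return_type=dict, wait_on_rate_limit=True)
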
-# Define time period of interest
+# Define time periods of interest
 # Define time periods of interest
 time_slices = [
     {
-        "start_time": "2020-01-01T00:00:00Z",
-        "end_time": "2020-06-01T00:00:00Z",
-        "suffix": "-slice1"
+        'start_time': '2020-01-01T00:00:00Z',
+        'end_time': '2020-06-01T00:00:00Z',
+        'suffix': '-slice1'
     },
     {
-        "start_time": "2020-06-01T00:00:01Z",
-        "end_time": "2021-01-01T00:00:00Z",
-        "suffix": "-slice2"
+        'start_time': '2020-06-01T00:00:01Z',
+        'end_time': '2021-01-01T00:00:00Z',
+        'suffix': '-slice2'
     },
     {
-        "start_time": "2021-01-01T00:00:01Z",
-        "end_time": "2021-06-01T00:00:00Z",
-        "suffix": "-slice3"
+        'start_time': '2021-01-01T00:00:01Z',
+        'end_time': '2021-06-01T00:00:00Z',
+        'suffix': '-slice3'
     },
     {
-        "start_time": "2021-06-01T00:00:01Z",
-        "end_time": "2023-01-03T00:00:00Z",
-        "suffix": "-slice4"
+        'start_time': '2021-06-01T00:00:01Z',
+        'end_time': '2023-01-03T00:00:00Z',
+        'suffix': '-slice4'
     }
 ]
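The slice boundaries are meant to abut, each start_time falling one second after the previous end_time. A quick sketch that checks this, assuming the time_slices list above:

    from datetime import datetime, timedelta

    fmt = '%Y-%m-%dT%H:%M:%SZ'
    for prev, nxt in zip(time_slices, time_slices[1:]):
        gap = datetime.strptime(nxt['start_time'], fmt) - datetime.strptime(prev['end_time'], fmt)
        assert gap == timedelta(seconds=1), f"unexpected gap after {prev['suffix']}"
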
@@ -66,95 +66,87 @@ time_slices = [
 keywords = []
 # Read the keywords from a file
-with open("data/keywords.txt", "r") as file:
+with open('data/keywords.txt', 'r') as file:
     lines = file.readlines()
     for line in lines:
         keyword = line.strip() # Remove the newline character
         keywords.append(keyword)
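One thing this loop does not guard against: a blank line in keywords.txt becomes an empty keyword, and an empty alternative in the '|'.join(keywords) pattern used later matches at every position of every tweet, polluting the contains_keyword column. A minimal sketch that skips blanks:

    with open('data/keywords.txt', 'r') as file:
        # strip newlines and drop empty lines in one pass
        keywords = [line.strip() for line in file if line.strip()]
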
 tweet_fields = [
-    "id",
-    "text",
-    "attachments",
-    "author_id",
-    "context_annotations",
-    "conversation_id",
-    "created_at",
-    "entities",
-    "geo",
-    "lang",
-    "possibly_sensitive",
-    "public_metrics",
-    "referenced_tweets",
-    "reply_settings",
-    "source",
-    "withheld",
+    'id',
+    'text',
+    'attachments',
+    'author_id',
+    'context_annotations',
+    'conversation_id',
+    'created_at',
+    'entities',
+    'geo',
+    'lang',
+    'possibly_sensitive',
+    'public_metrics',
+    'referenced_tweets',
+    'reply_settings',
+    'source',
+    'withheld',
 ]
 # Get accounts & alt-accounts from Senators-Datafile
-accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
-alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
+accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
+alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
 print(accounts)
 print(alt_accounts)
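If the alt_handle column has empty cells (senators without an alt account, which is an assumption about the CSV rather than something this commit addresses), pandas fills them with NaN floats, which would break later string operations. A sketch that keeps only real handles:

    alt_accounts = (pd.read_csv('data/senators-raw.csv')['alt_handle']
                    .dropna()    # drop NaN placeholders for missing alt accounts
                    .tolist())
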
 # Iterate over each Twitter account
 for handle in accounts:
     for slice_data in time_slices:
-        start_time = slice_data["start_time"]
-        end_time = slice_data["end_time"]
-        suffix = slice_data["suffix"]
-        query = "from:" + handle + " -is:retweet"
+        # define slice data variables from time_slices
+        start_time = slice_data['start_time']
+        end_time = slice_data['end_time']
+        suffix = slice_data['suffix']
+        # define tweepy query with twitter handle of current sen
+        query = f'from:{handle} -is:retweet'
+        # create empty tweetlist that will be filled with tweets of current sen
         tweetlist = []
-        # Fetch tweets using Twitter API pagination
-        try:
-            for tweet in tweepy.Paginator(client.search_all_tweets,
-                                          query=query,
-                                          tweet_fields=tweet_fields,
-                                          start_time=start_time,
-                                          end_time=end_time,
-                                          max_results=100).flatten(50):
-                tweetlist.append(tweet)
-            msg = f"trying to fetch tweets for {handle}{suffix} fetched"
-            print(msg)
-        except tweepy.error.TweepError as ex:
-            timestamp = datetime.now().timestamp()
-            msg = f"{timestamp} - raised exception {handle}{suffix}: " + str(ex) + " - sleeping..."
-            print(msg)
-            time.sleep(1)
-            try:
-                for tweet in tweepy.Paginator(client.search_all_tweets,
-                                              query=query,
-                                              tweet_fields=tweet_fields,
-                                              start_time=start_time,
-                                              end_time=end_time,
-                                              max_results=100).flatten(50):
-                    tweetlist.append(tweet)
-                msg = f"2nd try: tweets for {handle}{suffix} successfully fetched"
-                print(msg)
-            except tweepy.error.TweepError as ex:
-                timestamp = datetime.now().timestamp()
-                msg = f"{timestamp} - raised exception AGAIN {handle}{suffix}: " + str(ex) + " - sleeping..."
-                print(msg)
-                time.sleep(1)
-        all_tweets = pd.DataFrame(tweetlist)
+        # statusmsg
+        msg = f'trying to fetch tweets for {handle}{suffix}'
+        print(msg)
+        # Fetch tweets using tweepy Twitter API v2 pagination
+        tweets = tweepy.Paginator(client.search_all_tweets,
+                                  query=query,
+                                  tweet_fields=tweet_fields,
+                                  start_time=start_time,
+                                  end_time=end_time,
+                                  max_results=20).flatten(20)
+        # for each tweet returned...
+        for tweet in tweets:
+            # ... add that tweet to tweetlist
+            tweetlist.append(tweet)
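The retry block this commit removes referenced tweepy.error.TweepError, which is the tweepy v3 name; in tweepy 4.x the exceptions live in tweepy.errors. A minimal sketch of guarding the fetch under that assumption, not part of this commit:

    try:
        for tweet in tweepy.Paginator(client.search_all_tweets,
                                      query=query,
                                      tweet_fields=tweet_fields,
                                      start_time=start_time,
                                      end_time=end_time,
                                      max_results=20).flatten(20):
            tweetlist.append(tweet)
    except tweepy.errors.TweepyException as ex:
        # base class for all tweepy v4 errors; back off briefly before moving on
        print(f'exception for {handle}{suffix}: {ex} - sleeping...')
        time.sleep(1)
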
-        # Check if no tweets fetched for the current time slice
+        # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration
         if len(tweetlist) == 0:
-            msg = f"return empty in {handle}{suffix} - from {start_time} to {end_time}"
+            msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'
             print(msg)
             continue
-        all_tweets['handle'] = handle
-        # Extract referenced_tweet info from column
-        all_tweets['referenced_tweet_type'] = None
-        all_tweets['referenced_tweet_id'] = None
-        if 'referenced_tweets' in all_tweets.columns:
-            for index, row in all_tweets.iterrows():
+        # convert to dataframe
+        tweet_df = pd.DataFrame(tweetlist)
+        # add handle column as api only provides user-ids
+        tweet_df['handle'] = handle
+        ## Extract referenced_tweet info from column
+        tweet_df['referenced_tweet_type'] = None
+        tweet_df['referenced_tweet_id'] = None
+        # if cond. because in some cases column doesn't exist
+        if 'referenced_tweets' in tweet_df.columns:
+            for index, row in tweet_df.iterrows():
                 referenced_tweets = row['referenced_tweets']
                 if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
@@ -162,28 +154,36 @@ for handle in accounts:
                     referenced_tweet_type = referenced_tweet['type']
                     referenced_tweet_id = referenced_tweet['id']
-                    all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
-                    all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id
+                    tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type
+                    tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id
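The iterrows loop walks every row in Python; an equivalent sketch using a small helper applied column-wise, assuming referenced_tweets holds lists of {'type': ..., 'id': ...} dicts and that only the first entry matters (first_ref is a hypothetical helper, not part of this commit):

    def first_ref(refs):
        # the API returns a list of reference dicts, or NaN when absent
        if isinstance(refs, list) and len(refs) > 0:
            return refs[0]['type'], refs[0]['id']
        return None, None

    if 'referenced_tweets' in tweet_df.columns:
        types_ids = tweet_df['referenced_tweets'].apply(first_ref)
        tweet_df['referenced_tweet_type'] = types_ids.str[0]
        tweet_df['referenced_tweet_id'] = types_ids.str[1]
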
-        # Check if tweet contains keyword
-        if 'text' in all_tweets.columns:
-            all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
-                                              .str.join(',')
-                                              .replace('', 'none'))
+        ## Check if tweet-text contains keyword
+        # if cond. because in some cases column doesn't exist
+        if 'text' in tweet_df.columns:
+            tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))
+                                            .str.join(',')
+                                            .replace('', 'none'))
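str.findall interprets the joined keywords as one regular expression, so any keyword containing a regex metacharacter (., +, ?, parentheses) would change the match semantics. A sketch that escapes each keyword first:

    import re

    # treat every keyword as a literal string, not a regex fragment
    pattern = '|'.join(map(re.escape, keywords))
    tweet_df['contains_keyword'] = (tweet_df['text'].str.findall(pattern)
                                    .str.join(',')
                                    .replace('', 'none'))
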
-        # Save two versions of the dataset, one with all fields and one without dict fields
-        csv_path = f"data/tweets/{handle}{suffix}.csv"
-        csv_path2 = f"data/tweets/{handle}{suffix}-LONG.csv"
-        all_tweets.to_csv(csv_path2)
-        all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"], axis=1)
-        all_tweets.to_csv(csv_path)
-        time.sleep(1) # sleep 1 second to not get over api limit
+        ## Save two versions of the dataset, one with all fields and one without dict fields
+        # define filepaths
+        csv_path = f'data/tweets/{handle}{suffix}.csv'
+        csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'
+        # save LONG csv
+        tweet_df.to_csv(csv_path2)
+        # Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files
+        # if cond. because in some cases column doesn't exist
+        if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):
+            tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)
+        # save short csv
+        tweet_df.to_csv(csv_path)
+        # sleep 1 second to not get over 1sec api limit
+        time.sleep(1)
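The all(...) guard added here drops the three dict columns only when every one of them is present; if just one is missing, none are removed and the short CSV keeps the others. pandas' drop can express the per-column behaviour directly, a sketch:

    tweet_df = tweet_df.drop(columns=['context_annotations', 'entities', 'referenced_tweets'],
                             errors='ignore')  # silently skips any column that is absent
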
 # Merge CSV-Files
 # (it would also have been a possibility to build a dataframe with all senators' tweets but i found the other way around more useful)
 path_to_tweetdfs = wd + td
 os.chdir(path_to_tweetdfs)
-tweetfiles = glob.glob('*.{}'.format("csv"))
+tweetfiles = glob.glob('*.{}'.format('csv'))
 print(tweetfiles)
@@ -191,14 +191,14 @@ print(tweetfiles)
 df_all_senators = pd.DataFrame()
 df_all_senators_long = pd.DataFrame()
 for file in tweetfiles:
-    if "LONG" in file:
+    if 'LONG' in file:
         df = pd.read_csv(file)
         df_all_senators_long = pd.concat([df, df_all_senators_long])
     else:
         df = pd.read_csv(file)
         df_all_senators = pd.concat([df, df_all_senators])
-csv_path = td + "ALL-SENATORS.csv"
-csv_path2 = td + "ALL-SENATORS-LONG-LONG.csv"
+csv_path = td + 'ALL-SENATORS.csv'
+csv_path2 = td + 'ALL-SENATORS-LONG-LONG.csv'
 df_all_senators.to_csv(csv_path)
 df_all_senators_long.to_csv(csv_path2)
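Two things worth noting about this merge step: to_csv writes the row index by default, so each per-senator file re-read here contributes an extra unnamed index column, and after os.chdir(path_to_tweetdfs) the td prefix in csv_path appears to resolve to data/tweets/data/tweets/ relative to the new working directory, which would fail unless that subdirectory exists. A sketch under those observations, writing next to the per-senator files:

    df_all_senators = pd.concat(
        (pd.read_csv(f, index_col=0) for f in tweetfiles if 'LONG' not in f),
        ignore_index=True)
    # index=False avoids accumulating synthetic index columns on each round trip
    df_all_senators.to_csv('ALL-SENATORS.csv', index=False)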