comments and reorders
This commit is contained in:
parent 0bc42fa862
commit 81db25a8b8
198 collect.py
@@ -1,10 +1,10 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""
+'''
 Created on Tue Jun 6 11:40:07 2023

 @author: michael
-"""
+'''

 import os
 import tweepy
@@ -15,48 +15,48 @@ import time

 ## Setup directories
 # WD Michael
-wd = "/home/michael/Documents/PS/Data/collectTweets/"
+wd = '/home/michael/Documents/PS/Data/collectTweets/'

 # WD Server
-# wd = "/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection"
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection'

 # WD Josie
-# wd = "/home/michael/Documents/PS/Data/"
+# wd = '/home/michael/Documents/PS/Data/'

 # WD Sam
-# wd = "/home/michael/Documents/PS/Data/"
+# wd = '/home/michael/Documents/PS/Data/'

 # Tweet-datafile directory
-td = "data/tweets/"
+td = 'data/tweets/'

 os.chdir(wd)

 ## Setup Api-connection
-bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc"
+bearer_token = 'AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc'
 client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)

 # Define time period of interest
 # Define time periods of interest
 time_slices = [
     {
-        "start_time": "2020-01-01T00:00:00Z",
-        "end_time": "2020-06-01T00:00:00Z",
-        "suffix": "-slice1"
+        'start_time': '2020-01-01T00:00:00Z',
+        'end_time': '2020-06-01T00:00:00Z',
+        'suffix': '-slice1'
     },
     {
-        "start_time": "2020-06-01T00:00:01Z",
-        "end_time": "2021-01-01T00:00:00Z",
-        "suffix": "-slice2"
+        'start_time': '2020-06-01T00:00:01Z',
+        'end_time': '2021-01-01T00:00:00Z',
+        'suffix': '-slice2'
     },
     {
-        "start_time": "2021-01-01T00:00:01Z",
-        "end_time": "2021-06-01T00:00:00Z",
-        "suffix": "-slice3"
+        'start_time': '2021-01-01T00:00:01Z',
+        'end_time': '2021-06-01T00:00:00Z',
+        'suffix': '-slice3'
     },
     {
-        "start_time": "2021-06-01T00:00:01Z",
-        "end_time": "2023-01-03T00:00:00Z",
-        "suffix": "-slice4"
+        'start_time': '2021-06-01T00:00:01Z',
+        'end_time': '2023-01-03T00:00:00Z',
+        'suffix': '-slice4'
     }
 ]

@@ -66,95 +66,87 @@ time_slices = [
 keywords = []

 # Read the keywords from a file
-with open("data/keywords.txt", "r") as file:
+with open('data/keywords.txt', 'r') as file:
     lines = file.readlines()
     for line in lines:
         keyword = line.strip() # Remove the newline character
         keywords.append(keyword)

 tweet_fields = [
-    "id",
-    "text",
-    "attachments",
-    "author_id",
-    "context_annotations",
-    "conversation_id",
-    "created_at",
-    "entities",
-    "geo",
-    "lang",
-    "possibly_sensitive",
-    "public_metrics",
-    "referenced_tweets",
-    "reply_settings",
-    "source",
-    "withheld",
+    'id',
+    'text',
+    'attachments',
+    'author_id',
+    'context_annotations',
+    'conversation_id',
+    'created_at',
+    'entities',
+    'geo',
+    'lang',
+    'possibly_sensitive',
+    'public_metrics',
+    'referenced_tweets',
+    'reply_settings',
+    'source',
+    'withheld',
 ]

 # Get accounts & alt-accounts from Senators-Datafile
-accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
-alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
+accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
+alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
 print(accounts)
 print(alt_accounts)

 # Iterate over each Twitter account
 for handle in accounts:
     for slice_data in time_slices:
-        start_time = slice_data["start_time"]
-        end_time = slice_data["end_time"]
-        suffix = slice_data["suffix"]
+        # define slice data variables from time_slices
+        start_time = slice_data['start_time']
+        end_time = slice_data['end_time']
+        suffix = slice_data['suffix']

-        query = "from:" + handle + " -is:retweet"
+        # define tweepy query with twitter handle of current sen
+        query = f'from:{handle} -is:retweet'

+        # create empty tweetlist that will be filled with tweets of current sen
         tweetlist = []
-        # Fetch tweets using Twitter API pagination
-        try:
-            for tweet in tweepy.Paginator(client.search_all_tweets,
-                                          query=query,
-                                          tweet_fields=tweet_fields,
-                                          start_time=start_time,
-                                          end_time=end_time,
-                                          max_results=100).flatten(50):
-                tweetlist.append(tweet)
-            msg = f"trying to fetch tweets for {handle}{suffix} fetched"
-            print(msg)
-        except tweepy.error.TweepError as ex:
-            timestamp = datetime.now().timestamp()
-            msg = f"{timestamp} - raised exception {handle}{suffix}: " + str(ex) + " - sleeping..."
-            print(msg)
-            time.sleep(1)
-            try:
-                for tweet in tweepy.Paginator(client.search_all_tweets,
-                                              query=query,
-                                              tweet_fields=tweet_fields,
-                                              start_time=start_time,
-                                              end_time=end_time,
-                                              max_results=100).flatten(50):
-                    tweetlist.append(tweet)
-                msg = f"2nd try: tweets for {handle}{suffix} successfully fetched"
-                print(msg)
-            except tweepy.error.TweepError as ex:
-                timestamp = datetime.now().timestamp()
-                msg = f"{timestamp} - raised exception AGAIN {handle}{suffix}: " + str(ex) + " - sleeping..."
-                print(msg)
-                time.sleep(1)

-        all_tweets = pd.DataFrame(tweetlist)
+        # statusmsg
+        msg = f'trying to fetch tweets for {handle}{suffix}'
+        print(msg)

-        # Check if no tweets fetched for the current time slice
+        # Fetch tweets using tweepy Twitter API v2 pagination
+        tweets = tweepy.Paginator(client.search_all_tweets,
+                                  query=query,
+                                  tweet_fields=tweet_fields,
+                                  start_time=start_time,
+                                  end_time=end_time,
+                                  max_results=20).flatten(20)
+
+        # for each tweet returned...
+        for tweet in tweets:
+            # ... add that tweet to tweetlist
+            tweetlist.append(tweet)
+
+        # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration
         if len(tweetlist) == 0:
-            msg = f"return empty in {handle}{suffix} - from {start_time} to {end_time}"
+            msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'
             print(msg)
             continue

-        all_tweets['handle'] = handle
+        # convert to dataframe
+        tweet_df = pd.DataFrame(tweetlist)

-        # Extract referenced_tweet info from column
-        all_tweets['referenced_tweet_type'] = None
-        all_tweets['referenced_tweet_id'] = None
-        if 'referenced_tweets' in all_tweets.columns:
-            for index, row in all_tweets.iterrows():
+        # add handle column as api only provides user-ids
+        tweet_df['handle'] = handle
+
+        ## Extract referenced_tweet info from column
+        tweet_df['referenced_tweet_type'] = None
+        tweet_df['referenced_tweet_id'] = None
+
+        # if cond. because in some cases column doesn't exist
+        if 'referenced_tweets' in tweet_df.columns:
+            for index, row in tweet_df.iterrows():
                 referenced_tweets = row['referenced_tweets']

                 if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
@@ -162,28 +154,36 @@ for handle in accounts:
                     referenced_tweet_type = referenced_tweet['type']
                     referenced_tweet_id = referenced_tweet['id']

-                    all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
-                    all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id
+                    tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type
+                    tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id

-        # Check if tweet contains keyword
-        if 'text' in all_tweets.columns:
-            all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
+        ## Check if tweet-text contains keyword
+        # if cond. because in some cases column doesn't exist
+        if 'text' in tweet_df.columns:
+            tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))
                                             .str.join(',')
                                             .replace('', 'none'))

-        # Save two versions of the dataset, one with all fields and one without dict fields
-        csv_path = f"data/tweets/{handle}{suffix}.csv"
-        csv_path2 = f"data/tweets/{handle}{suffix}-LONG.csv"
-        all_tweets.to_csv(csv_path2)
-        all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"], axis=1)
-        all_tweets.to_csv(csv_path)
-        time.sleep(1) # sleep 1 second to not get over api limit
+        ## Save two versions of the dataset, one with all fields and one without dict fields
+        # define filepaths
+        csv_path = f'data/tweets/{handle}{suffix}.csv'
+        csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'
+        # save LONG csv
+        tweet_df.to_csv(csv_path2)
+        # Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files
+        # if cond. because in some cases column doesn't exist
+        if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):
+            tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)
+        # save short csv
+        tweet_df.to_csv(csv_path)
+        # sleep 1 second to not get over 1sec api limit
+        time.sleep(1)

 # Merge CSV-Files
 # (it would also have been a possibility to build a dataframe with all senators' tweets but i found the other way around more useful)
 path_to_tweetdfs = wd + td
 os.chdir(path_to_tweetdfs)
-tweetfiles = glob.glob('*.{}'.format("csv"))
+tweetfiles = glob.glob('*.{}'.format('csv'))

 print(tweetfiles)

@@ -191,14 +191,14 @@ print(tweetfiles)
 df_all_senators = pd.DataFrame()
 df_all_senators_long = pd.DataFrame()
 for file in tweetfiles:
-    if "LONG" in file:
+    if 'LONG' in file:
         df = pd.read_csv(file)
         df_all_senators_long = pd.concat([df, df_all_senators_long])
     else:
         df = pd.read_csv(file)
         df_all_senators = pd.concat([df, df_all_senators])
-csv_path = td + "ALL-SENATORS.csv"
-csv_path2 = td + "ALL-SENATORS-LONG-LONG.csv"
+csv_path = td + 'ALL-SENATORS.csv'
+csv_path2 = td + 'ALL-SENATORS-LONG-LONG.csv'
 df_all_senators.to_csv(csv_path)
 df_all_senators_long.to_csv(csv_path2)
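Note on the error handling this commit removes: the old retry block caught tweepy.error.TweepError, which belongs to the tweepy 3.x API, while the tweepy.Client / Paginator calls used here come from tweepy 4.x, where the base exception is tweepy.errors.TweepyException. A minimal sketch (not part of the commit) of how a retry could be reinstated around the new pagination step, assuming the client, query, tweet_fields, start_time and end_time defined above; the helper name fetch_slice and the retries parameter are illustrative only:

import time
import tweepy

def fetch_slice(client, query, tweet_fields, start_time, end_time, retries=2):
    # Fetch up to 20 tweets for one time slice, retrying once after an API error.
    for attempt in range(retries):
        try:
            pages = tweepy.Paginator(client.search_all_tweets,
                                     query=query,
                                     tweet_fields=tweet_fields,
                                     start_time=start_time,
                                     end_time=end_time,
                                     max_results=20)
            return list(pages.flatten(20))
        except tweepy.errors.TweepyException as ex:  # base exception class in tweepy >= 4.0
            print(f"attempt {attempt + 1} failed: {ex} - sleeping...")
            time.sleep(1)
    return []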