Restructures; adds TimeSlice, ClearDupes, and more comments.
This commit is contained in:
342
collect.py
@@ -1,70 +1,79 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Jun 6 11:40:07 2023
Created on Thu Jun 8 01:08:21 2023

@author: michael
@author: Michael

The following files are necessary:
    config.py
        Used to configure everything that's needed for this script
        (an illustrative sketch of the expected variables follows this docstring).
    funs/TimeSlice.py
        Function get_Tslices slices the timespan defined in config.py into N
        slices. This is necessary because Twitter may otherwise block requests.
        The script will sleep for 1 second after each slice that was scraped.
    funs/ClearDupes.py
        Function deDupe reads each line of inFile, removes duplicate lines,
        and saves the result to outFile. Generates "keywords.txt".
    data/keywords-raw.txt
        Contains all keywords that are used to detect whether a tweet contains
        information about Covid19.
    data/senators-raw.csv
        Contains the senator dataset converted to CSV. It is used to get the
        account names of all senators' Twitter accounts.

Requirements:
    - snscrape 0.6.2.20230321+
    - pandas 2.0+
The script will first import the needed libraries.
This script uses snscrape version 0.6.2.20230321.dev50+g0d824ab, which is
included in 'snscrape/' as a git repository for better reproducibility. Earlier
versions of snscrape will most likely fail to scrape all tweets because of
rate limits or other errors.
config.py will check whether snscrape is already installed. If not, it will try
to install the included version automatically.

How to use:
    - To run the script, first adjust the config.py file.
    - config.py will check whether snscrape is already installed. If not, it
      will try to install the included version automatically.
    - Run the script.
    - The whole script is expected to run without error messages except for the
      following:
        'Stopping after 20 empty pages': indicates that no more tweets were
        found and that the script skips to the next slice/account.
        'return empty in {twitter-handle}-sliceX - from XX to XX': no tweets
        were found in that specific time range for that specific Twitter
        account.

The script will scrape tweets for all senators in 'data/senators-raw.csv',
sliced into 6 time periods (to bypass Twitter's limitations). It will check
whether a tweet contains any of the keywords in 'data/keywords.txt' and add an
indicator to the datafile. It will then join all slices and create
'ALL-SENATORS.csv', which is the final output.
'''
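# --- Illustrative sketch (assumption, not part of this commit): config.py ---
# The script references ts_beg, ts_end, no_slices, maxTweets and file_alltweets
# without defining them here, so they are assumed to be set in config.py
# (imported below via 'from config import *'). The values shown are placeholders.
#
# ts_beg = '2020-01-01T00:00:00Z'       # start of the overall timespan
# ts_end = '2023-01-03T00:00:00Z'       # end of the overall timespan
# no_slices = 6                         # number of time slices to scrape
# maxTweets = 5000                      # per-slice cap on scraped tweets
# file_alltweets = 'ALL-SENATORS.csv'   # name of the merged output file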
import os
import tweepy
import pandas as pd
import numpy as np
import glob
import time

## Setup directories
# WD Michael
wd = '/home/michael/Documents/PS/Data/collectTweets/'
## Import other files
from config import *
import snscrape.modules.twitter as sntwitter
from funs.TimeSlice import get_Tslices
from funs.ClearDupes import deDupe

# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection'
## Create List of time-period-slices
time_slices = get_Tslices(ts_beg, ts_end, no_slices)
# Print slices
print('Time-period-slices:')
for slice in time_slices:
    print(slice['suffix'] + ': ' + slice['beg_time'] + ' - ' + slice['end_time'])
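# --- Illustrative sketch (assumption, not part of this commit): funs/TimeSlice.py ---
# get_Tslices is assumed to split the timespan [ts_beg, ts_end] into n equal
# slices and to return a list of dicts with the keys 'beg_time', 'end_time' and
# 'suffix', matching how time_slices is used in this script. A minimal version
# could look like this:
#
# from datetime import datetime
#
# def get_Tslices(ts_beg, ts_end, n):
#     beg = datetime.fromisoformat(ts_beg.replace('Z', '+00:00'))
#     end = datetime.fromisoformat(ts_end.replace('Z', '+00:00'))
#     step = (end - beg) / n
#     return [
#         {
#             'beg_time': (beg + i * step).strftime('%Y-%m-%d'),
#             'end_time': (beg + (i + 1) * step).strftime('%Y-%m-%d'),
#             'suffix': f'-slice{i + 1}',
#         }
#         for i in range(n)
#     ]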
# WD Josie
# wd = '/home/michael/Documents/PS/Data/'

# WD Sam
# wd = '/home/michael/Documents/PS/Data/'

# Tweet-datafile directory
td = 'data/tweets/'

os.chdir(wd)

## Setup Api-connection
bearer_token = 'AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc'
client = tweepy.Client(bearer_token, return_type=dict, wait_on_rate_limit=True)

# Define time period of interest
# Define time periods of interest
time_slices = [
    {
        'start_time': '2020-01-01T00:00:00Z',
        'end_time': '2020-06-01T00:00:00Z',
        'suffix': '-slice1'
    },
    {
        'start_time': '2020-06-01T00:00:01Z',
        'end_time': '2021-01-01T00:00:00Z',
        'suffix': '-slice2'
    },
    {
        'start_time': '2021-01-01T00:00:01Z',
        'end_time': '2021-06-01T00:00:00Z',
        'suffix': '-slice3'
    },
    {
        'start_time': '2021-06-01T00:00:01Z',
        'end_time': '2023-01-03T00:00:00Z',
        'suffix': '-slice4'
    }
]

# gather keywords @chenTrackingSocialMedia2020
# line80 ff: lamsalCoronavirusCOVID19Tweets2020
# Initialize the keywords list
## Keywords
keywords = []

# Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
deDupe('data/keywords-raw.txt', 'data/keywords.txt')
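# --- Illustrative sketch (assumption, not part of this commit): funs/ClearDupes.py ---
# deDupe is assumed to drop duplicate lines from inFile while keeping the
# original order and to write the result to outFile, roughly like this:
#
# def deDupe(inFile, outFile):
#     seen = set()
#     with open(inFile, 'r') as fin, open(outFile, 'w') as fout:
#         for line in fin:
#             if line not in seen:
#                 seen.add(line)
#                 fout.write(line)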
# Read the keywords from a file
with open('data/keywords.txt', 'r') as file:
    lines = file.readlines()
@@ -72,42 +81,21 @@ with open('data/keywords.txt', 'r') as file:
        keyword = line.strip()  # Remove the newline character
        keywords.append(keyword)

tweet_fields = [
    'id',
    'text',
    'attachments',
    'author_id',
    'context_annotations',
    'conversation_id',
    'created_at',
    'entities',
    'geo',
    'lang',
    'possibly_sensitive',
    'public_metrics',
    'referenced_tweets',
    'reply_settings',
    'source',
    'withheld',
]

## Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
print(accounts)
print(alt_accounts)

## Scraping
# Iterate over each Twitter account
for handle in accounts:
    # Iterate over each time slice
    for slice_data in time_slices:
        # define slice data variables from time_slices
        start_time = slice_data['start_time']
        end_time = slice_data['end_time']
        ts_beg = slice_data['beg_time']
        ts_end = slice_data['end_time']
        suffix = slice_data['suffix']

        # define tweepy query with twitter handle of current sen
        query = f'from:{handle} -is:retweet'

        # create empty tweetlist that will be filled with tweets of current sen
        tweetlist = []

@@ -115,121 +103,117 @@ for handle in accounts:
        msg = f'trying to fetch tweets for {handle}{suffix}'
        print(msg)

        # Fetch tweets using tweepy Twitter API v2 pagination with retry mechanism
        max_attempts = 3  # maximum number of attempts to fetch tweets for a slice
        attempt = 1

        while attempt <= max_attempts:
            try:
                tweets = tweepy.Paginator(client.search_all_tweets,
                                          query=query,
                                          tweet_fields=tweet_fields,
                                          start_time=start_time,
                                          end_time=end_time,
                                          max_results=20).flatten(20)

                # for each tweet returned...
                for tweet in tweets:
                    # ... add that tweet to tweetlist
                    tweetlist.append(tweet)

                break  # exit the retry loop if tweets are successfully fetched

            except tweepy.TweepError as e:
                # handle rate limit exceeded error
                if e.response.status_code == 429:
                    # get the rate limit reset time from the response headers
                    reset_time = int(e.response.headers['x-rate-limit-reset'])
                    current_time = int(time.time())

                    # calculate the sleep time until the rate limit resets
                    sleep_time = reset_time - current_time + 1  # add an extra second

                    # sleep until the rate limit resets
                    time.sleep(sleep_time)

                    attempt += 1  # increment the attempt counter
                    continue  # retry the API call

                else:
                    # handle other types of Tweepy errors
                    print(f'Error occurred: {e}')
                    break

        # Snscrape query:
        query = f'from:{handle} since:{ts_beg} until:{ts_end}'
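        # For a hypothetical handle 'SenExample' and the first slice this would
        # resolve to something like 'from:SenExample since:2020-01-01 until:2020-06-30'
        # (the exact date format depends on what get_Tslices returns).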
        for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
            if i > maxTweets:
                break
            tweetlist.append([
                tweet.id,
                tweet.user.id,
                tweet.user.username,
                tweet.user.verified,
                tweet.user.created,
                tweet.user.favouritesCount,
                tweet.user.followersCount,
                tweet.user.friendsCount,
                tweet.user.url,
                tweet.rawContent,
                tweet.renderedContent,
                tweet.cashtags,
                tweet.coordinates,
                tweet.hashtags,
                tweet.inReplyToTweetId,
                tweet.inReplyToUser,
                tweet.media,
                tweet.mentionedUsers,
                tweet.links,
                tweet.place,
                tweet.quotedTweet,
                tweet.retweetedTweet,
                tweet.sourceLabel,
                tweet.sourceUrl,
                tweet.url,
                tweet.date,
                tweet.replyCount,
                tweet.retweetCount,
                tweet.likeCount,
                tweet.quoteCount,
                tweet.conversationId,
                tweet.lang,
                tweet.source
            ])
        # Check if no tweets were fetched for the current time slice.
        # If there are no tweets, skip to the next time_slices loop iteration.
        if len(tweetlist) == 0:
            msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'
            msg = f'return empty in {handle}{suffix} - from {ts_beg} to {ts_end}'
            print(msg)
            continue

        # convert to dataframe
        tweet_df = pd.DataFrame(tweetlist)

        # add handle column as the API only provides user-ids
        tweet_df['handle'] = handle

        ## Extract referenced_tweet info from column
        tweet_df['referenced_tweet_type'] = None
        tweet_df['referenced_tweet_id'] = None

        # if cond. because in some cases the column doesn't exist
        if 'referenced_tweets' in tweet_df.columns:
            for index, row in tweet_df.iterrows():
                referenced_tweets = row['referenced_tweets']

                if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
                    referenced_tweet = referenced_tweets[0]
                    referenced_tweet_type = referenced_tweet['type']
                    referenced_tweet_id = referenced_tweet['id']

                    tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type
                    tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id
        tweet_df = pd.DataFrame(tweetlist, columns=[
            'id',
            'user.id',
            'user.username',
            'user.verified',
            'user.created',
            'user.favouritesCount',
            'user.followersCount',
            'user.friendsCount',
            'user.url',
            'rawContent',
            'renderedContent',
            'cashtags',
            'coordinates',
            'hashtags',
            'inReplyToTweetId',
            'inReplyToUser',
            'media',
            'mentionedUsers',
            'links',
            'place',
            'quotedTweet',
            'retweetedTweet',
            'sourceLabel',
            'sourceUrl',
            'url',
            'date',
            'replyCount',
            'retweetCount',
            'likeCount',
            'quoteCount',
            'conversationId',
            'lang',
            'source'])

        ## Check if tweet-text contains keyword
        # if cond. because in some cases column doesn't exist
        if 'text' in tweet_df.columns:
            tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))
        tweet_df['contains_keyword'] = ''
        tweet_df['contains_keyword'] = (tweet_df['rawContent'].str.findall('|'.join(keywords))
                                        .str.join(',')
                                        .replace('', 'none'))
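        # e.g. keywords ['covid', 'vaccine'] on the text 'Covid vaccines are here'
        # yield 'vaccine' (matching is case-sensitive); a tweet without any match
        # gets 'none'. The keywords are joined into a regex, so this assumes they
        # contain no special regex characters.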
        ## Save two versions of the dataset, one with all fields and one without dict fields
        # define filepaths
        csv_path = f'data/tweets/{handle}{suffix}.csv'
        csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'
        # save LONG csv
        tweet_df.to_csv(csv_path2)
        # Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files
        # if cond. because in some cases a column doesn't exist
        if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):
            tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)
        csv_path = f'data/tweets/T{handle}{suffix}.csv'
        # save short csv
        tweet_df.to_csv(csv_path)

        # sleep 1 second to not exceed the API rate limit
        # sleep 1 second to not get blocked because of excessive requests
        time.sleep(1)


# Merge CSV-Files
# (it would also have been possible to build one dataframe with all senators' tweets, but I found this approach more useful)
path_to_tweetdfs = wd + td
## Merge CSV-Files to file_alltweets
# The fastest way is to save the slices separately and then append every file to the output instead of using pandas or anything else.
os.chdir(path_to_tweetdfs)
tweetfiles = glob.glob('*.{}'.format('csv'))

print(tweetfiles)

# save merged csv as two files
df_all_senators = pd.DataFrame()
df_all_senators_long = pd.DataFrame()
for file in tweetfiles:
    if 'LONG' in file:
        df = pd.read_csv(file)
        df_all_senators_long = pd.concat([df, df_all_senators_long])
    else:
        df = pd.read_csv(file)
        df_all_senators = pd.concat([df, df_all_senators])
csv_path = td + 'ALL-SENATORS.csv'
csv_path2 = td + 'ALL-SENATORS-LONG-LONG.csv'
df_all_senators.to_csv(csv_path)
df_all_senators_long.to_csv(csv_path2)
tweetfiles = glob.glob('*.{}'.format('csv'))  # get list of all csv files in folder
# Check whether file_alltweets (previously scraped tweets merged into one file) already exists;
# if it does, remove it from the list so it is not included in the following merge.
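# file_alltweets is not defined in this file; it is assumed to come from config.py
# (imported above via 'from config import *').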
if file_alltweets in tweetfiles:
    tweetfiles.remove(file_alltweets)

# Go through all csv files and merge them into file_alltweets
with open(file_alltweets, "wb") as fout:
    # first file (because of the header):
    with open(tweetfiles[0], "rb") as f:
        fout.write(f.read())
    # other files without the header:
    for file in tweetfiles[1:]:
        with open(file, "rb") as f:
            next(f)  # skip the header
            fout.write(f.read())
os.chdir(wd)