Restructures; adds TimeSlice, ClearDupes and more comments.

This commit is contained in:
Michael Beck
2023-06-21 19:07:07 +02:00
parent 2e70d960a5
commit ea7fcc732e
7 changed files with 539 additions and 325 deletions


@@ -1,70 +1,79 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Jun 6 11:40:07 2023
Created on Thu Jun 8 01:08:21 2023
@author: michael
@author: Michael
The following files are necessary:
config.py
Used to configure everything that's needed for this script.
funs/TimeSlice.py
Function get_Tslices slices the timespan defined in config.py into N
slices. This is necessary due to possible blocking of requests by Twitter.
The script will sleep for 1 second after each slice that was scraped.
funs/ClearDupes.py
Function deDupe reads each line of inFile and removes duplicate lines.
A file outFile is saved without the duplicate lines. Generates
"keywords.txt".
data/keywords-raw.txt
Contains all keywords that are used to detect whether a tweet contains
information about Covid19.
data/senators-raw.csv
Contains the senator dataset converted to csv. It is used to get the
account names of all senators' Twitter accounts.
Requirements:
- snscrape 0.6.2.20230321+
- pandas 2.0+
The script will first import needed libraries.
This script uses snscrape version 0.6.2.20230321.dev50+g0d824ab, which is
included in 'snscrape/' as a git repository for better reproducibility. Earlier
versions of snscrape will most likely fail to scrape all tweets because of
rate limits or other errors that may occur.
config.py will check whether snscrape is already installed. If not, it will try
to install the included version automatically.
How to use:
- To run the script, first adjust the config.py file.
- config.py will check whether snscrape is already installed. If not, it will try
to install the included version automatically.
- Run the script.
- The whole script is expected to run without error messages except the
following:
'Stopping after 20 empty pages': indicates that no more tweets were found and
that the script skips to the next slice/account.
'return empty in {twitter-handle}-sliceX - from XX to XX': no tweets were
found in that specific time range for that specific Twitter account.
The script will scrape tweets for all senators in 'data/senators-raw.csv',
sliced into 6 time periods (to bypass Twitter's limitations). It will check whether
a tweet contains any of the keywords in 'data/keywords.txt' and add an indicator
to the datafile. It will then join all slices and create 'ALL-SENATORS.csv',
which is the final output.
'''
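# Illustrative sketch only (not the actual implementation): the snscrape check that
# config.py is described as performing could look roughly like this, assuming the
# bundled repository sits in './snscrape':
#   import importlib.util, subprocess, sys
#   if importlib.util.find_spec('snscrape') is None:
#       subprocess.check_call([sys.executable, '-m', 'pip', 'install', './snscrape'])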
import os
import tweepy
import pandas as pd
import numpy as np
import glob
import time
## Setup directories
# WD Michael
wd = '/home/michael/Documents/PS/Data/collectTweets/'
## Import other files
from config import *
import snscrape.modules.twitter as sntwitter
from funs.TimeSlice import get_Tslices
from funs.ClearDupes import deDupe
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection'
## Create List of time-period-slices
time_slices = get_Tslices(ts_beg, ts_end, no_slices)
# Print slices
print('Time-period-slices:')
for slice in time_slices:
print(slice['suffix'] + ': ' + slice['beg_time'] + ' - ' + slice['end_time'])
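# Illustrative sketch of funs/TimeSlice.get_Tslices (the actual implementation lives
# in funs/TimeSlice.py); it is assumed to split the timespan into N equal slices and
# return dicts with the keys used below ('beg_time', 'end_time', 'suffix'), e.g.:
#   from datetime import datetime
#   def get_Tslices(ts_beg, ts_end, n_slices):  # ts_beg/ts_end assumed ISO date strings
#       beg, end = datetime.fromisoformat(ts_beg), datetime.fromisoformat(ts_end)
#       step = (end - beg) / n_slices
#       return [{'beg_time': (beg + i * step).strftime('%Y-%m-%d'),
#                'end_time': (beg + (i + 1) * step).strftime('%Y-%m-%d'),
#                'suffix': f'-slice{i + 1}'}
#               for i in range(n_slices)]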
# WD Josie
# wd = '/home/michael/Documents/PS/Data/'
# WD Sam
# wd = '/home/michael/Documents/PS/Data/'
# Tweet-datafile directory
td = 'data/tweets/'
os.chdir(wd)
## Setup Api-connection
bearer_token = 'AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc'
client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)
# Define time period of interest
# Define time periods of interest
time_slices = [
{
'start_time': '2020-01-01T00:00:00Z',
'end_time': '2020-06-01T00:00:00Z',
'suffix': '-slice1'
},
{
'start_time': '2020-06-01T00:00:01Z',
'end_time': '2021-01-01T00:00:00Z',
'suffix': '-slice2'
},
{
'start_time': '2021-01-01T00:00:01Z',
'end_time': '2021-06-01T00:00:00Z',
'suffix': '-slice3'
},
{
'start_time': '2021-06-01T00:00:01Z',
'end_time': '2023-01-03T00:00:00Z',
'suffix': '-slice4'
}
]
# gather keywords @chenTrackingSocialMedia2020
# line80 ff: lamsalCoronavirusCOVID19Tweets2020
# Initialize the keywords list
## Keywords
keywords = []
# Remove duplicate keywords and save all non-duplicates to 'data/keywords.txt'
deDupe('data/keywords-raw.txt', 'data/keywords.txt')
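# Illustrative sketch of funs/ClearDupes.deDupe (the actual implementation lives in
# funs/ClearDupes.py); assumed behaviour: write every line of inFile to outFile,
# keeping only the first occurrence of each line:
#   def deDupe(inFile, outFile):
#       seen = set()
#       with open(inFile, 'r') as fin, open(outFile, 'w') as fout:
#           for line in fin:
#               if line not in seen:
#                   seen.add(line)
#                   fout.write(line)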
# Read the keywords from a file
with open('data/keywords.txt', 'r') as file:
lines = file.readlines()
@@ -72,42 +81,21 @@ with open('data/keywords.txt', 'r') as file:
keyword = line.strip() # Remove the newline character
keywords.append(keyword)
tweet_fields = [
'id',
'text',
'attachments',
'author_id',
'context_annotations',
'conversation_id',
'created_at',
'entities',
'geo',
'lang',
'possibly_sensitive',
'public_metrics',
'referenced_tweets',
'reply_settings',
'source',
'withheld',
]
## Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
print(accounts)
print(alt_accounts)
## Scraping
# Iterate over each Twitter account
for handle in accounts:
# Iterate over each time slice
for slice_data in time_slices:
# define slice data variables from time_slices
start_time = slice_data['start_time']
end_time = slice_data['end_time']
ts_beg = slice_data['beg_time']
ts_end = slice_data['end_time']
suffix = slice_data['suffix']
# define tweepy query with twitter handle of current sen
query = f'from:{handle} -is:retweet'
# create empty tweetlist that will be filled with tweets of current sen
tweetlist = []
@@ -115,121 +103,117 @@ for handle in accounts:
msg = f'trying to fetch tweets for {handle}{suffix}'
print(msg)
# Fetch tweets using tweepy Twitter API v2 pagination with retry mechanism
max_attempts = 3 # maximum number of attempts to fetch tweets for a slice
attempt = 1
while attempt <= max_attempts:
try:
tweets = tweepy.Paginator(client.search_all_tweets,
query=query,
tweet_fields=tweet_fields,
start_time=start_time,
end_time=end_time,
max_results=20).flatten(20)
# for each tweet returned...
for tweet in tweets:
# ... add that tweet to tweetlist
tweetlist.append(tweet)
break # exit the retry loop if tweets are successfully fetched
except tweepy.TweepError as e:
# handle rate limit exceeded error
if e.response.status_code == 429:
# get the rate limit reset time from the response headers
reset_time = int(e.response.headers['x-rate-limit-reset'])
current_time = int(time.time())
# calculate the sleep time until the rate limit resets
sleep_time = reset_time - current_time + 1 # add an extra second
# sleep until the rate limit resets
time.sleep(sleep_time)
attempt += 1 # increment the attempt counter
continue # retry the API call
else:
# handle other types of Tweepy errors
print(f'Error occurred: {e}')
break
# Snscrape query:
query = f'from:{handle} since:{ts_beg} until:{ts_end}'
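# e.g. with handle='SenSanders' and a slice from 2020-01-01 to 2020-06-01 (values
# purely illustrative), the query renders as:
# 'from:SenSanders since:2020-01-01 until:2020-06-01'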
for i,tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
if i>maxTweets:
break
tweetlist.append([
tweet.id,
tweet.user.id,
tweet.user.username,
tweet.user.verified,
tweet.user.created,
tweet.user.favouritesCount,
tweet.user.followersCount,
tweet.user.friendsCount,
tweet.user.url,
tweet.rawContent,
tweet.renderedContent,
tweet.cashtags,
tweet.coordinates,
tweet.hashtags,
tweet.inReplyToTweetId,
tweet.inReplyToUser,
tweet.media,
tweet.mentionedUsers,
tweet.links,
tweet.place,
tweet.quotedTweet,
tweet.retweetedTweet,
tweet.sourceLabel,
tweet.sourceUrl,
tweet.url,
tweet.date,
tweet.replyCount,
tweet.retweetCount,
tweet.likeCount,
tweet.quoteCount,
tweet.conversationId,
tweet.lang,
tweet.source
])
# If no tweets were fetched for the current time slice, skip to the next iteration of the time_slices loop
if len(tweetlist) == 0:
msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'
msg = f'return empty in {handle}{suffix} - from {ts_beg} to {ts_end}'
print(msg)
continue
# convert to dataframe
tweet_df = pd.DataFrame(tweetlist)
# add handle column as api only provides user-ids
tweet_df['handle'] = handle
## Extract referenced_tweet info from column
tweet_df['referenced_tweet_type'] = None
tweet_df['referenced_tweet_id'] = None
# if cond. because in some cases column doesn't exist
if 'referenced_tweets' in tweet_df.columns:
for index, row in tweet_df.iterrows():
referenced_tweets = row['referenced_tweets']
if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
referenced_tweet = referenced_tweets[0]
referenced_tweet_type = referenced_tweet['type']
referenced_tweet_id = referenced_tweet['id']
tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type
tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id
tweet_df = pd.DataFrame(tweetlist, columns=[
'id',
'user.id',
'user.username',
'user.verified',
'user.created',
'user.favouritesCount',
'user.followersCount',
'user.friendsCount',
'user.url',
'rawContent',
'renderedContent',
'cashtags',
'coordinates',
'hashtags',
'inReplyToTweetId',
'inReplyToUser',
'media',
'mentionedUsers',
'links',
'place',
'quotedTweet',
'retweetedTweet',
'sourceLabel',
'sourceUrl',
'url',
'date',
'replyCount',
'retweetCount',
'likeCount',
'quoteCount',
'conversationId',
'lang',
'source'])
## Check if tweet-text contains keyword
# if cond. because in some cases column doesn't exist
if 'text' in tweet_df.columns:
tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))
tweet_df['contains_keyword'] = ''
tweet_df['contains_keyword'] = (tweet_df['rawContent'].str.findall('|'.join(keywords))
.str.join(',')
.replace('', 'none'))
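# e.g. if keywords contains 'covid' and 'vaccine', a tweet mentioning both once gets
# contains_keyword == 'covid,vaccine'; a tweet matching no keyword gets 'none'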
## Save two versions of the dataset, one with all fields and one without dict fields
# define filepaths
csv_path = f'data/tweets/{handle}{suffix}.csv'
csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'
# save LONG csv
tweet_df.to_csv(csv_path2)
# Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files
# if cond. because in some cases column doesn't exist
if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):
tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)
csv_path = f'data/tweets/T{handle}{suffix}.csv'
# save short csv
tweet_df.to_csv(csv_path)
# sleep 1 second to not exceed the API rate limit
# sleep 1 second to not get blocked because of excessive requests
time.sleep(1)
# Merge CSV-Files
# (building a single dataframe with all senators' tweets would also have been possible, but I found this approach more useful)
path_to_tweetdfs = wd + td
## Merge CSV-Files to file_alltweets
# The fastest way is to save the slices separately and then append each file to the output, instead of using pandas or anything else.
os.chdir(path_to_tweetdfs)
tweetfiles = glob.glob('*.{}'.format('csv'))
print(tweetfiles)
# save merged csv as two files
df_all_senators = pd.DataFrame()
df_all_senators_long = pd.DataFrame()
for file in tweetfiles:
if 'LONG' in file:
df = pd.read_csv(file)
df_all_senators_long = pd.concat([df, df_all_senators_long])
else:
df = pd.read_csv(file)
df_all_senators = pd.concat([df, df_all_senators])
csv_path = td + 'ALL-SENATORS.csv'
csv_path2 = td + 'ALL-SENATORS-LONG-LONG.csv'
df_all_senators.to_csv(csv_path)
df_all_senators_long.to_csv(csv_path2)
tweetfiles = glob.glob('*.{}'.format('csv')) # get list of all csv files in folder
# If file_alltweets (previously scraped tweets merged into one file) already exists, remove it from the list so it is not included in the following merge
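# (file_alltweets is assumed to be defined in config.py, imported above via 'from config import *')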
if file_alltweets in tweetfiles:
tweetfiles.remove(file_alltweets)
# Go through all csv files and merge them into file_alltweets
with open(file_alltweets,"wb") as fout:
# first file (because of the header):
with open(tweetfiles[0], "rb") as f:
fout.write(f.read())
# other files without the header:
for file in tweetfiles[1:]:
with open(file, "rb") as f:
next(f) # skip the header
fout.write(f.read())
os.chdir(wd)