# -*- coding: utf-8 -*-
'''
Created on Thu Jun 8 01:08:21 2023

@author: Michael

The following files are necessary:
    config.py
        Used to configure everything that is needed for this script.
    funs/TimeSlice.py
        The function get_Tslices slices the timespan defined in config.py into N
        slices. This is necessary because Twitter may block requests.
        The script will sleep for 1 second after each slice that was scraped.
    funs/ClearDupes.py
        The function deDupe reads each line of inFile and removes duplicate lines.
        A file outFile is saved without the duplicate lines. Generates
        "keywords.txt".
    data/keywords-raw.txt
        Contains all keywords that are used to detect whether a tweet contains
        information about Covid19.
    data/senators-raw.csv
        Contains the senator dataset converted to csv. It is used to get the
        account names of all senators' Twitter accounts.

Requirements:
    - snscrape 0.6.2.20230321+
    - pandas 2.0+

The script will first import the needed libraries.
This script uses snscrape version 0.6.2.20230321.dev50+g0d824ab, which is
included in 'snscrape/' as a git repository for better reproducibility. Earlier
versions of snscrape will most likely fail to scrape all tweets because of
certain rate limits or other errors that may occur.
config.py will check whether snscrape is already installed. If not, it will try
to install the included version automatically.

How to use:
    - To run the script, first adjust the config.py file.
    - config.py will check whether snscrape is already installed. If not, it
      will try to install the included version automatically.
    - Run the script.
    - The whole script is expected to run without error messages except the
      following:
        'Stopping after 20 empty pages': indicates that no more tweets were
        found and that the script skips to the next slice/account.
        'return empty in {twitter-handle}-sliceX - from XX to XX': no tweets
        were found in that specific time range for that specific Twitter
        account.

The script will scrape tweets for all senators in 'data/senators-raw.csv',
sliced into 6 time periods (to bypass Twitter's limitations). It will check
whether a tweet contains any of the keywords in 'data/keywords.txt' and add an
indicator to the datafile. It will then join all slices and create
'ALL-SENATORS.csv', which is the final output.
'''
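
# config.py (imported below via 'from config import *') is not part of this file.
# Based on how its names are used in this script, it is assumed to provide at
# least: ts_beg, ts_end, no_slices, maxTweets, path_to_tweetdfs, file_alltweets
# and wd. A minimal sketch, with purely hypothetical placeholder values:
#
# ts_beg = '2020-01-01'                # start of the overall timespan
# ts_end = '2021-01-01'                # end of the overall timespan
# no_slices = 6                        # number of time slices
# maxTweets = 5000                     # per-slice cap on fetched tweets
# path_to_tweetdfs = 'data/tweets'     # folder holding the per-slice CSV files
# file_alltweets = 'ALL-SENATORS.csv'  # name of the merged output file (per docstring)
# wd = '.'                             # working directory to return to after merging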

import os
import pandas as pd
import glob
import time

## Import other files
from config import *
import snscrape.modules.twitter as sntwitter
from funs.TimeSlice import get_Tslices
from funs.ClearDupes import deDupe

## Create List of time-period-slices
time_slices = get_Tslices(ts_beg, ts_end, no_slices)
# Print slices
print('Time-period-slices:')
for time_slice in time_slices:
    print(time_slice['suffix'] + ': ' + time_slice['beg_time'] + ' - ' + time_slice['end_time'])
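
# Based on how time_slices is used above and in the scraping loop below, each
# element is assumed to be a dict of the form (hypothetical values):
# {'beg_time': '2020-01-01', 'end_time': '2020-03-01', 'suffix': '-slice1'}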

## Keywords
keywords = []
# Remove duplicate keywords and save all non-duplicates to 'data/keywords.txt'
deDupe('data/keywords-raw.txt', 'data/keywords.txt')
# Read the keywords from the deduplicated file
with open('data/keywords.txt', 'r') as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip()  # Remove the newline character
        keywords.append(keyword)
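
# For reference, a minimal sketch of what funs/ClearDupes.deDupe is assumed to do,
# based on the description in the docstring (read inFile line by line, drop
# duplicate lines, write the rest to outFile); the actual implementation lives in
# funs/ClearDupes.py:
#
# def deDupe(inFile, outFile):
#     seen = set()
#     with open(inFile, 'r') as fin, open(outFile, 'w') as fout:
#         for line in fin:
#             if line not in seen:
#                 seen.add(line)
#                 fout.write(line)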

## Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
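
# 'data/senators-raw.csv' is assumed to contain at least the columns
# 'twitter_handle' and 'alt_handle' (as used above). Note that alt_accounts is
# read here but not referenced again in this script.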

## Scraping
# Iterate over each Twitter account
for handle in accounts:
    # Iterate over each time slice
    for slice_data in time_slices:
        # define slice data variables from time_slices
        ts_beg = slice_data['beg_time']
        ts_end = slice_data['end_time']
        suffix = slice_data['suffix']

        # create empty tweetlist that will be filled with tweets of the current senator
        tweetlist = []

        # status message
        msg = f'trying to fetch tweets for {handle}{suffix}'
        print(msg)

        # Snscrape query:
        query = f'from:{handle} since:{ts_beg} until:{ts_end}'
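        # With hypothetical values, the rendered query would look like
        # 'from:SenatorX since:2020-01-01 until:2020-03-01'; the real handle and
        # dates come from senators-raw.csv and get_Tslices.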
        for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
            if i > maxTweets:
                break
            tweetlist.append([
                tweet.id,
                tweet.user.id,
                tweet.user.username,
                tweet.user.verified,
                tweet.user.created,
                tweet.user.favouritesCount,
                tweet.user.followersCount,
                tweet.user.friendsCount,
                tweet.user.url,
                tweet.rawContent,
                tweet.renderedContent,
                tweet.cashtags,
                tweet.coordinates,
                tweet.hashtags,
                tweet.inReplyToTweetId,
                tweet.inReplyToUser,
                tweet.media,
                tweet.mentionedUsers,
                tweet.links,
                tweet.place,
                tweet.quotedTweet,
                tweet.retweetedTweet,
                tweet.sourceLabel,
                tweet.sourceUrl,
                tweet.url,
                tweet.date,
                tweet.replyCount,
                tweet.retweetCount,
                tweet.likeCount,
                tweet.quoteCount,
                tweet.conversationId,
                tweet.lang,
                tweet.source
            ])
        # Check if no tweets were fetched for the current time slice. If there are
        # none, skip to the next time_slices loop iteration
        if len(tweetlist) == 0:
            msg = f'return empty in {handle}{suffix} - from {ts_beg} to {ts_end}'
            print(msg)
            continue

        # convert to dataframe
        tweet_df = pd.DataFrame(tweetlist, columns=[
            'id',
            'user.id',
            'user.username',
            'user.verified',
            'user.created',
            'user.favouritesCount',
            'user.followersCount',
            'user.friendsCount',
            'user.url',
            'rawContent',
            'renderedContent',
            'cashtags',
            'coordinates',
            'hashtags',
            'inReplyToTweetId',
            'inReplyToUser',
            'media',
            'mentionedUsers',
            'links',
            'place',
            'quotedTweet',
            'retweetedTweet',
            'sourceLabel',
            'sourceUrl',
            'url',
            'date',
            'replyCount',
            'retweetCount',
            'likeCount',
            'quoteCount',
            'conversationId',
            'lang',
            'source'])

        ## Check if tweet-text contains keyword
        tweet_df['contains_keyword'] = ''
        tweet_df['contains_keyword'] = (tweet_df['rawContent'].str.findall('|'.join(keywords))
                                        .str.join(',')
                                        .replace('', 'none'))
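        # Note: str.findall treats the joined keyword string as a regular expression.
        # If any keyword could contain regex metacharacters, escaping them first would
        # be safer, e.g. (sketch): '|'.join(re.escape(kw) for kw in keywords)
        # (this would also require 'import re' at the top of the script).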

        ## Save the dataset for the current account and slice
        # define filepath
        csv_path = f'data/tweets/T{handle}{suffix}.csv'
        # save csv
        tweet_df.to_csv(csv_path)
        # sleep 1 second to avoid being blocked because of excessive requests
        time.sleep(1)

## Merge CSV-Files to file_alltweets
# The fastest way is to save the slices separately and then append every file to the
# output, instead of concatenating with pandas or anything else.
os.chdir(path_to_tweetdfs)
tweetfiles = glob.glob('*.{}'.format('csv'))  # get list of all csv files in folder
# Check if file_alltweets (previously scraped tweets that have been merged into one
# file) exists. If it does, remove it from the list so it is not included in the merge.
if file_alltweets in tweetfiles:
    tweetfiles.remove(file_alltweets)

# Go through all csv files and merge them into file_alltweets
if tweetfiles:  # guard against an empty folder
    with open(file_alltweets, "wb") as fout:
        # first file (kept in full because of the header):
        with open(tweetfiles[0], "rb") as f:
            fout.write(f.read())
        # other files without the header:
        for file in tweetfiles[1:]:
            with open(file, "rb") as f:
                next(f)  # skip the header line
                fout.write(f.read())
os.chdir(wd)
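
# Optional sanity check (a sketch, not executed as part of the pipeline): the merged
# file -- per the docstring, 'ALL-SENATORS.csv' -- could be loaded to verify the merge:
# merged = pd.read_csv(os.path.join(path_to_tweetdfs, file_alltweets))
# print(len(merged), 'tweets in total')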