# -*- coding: utf-8 -*-
'''
Created on Thu Jun 8 01:08:21 2023

@author: Michael

The following files are necessary:
    config.py
        Used to configure everything that is needed for this script.
    funs/TimeSlice.py
        Function get_Tslices slices the timespan defined in config.py into N
        slices. This is necessary because Twitter may block requests that
        cover too long a timespan. The script sleeps briefly after each slice
        that was scraped.
    funs/ClearDupes.py
        Function deDupe reads each line of inFile and removes duplicate lines.
        A file outFile is saved without the duplicate lines. Generates
        "keywords.txt".
    data/keywords-raw.txt
        Contains all keywords that are used to detect whether a tweet contains
        information about Covid19.
    data/senators-raw.csv
        Contains the senator dataset converted to csv. It is used to get the
        account names of all senators' Twitter accounts.

Requirements:
    - snscrape 0.6.2.20230321+
    - pandas 2.0+

The script will first import the needed libraries.
This script uses snscrape version 0.6.2.20230321.dev50+g0d824ab, which is
included in 'snscrape/' as a git repository for better reproducibility. Earlier
versions of snscrape will most likely fail to scrape all tweets because of
certain rate limits or other errors that may occur.
config.py will check whether snscrape is already installed. If not, it will try
to install the included version automatically.

How to use:
    - To run the script, first adjust the config.py file.
    - config.py will check whether snscrape is already installed. If not, it
      will try to install the included version automatically.
    - Run the script.
    - The whole script is expected to run without error messages except the
      following:
      'Stopping after 20 empty pages': indicates that no more tweets were found
      and that the script skips to the next slice/account.
      'return empty in {twitter-handle}-sliceX - from XX to XX': no tweets were
      found in that specific time range for that specific Twitter account.

The script will scrape tweets for all senators in 'data/senators-raw.csv',
sliced into no_slices time periods (to work around Twitter's limitations). It
will check whether a tweet contains any of the keywords in 'data/keywords.txt'
and add an indicator column to the datafile. It will then join all slices and
create 'ALL-SENATORS-TWEETS.csv', which is the final output.
'''

import os
import pandas as pd
import glob
import time
import sys
from datetime import datetime

## Setup directories
# WD Michael
wd = '/home/michael/Documents/PS/Data/collectTweets/'
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# Tweet-datafile output directory
td = 'data/tweets/'

# Name of file that all tweets will be written to
file_alltweets = 'ALL-SENATORS-TWEETS.csv'

path_to_tweetdfs = wd + td

## Define Timespan
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = '2020-01-01T00:00:00Z'  # start of scraping
ts_end = '2023-01-03T00:00:00Z'  # end of scraping
no_slices = 24  # number of slices / time periods

# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000

# Name of logfile
logfile = 'log/log_'

## Install snscrape from the local git repo to make sure that the version fits.
# If snscrape is not installed yet (e.g. config.py could not install it
# automatically), uncomment the following lines:
'''
import subprocess
os.chdir('snscrape/')
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)
'''
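
# The docstring above states that config.py checks whether snscrape is
# installed and, if not, installs the bundled version automatically.
# config.py is a separate file, so the following is only a minimal sketch of
# how such a check could look (an assumption, not the actual config.py code):
#
#     import importlib.util, subprocess, sys
#     if importlib.util.find_spec('snscrape') is None:
#         # install the bundled snscrape in editable mode from 'snscrape/'
#         subprocess.check_call([sys.executable, '-m', 'pip',
#                                'install', '-e', 'snscrape/'])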

# Columns for tweet dataframe
tweetDFColumns = [
    'id',
    'user.id',
    'user.username',
    'user.verified',
    'user.created',
    'user.favouritesCount',
    'user.followersCount',
    'user.friendsCount',
    'user.url',
    'rawContent',
    'renderedContent',
    'cashtags',
    'coordinates',
    'hashtags',
    'inReplyToTweetId',
    'inReplyToUser',
    'media',
    'mentionedUsers',
    'links',
    'place',
    'quotedTweet',
    'retweetedTweet',
    'sourceLabel',
    'sourceUrl',
    'url',
    'date',
    'replyCount',
    'retweetCount',
    'likeCount',
    'quoteCount',
    'conversationId',
    'lang',
    'source']

## Import other files
import snscrape.modules.twitter as sntwitter
from funs.TimeSlice import *
from funs.ClearDupes import deDupe

# create logfile & log all outputs
logfilen = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
logfileErrors = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '_err' + '.txt'
sys.stderr = open(logfileErrors, 'w')
sys.stdout = open(logfilen, 'w')

## Create List of time-period-slices
time_slices = get_Tslices(ts_beg, ts_end, no_slices)
# Print slices
print('Time-period-slices:')
for tslice in time_slices:
    print(tslice['suffix'] + ': ' + tslice['beg_time'] + ' - ' + tslice['end_time'])
print('---')
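
# get_Tslices is imported from funs/TimeSlice.py, which is not shown here.
# Based on the docstring and on how time_slices is used below, it is assumed
# to split the span [ts_beg, ts_end] into no_slices equal parts and return a
# list of dicts with 'beg_time', 'end_time' and 'suffix' keys. A minimal
# sketch under that assumption (not the actual implementation):
#
#     from datetime import datetime
#     def get_Tslices(ts_beg, ts_end, n, fmt='%Y-%m-%dT%H:%M:%SZ'):
#         beg = datetime.strptime(ts_beg, fmt)
#         end = datetime.strptime(ts_end, fmt)
#         step = (end - beg) / n
#         return [{'beg_time': (beg + i * step).strftime(fmt),
#                  'end_time': (beg + (i + 1) * step).strftime(fmt),
#                  'suffix': f'-slice{i + 1}'}
#                 for i in range(n)]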

## Keywords
keywords = []
# Remove duplicate keywords and save all non-duplicates to 'data/keywords.txt'
deDupe('data/keywords-raw.txt', 'data/keywords.txt')
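
# deDupe is imported from funs/ClearDupes.py (not shown here). Per the
# docstring it reads inFile line by line, drops duplicate lines and writes the
# result to outFile. A minimal sketch under that assumption (not the actual
# implementation):
#
#     def deDupe(inFile, outFile):
#         seen = set()
#         with open(inFile, 'r') as fin, open(outFile, 'w') as fout:
#             for line in fin:
#                 if line not in seen:
#                     seen.add(line)
#                     fout.write(line)
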
# Read the keywords from a file
with open('data/keywords.txt', 'r') as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip()  # Remove the newline character
        keywords.append(keyword)
print('---')

## Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
print('Accounts to be scraped:')
print(accounts)
print(alt_accounts)
print('---')

## Scraping
timeStartScrape = datetime.now()
print("Starting scraping at:")
print(timeStartScrape.strftime('%Y-%m-%d_%H-%M-%S'))
print('---')

# Iterate over each Twitter account
for handle in accounts:
    # Iterate over each time slice
    for slice_data in time_slices:
        # define slice data variables from time_slices
        ts_beg = slice_data['beg_time']
        ts_end = slice_data['end_time']
        suffix = slice_data['suffix']
        tweetFileName = f"Tweets-{handle}{suffix}.csv"

        # create empty tweetlist that will be filled with tweets of current sen
        TweetList = []

        # statusmsg
        print(f'Fetching: {handle:>15}{suffix:<7} - from {ts_beg} to {ts_end}')

        # Snscrape query
        query = f'from:{handle} since:{ts_beg} until:{ts_end}'
        for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
            if i >= maxTweets:
                break
            # get tweet vars from tweetDFColumns and append to singleTweetList,
            # which will then be appended to TweetList. TweetList contains all
            # tweets of the current slice. eval is used because the column
            # names are dotted attribute paths such as 'user.id'.
            singleTweetList = []
            for col in tweetDFColumns:
                singleTweetList.append(eval(f'tweet.{col}'))
            TweetList.append(singleTweetList)

        # Check if no tweets were fetched for the current time slice. If so,
        # create an empty placeholder file and skip to the next slice.
        if len(TweetList) == 0:
            msg = f'return empty in {handle}{suffix} - from {ts_beg} to {ts_end}'
            open(td + tweetFileName, 'a').close()
            print(msg)
            continue

        print(f'{i:<6} tweets scraped for: {handle:>15}{suffix:<7}')

        # convert to dataframe
        tweet_df = pd.DataFrame(TweetList, columns=tweetDFColumns)

        ## Check if tweet-text contains keyword
        # Build one regex alternation from all keywords, find every match in
        # the raw tweet text, join the matches with commas, and mark tweets
        # without any match as 'none'.
        tweet_df['contains_keyword'] = ''
        tweet_df['contains_keyword'] = (tweet_df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', 'none'))

        ## Save the dataset of the current slice
        # define filepath
        csv_path = td + tweetFileName
        # save csv
        tweet_df.to_csv(csv_path)
        # sleep 0.5 seconds to not get blocked because of excessive requests
        time.sleep(0.5)

timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime('%Y-%m-%d_%H-%M-%S'))

## Merge CSV-Files to file_alltweets
# The fastest way is to save the slices separately and then append every file
# to the output file, instead of merging them with pandas.
os.chdir(path_to_tweetdfs)
tweetfiles = glob.glob('*.csv')  # get list of all csv files in folder
# check if file_alltweets (previously scraped tweets that have been merged
# into one file) exists; if it exists, remove it from the list so it is not
# included in the following merge
if file_alltweets in tweetfiles:
    tweetfiles.remove(file_alltweets)

# Go through all csv files and merge them into file_alltweets
if tweetfiles:
    with open(file_alltweets, "wb") as fout:
        # first file (because of the header):
        with open(tweetfiles[0], "rb") as f:
            fout.write(f.read())
        # other files without the header:
        for file in tweetfiles[1:]:
            with open(file, "rb") as f:
                next(f, None)  # skip the header (file may be empty)
                fout.write(f.read())
os.chdir(wd)

timeEndMerge = datetime.now()
print("---")
print("End of merging at:")
print(timeEndMerge.strftime('%Y-%m-%d_%H-%M-%S'))
print("---")
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
print(f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds")
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")

sys.stdout.close()
sys.stderr.close()