# -*- coding: utf-8 -*-
'''
Created on Thu Jun 8 01:08:21 2023

@author: Michael

The following files are necessary:
    config.py
        Used to configure everything that is needed for this script.
    funs/TimeSlice.py
        Function get_Tslices slices the timespan defined in config.py into N
        slices. This is necessary because Twitter may block requests that
        cover too long a timespan. The script sleeps briefly after each slice
        that was scraped.
    funs/ClearDupes.py
        Function deDupe reads each line of inFile and removes duplicate lines.
        A file outFile is saved without the duplicate lines. Generates
        "keywords.txt".
    data/keywords-raw.txt
        Contains all keywords that are used to detect whether a tweet contains
        information about Covid19.
    data/senators-raw.csv
        Contains the senator dataset converted to csv. It is used to get the
        account names of all senators' Twitter accounts.

Requirements:
    - snscrape 0.6.2.20230321+
    - pandas 2.0+

The script will first import the needed libraries.
This script uses snscrape version 0.6.2.20230321.dev50+g0d824ab, which is
included in 'snscrape/' as a git repository for better reproducibility. Earlier
versions of snscrape will most likely fail to scrape all tweets because of
certain rate limits or other errors that may occur.
config.py will check whether snscrape is already installed. If not, it will try
to install the included version automatically.

How to use:
    - To run the script, first adjust the config.py file.
    - config.py will check whether snscrape is already installed. If not, it
      will try to install the included version automatically.
    - Run the script.
    - The whole script is expected to run without error messages except the
      following:
      'Stopping after 20 empty pages': indicates that no more tweets were found
      and that the script skips to the next slice/account.
      'return empty in {twitter-handle}-sliceX - from XX to XX': no tweets were
      found in that specific time range for that specific Twitter account.

The script will scrape tweets for all senators in 'data/senators-raw.csv',
sliced into no_slices time periods (to work around Twitter's limitations). It
will check whether a tweet contains any of the keywords in 'data/keywords.txt'
and add an indicator column to the datafile. It will then join all slices and
create 'ALL-SENATORS-TWEETS.csv', which is the final output.
'''

import os
import pandas as pd
import glob
import time
import sys
from datetime import datetime

## Setup directories
# WD Michael
wd = '/home/michael/Documents/PS/Data/collectTweets/'
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# Tweet-datafile output directory
td = 'data/tweets/'

# Name of file that all tweets will be written to
file_alltweets = 'ALL-SENATORS-TWEETS.csv'

path_to_tweetdfs = wd + td

## Define Timespan
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = '2020-01-01T00:00:00Z'  # start of scraping
ts_end = '2023-01-03T00:00:00Z'  # end of scraping
no_slices = 24  # number of slices / time periods

# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000

# Name of logfile
logfile = 'log/log_'

## Install snscrape from the local git repo to make sure that the version fits.
# If snscrape is not installed yet (e.g. config.py could not install it
# automatically), uncomment the following lines:
'''
import subprocess
os.chdir('snscrape/')
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)
'''
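
# The docstring above states that config.py checks whether snscrape is
# installed and, if not, installs the bundled version automatically.
# config.py is a separate file, so the following is only a minimal sketch of
# how such a check could look (an assumption, not the actual config.py code):
#
#     import importlib.util, subprocess, sys
#     if importlib.util.find_spec('snscrape') is None:
#         # install the bundled snscrape in editable mode from 'snscrape/'
#         subprocess.check_call([sys.executable, '-m', 'pip',
#                                'install', '-e', 'snscrape/'])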

# Columns for tweet dataframe
tweetDFColumns = [
    'id',
    'user.id',
    'user.username',
    'user.verified',
    'user.created',
    'user.favouritesCount',
    'user.followersCount',
    'user.friendsCount',
    'user.url',
    'rawContent',
    'renderedContent',
    'cashtags',
    'coordinates',
    'hashtags',
    'inReplyToTweetId',
    'inReplyToUser',
    'media',
    'mentionedUsers',
    'links',
    'place',
    'quotedTweet',
    'retweetedTweet',
    'sourceLabel',
    'sourceUrl',
    'url',
    'date',
    'replyCount',
    'retweetCount',
    'likeCount',
    'quoteCount',
    'conversationId',
    'lang',
    'source']

## Import other files
import snscrape.modules.twitter as sntwitter
from funs.TimeSlice import *
from funs.ClearDupes import deDupe

# create logfile & log all outputs
logfilen = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
logfileErrors = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '_err' + '.txt'
sys.stderr = open(logfileErrors, 'w')
sys.stdout = open(logfilen, 'w')

## Create List of time-period-slices
time_slices = get_Tslices(ts_beg, ts_end, no_slices)
# Print slices
print('Time-period-slices:')
for tslice in time_slices:
    print(tslice['suffix'] + ': ' + tslice['beg_time'] + ' - ' + tslice['end_time'])
print('---')
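
# get_Tslices is imported from funs/TimeSlice.py, which is not shown here.
# Based on the docstring and on how time_slices is used below, it is assumed
# to split the span [ts_beg, ts_end] into no_slices equal parts and return a
# list of dicts with 'beg_time', 'end_time' and 'suffix' keys. A minimal
# sketch under that assumption (not the actual implementation):
#
#     from datetime import datetime
#     def get_Tslices(ts_beg, ts_end, n, fmt='%Y-%m-%dT%H:%M:%SZ'):
#         beg = datetime.strptime(ts_beg, fmt)
#         end = datetime.strptime(ts_end, fmt)
#         step = (end - beg) / n
#         return [{'beg_time': (beg + i * step).strftime(fmt),
#                  'end_time': (beg + (i + 1) * step).strftime(fmt),
#                  'suffix': f'-slice{i + 1}'}
#                 for i in range(n)]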

## Keywords
keywords = []
# Remove duplicate keywords and save all non-duplicates to 'data/keywords.txt'
deDupe('data/keywords-raw.txt', 'data/keywords.txt')
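
# deDupe is imported from funs/ClearDupes.py (not shown here). Per the
# docstring it reads inFile line by line, drops duplicate lines and writes the
# result to outFile. A minimal sketch under that assumption (not the actual
# implementation):
#
#     def deDupe(inFile, outFile):
#         seen = set()
#         with open(inFile, 'r') as fin, open(outFile, 'w') as fout:
#             for line in fin:
#                 if line not in seen:
#                     seen.add(line)
#                     fout.write(line)
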
# Read the keywords from a file
with open('data/keywords.txt', 'r') as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip()  # Remove the newline character
        keywords.append(keyword)
print('---')

## Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
print('Accounts to be scraped:')
print(accounts)
print(alt_accounts)
print('---')

## Scraping
timeStartScrape = datetime.now()
print("Starting scraping at:")
print(timeStartScrape.strftime('%Y-%m-%d_%H-%M-%S'))
print('---')

# Iterate over each Twitter account
for handle in accounts:
    # Iterate over each time slice
    for slice_data in time_slices:
        # define slice data variables from time_slices
        ts_beg = slice_data['beg_time']
        ts_end = slice_data['end_time']
        suffix = slice_data['suffix']
        tweetFileName = f"Tweets-{handle}{suffix}.csv"

        # create empty tweetlist that will be filled with tweets of current sen
        TweetList = []

        # statusmsg
        print(f'Fetching: {handle:>15}{suffix:<7} - from {ts_beg} to {ts_end}')

        # Snscrape query
        query = f'from:{handle} since:{ts_beg} until:{ts_end}'
        for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
            if i >= maxTweets:
                break
            # get tweet vars from tweetDFColumns and append to singleTweetList,
            # which will then be appended to TweetList. TweetList contains all
            # tweets of the current slice. eval is used because the column
            # names are dotted attribute paths such as 'user.id'.
            singleTweetList = []
            for col in tweetDFColumns:
                singleTweetList.append(eval(f'tweet.{col}'))
            TweetList.append(singleTweetList)

        # Check if no tweets were fetched for the current time slice. If so,
        # create an empty placeholder file and skip to the next slice.
        if len(TweetList) == 0:
            msg = f'return empty in {handle}{suffix} - from {ts_beg} to {ts_end}'
            open(td + tweetFileName, 'a').close()
            print(msg)
            continue

        print(f'{i:<6} tweets scraped for: {handle:>15}{suffix:<7}')

        # convert to dataframe
        tweet_df = pd.DataFrame(TweetList, columns=tweetDFColumns)

        ## Check if tweet-text contains keyword
        # Build one regex alternation from all keywords, find every match in
        # the raw tweet text, join the matches with commas, and mark tweets
        # without any match as 'none'.
        tweet_df['contains_keyword'] = ''
        tweet_df['contains_keyword'] = (tweet_df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', 'none'))

        ## Save the dataset of the current slice
        # define filepath
        csv_path = td + tweetFileName
        # save csv
        tweet_df.to_csv(csv_path)
        # sleep 0.5 seconds to not get blocked because of excessive requests
        time.sleep(0.5)

timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime('%Y-%m-%d_%H-%M-%S'))

## Merge CSV-Files to file_alltweets
# The fastest way is to save the slices separately and then append every file
# to the output file, instead of merging them with pandas.
os.chdir(path_to_tweetdfs)
tweetfiles = glob.glob('*.csv')  # get list of all csv files in folder
# check if file_alltweets (previously scraped tweets that have been merged
# into one file) exists; if it exists, remove it from the list so it is not
# included in the following merge
if file_alltweets in tweetfiles:
    tweetfiles.remove(file_alltweets)

# Go through all csv files and merge them into file_alltweets
if tweetfiles:
    with open(file_alltweets, "wb") as fout:
        # first file (because of the header):
        with open(tweetfiles[0], "rb") as f:
            fout.write(f.read())
        # other files without the header:
        for file in tweetfiles[1:]:
            with open(file, "rb") as f:
                next(f, None)  # skip the header (file may be empty)
                fout.write(f.read())
os.chdir(wd)

timeEndMerge = datetime.now()
print("---")
print("End of merging at:")
print(timeEndMerge.strftime('%Y-%m-%d_%H-%M-%S'))
print("---")
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
print(f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds")
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")

sys.stdout.close()
sys.stderr.close()