# -*- coding: utf-8 -*-
"""
Created on Thu Jun 8 01:08:21 2023

@author: Michael

The following files are necessary:
config.py
    Used to configure everything that is needed for this script.
funs/TimeSlice.py
    The function get_Tslices slices the timespan defined in config.py into N
    slices. This is necessary because Twitter may otherwise block requests.
    The script will sleep for 1 second after each slice that was scraped.
funs/ClearDupes.py
    The function deDupe reads each line of inFile and removes duplicate lines.
    A file outFile is saved without the duplicate lines. Generates
    "keywords.txt".
data/keywords-raw.txt
    Contains all keywords that are used to detect whether a tweet contains
    information about Covid19.
data/senators-raw.csv
    Contains the senator dataset converted to csv. It is used to get the
    account names of all senators' Twitter accounts.

Requirements:
- snscrape 0.6.2.20230321+
- pandas 2.0+

The script will first import the needed libraries.
This script uses snscrape version 0.6.2.20230321.dev50+g0d824ab, which is
included in 'snscrape/' as a git repository for better reproducibility. Earlier
versions of snscrape will most likely fail to scrape all tweets because of
certain rate limits or other errors that may occur.
config.py will check whether snscrape is already installed. If not, it will try
to install the included version automatically.

How to use:
- To run the script, first adjust the config.py file.
- config.py checks whether snscrape is installed and, if not, tries to install
  the included version automatically.
- Run the script.
- The whole script is expected to run without error messages, except for the
  following:
  'Stopping after 20 empty pages': indicates that no more tweets were found and
  that the script skips to the next slice/account.
  'return empty in {twitter-handle}-sliceX - from XX to XX': no tweets were
  found in that specific time range for that specific Twitter account.

The script will scrape tweets for all senators in 'data/senators-raw.csv',
sliced into no_slices time periods (to work around Twitter's limitations). It
checks whether a tweet contains any of the keywords in 'data/keywords.txt' and
adds an indicator to the datafile. It then joins all slices and creates
'ALL-SENATORS-TWEETS.csv', which is the final output.
"""

import os
import pandas as pd
import glob
import time
import sys
from datetime import datetime
import concurrent.futures

## Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# Tweet-datafile output directory
td = "data/tweets/"

# Name of file that all tweets will be written to
file_alltweets = "ALL-SENATORS-TWEETS.csv"

path_to_tweetdfs = wd + td

## Define Timespan
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
ts_end = "2023-01-03T00:00:00Z"  # end of scraping
no_slices = 24  # Number of slices / time periods.

# file time format
fTimeFormat = "%Y-%m-%d_%H-%M-%S"

# Maximum number of tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000

# Name of logfile
logfile = wd + "log/log_"

## Install snscrape from the local git repo to make sure that it fits the used version.
# If snscrape is already installed, uncomment the following lines:
"""
import subprocess
os.chdir('snscrape/')
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)
"""

# Columns for tweet dataframe
tweetDFColumns = [
    "id",
    "user.id",
    "user.username",
    "user.verified",
    "user.created",
    "user.favouritesCount",
    "user.followersCount",
    "user.friendsCount",
    "user.url",
    "rawContent",
    "renderedContent",
    "cashtags",
    "coordinates",
    "hashtags",
    "inReplyToTweetId",
    "inReplyToUser",
    "media",
    "mentionedUsers",
    "links",
    "place",
    "quotedTweet",
    "retweetedTweet",
    "sourceLabel",
    "sourceUrl",
    "url",
    "date",
    "replyCount",
    "retweetCount",
    "likeCount",
    "quoteCount",
    "conversationId",
    "lang",
    "source",
]

## Import other files
from funs.TimeSlice import *
from funs.ClearDupes import deDupe
from funs.Scrape import scrapeTweets

# create logfile & log all outputs
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
sys.stderr = open(logfileErrors, "w")
sys.stdout = open(logfilen, "w")
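# From this point on, all regular output goes to the logfile and all errors go to
# the separate error logfile; both file handles are closed at the end of the script.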

## Create List of time-period-slices
time_slices = get_Tslices(ts_beg, ts_end, no_slices)
# Print slices
print("Time-period-slices:")
for tslice in time_slices:
    print(tslice["suffix"] + ": " + tslice["beg_time"] + " - " + tslice["end_time"])
print("---")
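
# For reference: get_Tslices (funs/TimeSlice.py) is expected to return a list of
# dicts with "beg_time", "end_time" and "suffix" keys, as used above. A minimal,
# purely illustrative sketch of such a slicer (assumed suffix scheme "-sliceN";
# not the implementation shipped in funs/):
"""
from datetime import datetime

def get_Tslices_sketch(ts_beg, ts_end, n_slices):
    fmt = "%Y-%m-%dT%H:%M:%SZ"
    beg = datetime.strptime(ts_beg, fmt)
    end = datetime.strptime(ts_end, fmt)
    step = (end - beg) / n_slices  # equal-length timedelta per slice
    return [
        {
            "beg_time": (beg + i * step).strftime(fmt),
            "end_time": (beg + (i + 1) * step).strftime(fmt),
            "suffix": f"-slice{i + 1}",
        }
        for i in range(n_slices)
    ]
"""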

## Keywords
keywords = []
# Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
deDupe("data/keywords-raw.txt", "data/keywords.txt")
# Read the keywords from a file
with open("data/keywords.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip()  # Remove the newline character
        keywords.append(keyword)
print("---")
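
# For reference: deDupe (funs/ClearDupes.py) reads inFile line by line and writes
# only the unique lines to outFile, as described in the docstring above. A minimal,
# purely illustrative sketch of that behaviour (assumed to preserve first-seen
# order; not the implementation shipped in funs/):
"""
def deDupe_sketch(inFile, outFile):
    seen = set()
    with open(inFile, "r") as fin, open(outFile, "w") as fout:
        for line in fin:
            if line not in seen:
                seen.add(line)
                fout.write(line)
"""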

## Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
alt_accounts = [x for x in alt_accounts if str(x) != "nan"]  # remove empty alt_accounts fields
accounts.extend(alt_accounts)

# Print accounts to be scraped
print("Accounts to be scraped:")
for i, acc in enumerate(accounts):  # print 5 accounts per line
    print(f"{acc:^17}", end="")  # twitter handle max length = 15 chars
    if i % 5 == 4:
        print("\n")
print(f"\n{len(accounts)} accounts in total.\n---")

## Scraping
timeStartScrape = datetime.now()
print("Starting scraping at:")
print(timeStartScrape.strftime(fTimeFormat))
print("---")

# Iterate over each Twitter account using multiprocessing
with concurrent.futures.ProcessPoolExecutor() as executor:
    # List to store the scraping tasks
    tasks = []
    for handle in accounts:
        # Iterate over each time slice
        for slice_data in time_slices:
            # Schedule the scraping task
            task = executor.submit(
                scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns
            )
            tasks.append(task)
    # Wait for all tasks to complete
    concurrent.futures.wait(tasks)
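
# Note: each scrapeTweets call is expected to write one CSV per handle/slice into
# the directory given by td, named "Tweets-{handle}{suffix}.csv"; the merge step
# below relies on exactly that naming scheme.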

timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime(fTimeFormat))

## Merge CSV-Files to file_alltweets.
# The fastest way is to save the slices separately and then append every file to the
# output file directly, instead of merging them with pandas or anything else.
os.chdir(path_to_tweetdfs)
# First, check whether all slices are present.
tweetfiles = glob.glob("*.csv")  # get list of all csv files in the folder
AllFilesList = []
for handle in accounts:
    for tslice in time_slices:
        suffix = tslice["suffix"]
        AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
with open(f"{logfile}missing-" + timeStartScrape.strftime(fTimeFormat) + ".txt", "w") as fout:
    for file in AllFilesList:
        if file not in tweetfiles:
            fout.write(f"Missing: {file}.\n")  # log every expected file that was not scraped

# If file_alltweets (previously scraped tweets merged into one file) already exists,
# remove it from the list so it is not included in the following merge.
if file_alltweets in tweetfiles:
    tweetfiles.remove(file_alltweets)

# Go through all csv files and merge them into file_alltweets
if tweetfiles:
    with open(file_alltweets, "wb") as fout:
        # first file (kept in full because of the header):
        with open(tweetfiles[0], "rb") as f:
            fout.write(f.read())
        # remaining files without the header:
        for file in tweetfiles[1:]:
            with open(file, "rb") as f:
                next(f)  # skip the header line
                fout.write(f.read())
os.chdir(wd)

timeEndMerge = datetime.now()
print("---")
print("End of merging at:")
print(timeEndMerge.strftime(fTimeFormat))
print("---")
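
# convertTime comes from funs/TimeSlice.py (imported via *) and is expected to split
# a timedelta into (hours, minutes, seconds), as used below. A minimal, purely
# illustrative sketch of that behaviour (not the implementation shipped in funs/):
"""
def convertTime_sketch(delta):
    total_seconds = int(delta.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return hours, minutes, seconds
"""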

tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
print(
    f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
)
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")

sys.stdout.close()
sys.stderr.close()