# -*- coding: utf-8 -*-
"""
Created on Thu Jun 23 21:49:11 2023
@author: Michael
collectSenData.py scrapes the Twitter accounts of senators for the following
data: the number of followers, the number of accounts the senator is
following, and how long the Twitter account has existed.
# Requirements:
- snscrape 0.6.2.20230321+
- pandas 2.0+
# IMPORTANT:
This script uses snscrape Version 0.6.2.20230321.dev50+g0d824ab which is
included in 'snscrape/' as a git repository for better reproducibility. Earlier
versions of snscrape will most likely fail to scrape all tweets because of
certain rate limits or other errors that may occur.
Install snscrape from the local git repo to make sure the installed version
matches the one used here. To install snscrape from the local repo, uncomment
the following lines (os, sys and wd are defined further down in the script):
import subprocess
os.chdir('snscrape/')
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)
# How to use:
Adjust the working directory, the data paths and (if needed) the timespan in
the configuration section below, then run the script. The collected user data
is written to data/OUT/ALL-SENATORS.csv and the logs to log/.
"""
import os
import pandas as pd
import glob
import time
import sys
from datetime import datetime
import concurrent.futures
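# Optional install helper (a minimal sketch, not part of the original flow):
# mirrors the docstring instructions for installing the bundled snscrape in
# editable mode. Kept commented out; it assumes the script is started from the
# project root where the 'snscrape/' checkout lives.
# import subprocess
# subprocess.check_call(
#     [sys.executable, "-m", "pip", "install", "-e", "./snscrape/"]
# )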
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS.csv"
# don't change this one
senCSVPath = wd + ud + senCSV
# Name of logfile
logfile = wd+"log/UserLog_"
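# With the settings above, senCSVPath resolves to <wd>data/OUT/ALL-SENATORS.csv
# and the logfiles are written to <wd>log/.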
###################
# Define Timespan & time-format
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = "2020-01-01T00:00:00Z" # start of scraping
ts_end = "2023-01-03T00:00:00Z" # end of scraping
no_slices = 24 # Number of slices / time periods.
# file time format
fTimeFormat = "%Y-%m-%d_%H-%M-%S"
# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000
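# For reference: ts_beg/ts_end use strftime codes; they can be parsed back into
# datetime objects with e.g. datetime.strptime(ts_beg, "%Y-%m-%dT%H:%M:%SZ").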
# Columns for the user dataframe. These correspond to attributes of
# snscrape.modules.twitter.User (see the Tweet/User docs):
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
# Nested attributes can be accessed with dot notation, e.g. the user id of a
# tweet via tweet.user.id.
userDFColumns = [
"id",
"username",
"followersCount",
"friendsCount",
"verified",
"created"
]
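# For illustration only (a sketch; the actual scraping lives in
# funs.Scrape.scrapeUsers): a row matching userDFColumns could be built from a
# snscrape User object roughly like this, where `handle` is a Twitter handle:
#   import snscrape.modules.twitter as sntwitter
#   user = next(sntwitter.TwitterUserScraper(handle).get_items()).user
#   row = [getattr(user, col) for col in userDFColumns]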
#############################################################################
################## do NOT change anything below this line ###################
#############################################################################
from funs.Scrape import scrapeUsers, getHandles, printHandles
from funs.TimeSlice import convertTime
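# The helpers above are expected to behave as follows (see funs/ for the actual
# implementations): getHandles(di) reads the senators datafile(s) in data/IN/
# and returns a flat list of Twitter handles including alt-accounts;
# printHandles() renders that list as a printable summary; scrapeUsers(handle,
# userDFColumns) returns one row of user data per handle; convertTime() splits
# a timedelta into hours, minutes and seconds.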
###################
# Create logfile & log all outputs
# there are three logfile types to be found in /log.
# should be self-explanatory.
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".log"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".log"
sys.stderr = open(logfileErrors, "w")
sys.stdout = open(logfilen, "w")
###################
# Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = getHandles(di)
# Print accounts to be scraped
print(printHandles(accounts))
###################
# Scraping
# report time:
timeStartScrape = datetime.now()
print("Starting scraping at:")
print(timeStartScrape.strftime(fTimeFormat))
print("---")
listUsers = []
# Iterate over each Twitter account using multiprocessing
with concurrent.futures.ProcessPoolExecutor() as executor:
    # List to store the scraping tasks
    tasks = []
    for handle in accounts:
        # Schedule the scraping task
        task = executor.submit(scrapeUsers, handle, userDFColumns)
        tasks.append(task)
    # Wait for all tasks to complete and retrieve results
    for task in concurrent.futures.as_completed(tasks):
        result = task.result()
        listUsers.append(result)
dfUsers = pd.DataFrame(listUsers, columns=userDFColumns)
dfUsers.to_csv(senCSVPath, encoding='utf-8')
# report time:
timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime(fTimeFormat))
# Report timing info.
timeEndMerge = datetime.now()
print("---")
print("End of merging/writing at:")
print(timeEndMerge.strftime(fTimeFormat))
print("---")
# calculate times:
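# convertTime (funs.TimeSlice) is assumed to decompose a timedelta into whole
# hours, minutes and seconds, roughly along the lines of:
#   total = int(delta.total_seconds())
#   hours, rest = divmod(total, 3600)
#   minutes, seconds = divmod(rest, 60)
#   return hours, minutes, seconds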
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # total execution time
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) # scraping time
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) # merge time
print(
    f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
)
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
print(listUsers)
# close connection to logfiles.
sys.stdout.close()
sys.stderr.close()