adds senator data scraper

Michael Beck 2023-06-23 23:53:31 +02:00
parent 90d5501ec8
commit 71e10a62d3
2 changed files with 218 additions and 1 deletions

collectSenData.py (new file, 166 lines)

@@ -0,0 +1,166 @@
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 23 21:49:11 2023
@author: Michael
collectSenData.py scrapes the accounts of senators for the following data: the
number of followers, the number of users the twitter account is following,
and how long the twitter account has existed.
# Requirements:
- snscrape 0.6.2.20230321+
- pandas 2.0+
# IMPORTANT:
This script uses snscrape Version 0.6.2.20230321.dev50+g0d824ab which is
included in 'snscrape/' as a git repository for better reproducibility. Earlier
versions of snscrape will most likely fail to scrape all tweets because of
certain rate limits or other errors that may occur.
Install snscrape from the local git repo to make sure that it matches the version used here.
If snscrape shall be installed from the local repo, uncomment the following lines:
import subprocess
os.chdir('snscrape/')
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)
# How to use:
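Set the working directory `wd` and the data directories below, make sure that
data/IN/senators-raw.csv (with the columns 'twitter_handle' and 'alt_handle') is
present, then run the script. The collected account data is written to
data/OUT/ALL-SENATORS.csv; log files are written to log/.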
"""
import os
import pandas as pd
import glob
import time
import sys
from datetime import datetime
import concurrent.futures
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Senator-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS.csv"
# don't change this one
senCSVPath = wd + ud + senCSV
# Name of logfile
logfile = wd+"log/UserLog_"
###################
# Define Timespan & time-format
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = "2020-01-01T00:00:00Z" # start of scraping
ts_end = "2023-01-03T00:00:00Z" # end of scraping
no_slices = 24 # Number of slices / time periods.
# file time format
fTimeFormat = "%Y-%m-%d_%H-%M-%S"
# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000
# Columns for the user dataframe; these are attributes of the user object attached to
# snscrape.modules.twitter.Tweet:
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
# Sub-parameters are read via attribute access, e.g. the user id is available as user.id
# (see the example below the list).
userDFColumns = [
"id",
"username",
"followersCount",
"friendsCount",
"verified",
"created"
]
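# A minimal illustration (not executed here) of how these column names are used in
# funs/Scrape.py: each entry is read as an attribute of the snscrape User object, e.g.
#   followers = getattr(tweet.user, "followersCount")  # equivalent to tweet.user.followersCount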
#############################################################################
################## do NOT change anything below this line ###################
#############################################################################
from funs.Scrape import scrapeUsers, getHandles, printHandles
from funs.TimeSlice import convertTime
###################
# Create logfile & log all outputs
# There are three logfile types to be found in /log; they should be self-explanatory.
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".log"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".log"
sys.stderr = open(logfileErrors, "w")
sys.stdout = open(logfilen, "w")
###################
# Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = getHandles(di)
# Print accounts to be scraped
print(printHandles(accounts))
###################
# Scraping
# report time:
timeStartScrape = datetime.now()
print("Starting scraping at:")
print(timeStartScrape.strftime(fTimeFormat))
print("---")
# Iterate over each Twitter account using multiprocessing
listUsers = []
with concurrent.futures.ProcessPoolExecutor() as executor:
    # List to store the scraping tasks
    tasks = []
    for handle in accounts:
        # Schedule the scraping task
        task = executor.submit(scrapeUsers, handle, userDFColumns)
        tasks.append(task)
    # Wait for all tasks to complete and retrieve results
    for task in concurrent.futures.as_completed(tasks):
        result = task.result()
        listUsers.append(result)
dfUsers = pd.DataFrame(listUsers, columns=userDFColumns)
dfUsers.to_csv(senCSVPath, encoding='utf-8')
# report time:
timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime(fTimeFormat))
# Report timing info.
timeEndMerge = datetime.now()
print("---")
print("End of merge at:")
print(timeEndMerge.strftime(fTimeFormat))
print("---")
# calculate times:
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # total execution time
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) # scraping time
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) # merge time
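# convertTime() comes from funs/TimeSlice.py (not part of this diff); judging from its use
# here it is expected to split a timedelta into whole hours, minutes and seconds, roughly:
#   s = int(delta.total_seconds()); return s // 3600, (s % 3600) // 60, s % 60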
print(
f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
)
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
print(listUsers)
# close connection to logfiles.
sys.stdout.close()
sys.stderr.close()

funs/Scrape.py

@@ -64,3 +64,54 @@ def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix,
    tweet_df.to_csv(csv_path, encoding='utf-8')
    # sleep 0.5 seconds to not get blocked because of excessive requests
    time.sleep(0.5)
def getHandles(di):
    """Grabs accounts from senators-raw.csv (expected layout sketched below this function).

    Args:
        di (str): path to the directory containing senators-raw.csv

    Returns:
        list: list containing str of senator account handles
    """
    accounts = pd.read_csv(f"{di}senators-raw.csv")["twitter_handle"].tolist()
    alt_accounts = pd.read_csv(f"{di}senators-raw.csv")["alt_handle"].tolist()
    alt_accounts = [x for x in alt_accounts if str(x) != 'nan']  # remove empty alt_handle fields
    accounts.extend(alt_accounts)
    return accounts
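# A hypothetical sketch of the expected senators-raw.csv layout, based solely on the
# columns read in getHandles() above (real files may contain further columns):
#   twitter_handle,alt_handle
#   SomeSenator,SomeSenatorAlt
#   AnotherSenator,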
def printHandles(accounts):
    """Returns a string listing all accounts in a readable way.

    Args:
        accounts (list): list of str with handles

    Returns:
        str: text that can be written to a txt file
    """
    txt = ["Accounts to be scraped:\n"]
    for i, acc in enumerate(accounts):  # print 5 accounts per line
        txt.append(f"{acc:^17}")  # twitter handle max length = 15 chars
        if i % 5 == 4:
            txt.append(" \n")
    txt.append(f"\n{len(accounts)} accounts in total.")
    return ''.join(txt)
def scrapeUsers(handle, userDFColumns, maxTweets=1):
    """Scrapes account data (followers, friends, creation date, ...) for a single handle.

    Args:
        handle (str): twitter handle to scrape
        userDFColumns (list): attributes of the snscrape User object to collect
        maxTweets (int): number of tweets to inspect for the user data

    Returns:
        list: values of the requested user attributes, in the order of userDFColumns
    """
    currentTime = datetime.now()
    userList = []
    print(f'{str(currentTime):<30} Fetching: {handle:>15}')
    query = f'from:{handle}'
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        if i >= maxTweets:
            break
        # Get user data and append to userList
        userList = []
        for col in userDFColumns:
            userList.append(getattr(tweet.user, col))
        # A dataframe could be created here via pd.DataFrame(userList, columns=userDFColumns)
    return userList