From 71e10a62d370c1a5561035b1161fb71ad72b3dc4 Mon Sep 17 00:00:00 2001
From: Michael Beck
Date: Fri, 23 Jun 2023 23:53:31 +0200
Subject: [PATCH] adds senator data scraper

---
 collectSenData.py | 166 ++++++++++++++++++++++++++++++++++++++++++++++
 funs/Scrape.py    |  53 ++++++++++++++-
 2 files changed, 218 insertions(+), 1 deletion(-)
 create mode 100644 collectSenData.py

diff --git a/collectSenData.py b/collectSenData.py
new file mode 100644
index 0000000..89b9574
--- /dev/null
+++ b/collectSenData.py
@@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jun 23 21:49:11 2023
+
+@author: Michael
+
+collectSenData.py scrapes the accounts of senators for the following data:
+the number of followers, the number of users the account is following, and
+how long the account has existed.
+
+# Requirements:
+ - snscrape 0.6.2.20230321+
+ - pandas 2.0+
+
+# IMPORTANT:
+This script uses snscrape Version 0.6.2.20230321.dev50+g0d824ab, which is
+included in 'snscrape/' as a git repository for better reproducibility.
+Earlier versions of snscrape will most likely fail to scrape all tweets
+because of certain rate limits or other errors that may occur.
+Install snscrape from the local git repo to make sure that it matches the
+version used here. If snscrape shall be installed from the local repo,
+uncomment the following lines (os, sys and wd are defined in this script):
+
+import subprocess
+os.chdir('snscrape/')
+subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
+os.chdir(wd)
+
+# How to use:
+ - adjust the working directory wd and, if needed, the directory variables
+   below,
+ - make sure senators-raw.csv (with the columns 'twitter_handle' and
+   'alt_handle') is located in the input directory,
+ - run the script: python collectSenData.py
+The collected user data is written to one csv file, one row per account.
+"""
+
+import os
+import pandas as pd
+import glob
+import time
+import sys
+from datetime import datetime
+import concurrent.futures
+
+###################
+# Setup directories
+# WD Michael
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# datafile input directory
+di = "data/IN/"
+
+# User-datafile output directory
+ud = "data/OUT/"
+
+# Name of the file that all senator data will be written to
+senCSV = "ALL-SENATORS.csv"
+
+# don't change this one
+senCSVPath = wd + ud + senCSV
+
+# Name of logfile
+logfile = wd + "log/UserLog_"
+
+###################
+# Define Timespan & time-format
+# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
+ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
+ts_end = "2023-01-03T00:00:00Z"  # end of scraping
+no_slices = 24  # Number of slices / time periods.
+# (ts_beg, ts_end and no_slices are kept for consistency with the tweet
+# scraper; they are not used in this script.)
+
+# file time format
+fTimeFormat = "%Y-%m-%d_%H-%M-%S"
+
+# Maximum tweets to be scraped by snscrape. Can be left untouched; the user
+# scraper below only inspects the latest tweet of each account.
+maxTweets = 5000
+
+# Columns for the user dataframe, read from the user attribute of a tweet.
+# Parameters of snscrape.modules.twitter.Tweet:
+# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
+# Sub-parameters are accessed as attributes, e.g. the user id via user.id.
+userDFColumns = [
+    "id",
+    "username",
+    "followersCount",
+    "friendsCount",
+    "verified",
+    "created"
+]
+
+#############################################################################
+################## do NOT change anything below this line ###################
+#############################################################################
+
+from funs.Scrape import scrapeUsers, getHandles, printHandles
+from funs.TimeSlice import convertTime
+
+###################
+# Create logfiles & log all outputs
+# Two logfiles are written to log/: a main logfile capturing stdout and an
+# "_err" logfile capturing stderr.
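+# NOTE: after the two redirections below, all print() output lands in the
+# main logfile and all tracebacks land in the error logfile; nothing is
+# shown on the console while the script runs.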
+timestamp = datetime.now().strftime(fTimeFormat)  # one timestamp for both files
+logfilen = logfile + timestamp + ".log"
+logfileErrors = logfile + timestamp + "_err.log"
+sys.stderr = open(logfileErrors, "w")
+sys.stdout = open(logfilen, "w")
+
+###################
+# Senator Accounts
+# Get accounts & alt-accounts from Senators-Datafile
+accounts = getHandles(di)
+
+# Print accounts to be scraped
+print(printHandles(accounts))
+
+###################
+# Scraping
+# report time:
+timeStartScrape = datetime.now()
+print("Starting scraping at:")
+print(timeStartScrape.strftime(fTimeFormat))
+print("---")
+
+# Iterate over each Twitter account using multiprocessing
+listUsers = []
+with concurrent.futures.ProcessPoolExecutor() as executor:
+    # List to store the scraping tasks
+    tasks = []
+    for handle in accounts:
+        # Schedule the scraping task
+        task = executor.submit(scrapeUsers, handle, userDFColumns)
+        tasks.append(task)
+
+    # Wait for all tasks to complete and retrieve results
+    for task in concurrent.futures.as_completed(tasks):
+        result = task.result()
+        listUsers.append(result)
+
+# report time:
+timeEndScrape = datetime.now()
+print("---")
+print("End of scraping at:")
+print(timeEndScrape.strftime(fTimeFormat))
+
+# Create dataframe from the results and save it as csv
+dfUsers = pd.DataFrame(listUsers, columns=userDFColumns)
+dfUsers.to_csv(senCSVPath, encoding="utf-8")
+
+# Report timing info.
+timeEndSave = datetime.now()
+print("---")
+print("End of script at:")
+print(timeEndSave.strftime(fTimeFormat))
+print("---")
+# calculate times:
+tThours, tTminutes, tTseconds = convertTime(timeEndSave - timeStartScrape)  # total execution time
+tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)  # scraping time
+tMhours, tMminutes, tMseconds = convertTime(timeEndSave - timeEndScrape)  # saving time
+print(
+    f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
+)
+print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
+print(f"Time saving: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
+
+# log the scraped data for reference
+print(listUsers)
+# close connection to logfiles.
+sys.stdout.close()
+sys.stderr.close()
diff --git a/funs/Scrape.py b/funs/Scrape.py
index 59e1a21..0b779a4 100644
--- a/funs/Scrape.py
+++ b/funs/Scrape.py
@@ -63,4 +63,55 @@ def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix,
     # save short csv
     tweet_df.to_csv(csv_path, encoding='utf-8')
     # sleep half a second to not get blocked because of excessive requests
-    time.sleep(0.5)
\ No newline at end of file
+    time.sleep(0.5)
+
+def getHandles(di):
+    """Grabs accounts from senators-raw.csv.
+
+    Args:
+        di (str): path to the directory containing senators-raw.csv
+
+    Returns:
+        list: list containing the senator account handles as str
+    """
+    df = pd.read_csv(f"{di}senators-raw.csv")  # read the file only once
+    accounts = df["twitter_handle"].tolist()
+    alt_accounts = df["alt_handle"].dropna().tolist()  # drop empty alt_handle fields
+    accounts.extend(alt_accounts)
+    return accounts
+
+def printHandles(accounts):
+    """Returns a string that lists all accounts in a readable way.
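+
+    Example (illustrative, with two hypothetical handles):
+        printHandles(["SenSanders", "SenWarren"]) returns a string starting
+        with "Accounts to be scraped:", followed by the centred handles and
+        "2 accounts in total.".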
+
+    Args:
+        accounts (list): list of str with handles
+
+    Returns:
+        str: text that can be printed or written to a txt file
+    """
+    txt = ["Accounts to be scraped:\n"]
+    for i, acc in enumerate(accounts):  # print 5 accounts per line
+        txt.append(f"{acc:^17}")  # twitter handle max length = 15 chars
+        if i % 5 == 4:
+            txt.append(" \n")
+    txt.append(f"\n{len(accounts)} accounts in total.")  # len(), not i: i stops at len-1
+    return ''.join(txt)
+
+def scrapeUsers(handle, userDFColumns, maxTweets=1):
+    """Fetches account data of a twitter user by scraping its latest tweet(s).
+
+    Args:
+        handle (str): twitter handle without the leading @
+        userDFColumns (list): user attributes to collect, e.g. "followersCount"
+        maxTweets (int): number of tweets to inspect; 1 is sufficient
+
+    Returns:
+        list: user attribute values, in the order of userDFColumns
+    """
+    currentTime = datetime.now()
+    userList = []
+    # !s formats the datetime as str first; padding a datetime directly would
+    # pass "<30" to strftime and print it literally.
+    print(f'{currentTime!s:<30} Fetching: {handle:>15}')
+    query = f'from:{handle}'
+
+    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
+        if i >= maxTweets:  # >= so that exactly maxTweets tweets are inspected
+            break
+        # Get the user data via getattr (instead of eval) and store it
+        userList = [getattr(tweet.user, col) for col in userDFColumns]
+    return userList
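+
+# Minimal usage sketch (illustrative, not part of the collection pipeline):
+# fetches the profile data of a single account. "SenSanders" is only an
+# example handle; this assumes network access and the module-level snscrape
+# import that scrapeTweets above relies on.
+if __name__ == "__main__":
+    print(scrapeUsers("SenSanders", ["id", "username", "followersCount"]))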