adds senator data scraper
This commit is contained in:
parent
90d5501ec8
commit
71e10a62d3
166
collectSenData.py
Normal file
@@ -0,0 +1,166 @@
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 23 21:49:11 2023

@author: Michael

collectSenData.py scrapes the accounts of senators for the following data: the
number of followers, the number of users the twitter account is following,
and how long the twitter account has existed.

# Requirements:
- snscrape 0.6.2.20230321+
- pandas 2.0+

# IMPORTANT:
This script uses snscrape version 0.6.2.20230321.dev50+g0d824ab, which is
included in 'snscrape/' as a git repository for better reproducibility. Earlier
versions of snscrape will most likely fail to scrape all tweets because of
certain rate limits or other errors that may occur.
Install snscrape from the local git repository to make sure it matches the
version used here. If snscrape shall be installed from the local repository,
uncomment the following lines:

import subprocess
os.chdir('snscrape/')
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)

# How to use:
Adjust the directory settings below (wd, di, ud), place 'senators-raw.csv' in
the input directory, and run the script. The collected account data is written
to ALL-SENATORS.csv in the output directory; output and errors are logged to
files in 'log/'.
"""

import os
import pandas as pd
import glob
import time
import sys
from datetime import datetime
import concurrent.futures

###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS.csv"

# don't change this one
senCSVPath = wd + ud + senCSV

# Name of logfile
logfile = wd + "log/UserLog_"

###################
# Define Timespan & time-format
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
ts_end = "2023-01-03T00:00:00Z"  # end of scraping
no_slices = 24  # Number of slices / time periods.

# file time format
fTimeFormat = "%Y-%m-%d_%H-%M-%S"

# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000

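# Note (added): ts_beg, ts_end, no_slices and the module-level maxTweets do not
# appear to be used anywhere in this script; they seem to be carried over from
# the tweet-collection setup and can likely be left untouched or removed.
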
# Columns for the user dataframe. Parameters of the snscrape user object; see
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
# Sub-parameters are accessed on the user object, e.g. the user id is obtained via user.id.
userDFColumns = [
    "id",
    "username",
    "followersCount",
    "friendsCount",
    "verified",
    "created"
]

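# Illustrative sketch (added): how these column names map onto attributes of the
# scraped user object; this mirrors what scrapeUsers() in funs/Scrape.py does.
# The handle below is hypothetical and sntwitter stands for snscrape.modules.twitter:
#
#   import snscrape.modules.twitter as sntwitter
#   tweet = next(sntwitter.TwitterSearchScraper("from:SenExampleHandle").get_items())
#   row = [getattr(tweet.user, col) for col in userDFColumns]
#   # row -> [id, username, followersCount, friendsCount, verified, created]
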
#############################################################################
################## do NOT change anything below this line ###################
#############################################################################

from funs.Scrape import scrapeUsers, getHandles, printHandles
from funs.TimeSlice import convertTime


###################
# Create logfile & log all outputs.
# There are three logfile types to be found in /log; they should be self-explanatory.
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".log"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".log"
sys.stderr = open(logfileErrors, "w")
sys.stdout = open(logfilen, "w")

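# Note (added): from this point on, print() output goes to the .log file and
# tracebacks go to the _err.log file rather than to the console. A minimal
# sketch to restore the original streams if needed (not part of the original
# script):
#
#   sys.stdout, sys.stderr = sys.__stdout__, sys.__stderr__
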
###################
# Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = getHandles(di)

# Print accounts to be scraped
print(printHandles(accounts))

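# Note (added): getHandles() is defined in funs/Scrape.py (second hunk of this
# commit); it reads both the 'twitter_handle' and 'alt_handle' columns of
# senators-raw.csv from the input directory.
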
###################
# Scraping
# report time:
timeStartScrape = datetime.now()
print("Starting scraping at:")
print(timeStartScrape.strftime(fTimeFormat))
print("---")

# Iterate over each Twitter account using multiprocessing
listUsers = []
with concurrent.futures.ProcessPoolExecutor() as executor:
    # List to store the scraping tasks
    tasks = []
    for handle in accounts:
        # Schedule the scraping task
        task = executor.submit(
            scrapeUsers, handle, userDFColumns
        )
        tasks.append(task)

    # Wait for all tasks to complete and retrieve results
    for task in concurrent.futures.as_completed(tasks):
        result = task.result()
        listUsers.append(result)

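# Note (added): as_completed() yields tasks in completion order, not submission
# order, so rows in the CSV below may not follow the order of `accounts`. If
# input order matters, a minimal alternative sketch (placed inside the `with`
# block, not part of the original script) would be:
#
#   listUsers = list(executor.map(scrapeUsers, accounts,
#                                 [userDFColumns] * len(accounts)))
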
dfUsers = pd.DataFrame(listUsers, columns=userDFColumns)
dfUsers.to_csv(senCSVPath, encoding='utf-8')

# report time:
timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime(fTimeFormat))

# Report timing info.
timeEndMerge = datetime.now()
print("---")
print("End of script at:")
print(timeEndMerge.strftime(fTimeFormat))
print("---")
# calculate times:
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)  # total execution time
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)  # scraping time
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)  # merge time
print(
    f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
)
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")

print(listUsers)
# close connection to logfiles.
sys.stdout.close()
sys.stderr.close()

@@ -63,4 +63,55 @@ def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix,
    # save short csv
    tweet_df.to_csv(csv_path, encoding='utf-8')
    # sleep 0.5 seconds to avoid being blocked because of excessive requests
    time.sleep(0.5)


def getHandles(di):
    """grabs accounts from senators-raw.csv

    Args:
        di (str): path to the directory containing senators-raw.csv

    Returns:
        list: list containing str of senator account handles
    """
    accounts = pd.read_csv(f"{di}senators-raw.csv")["twitter_handle"].tolist()
    alt_accounts = pd.read_csv(f"{di}senators-raw.csv")["alt_handle"].tolist()
    alt_accounts = [x for x in alt_accounts if str(x) != 'nan']  # remove empty alt_handle fields
    accounts.extend(alt_accounts)
    return accounts

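# Illustrative sketch (added): getHandles() assumes senators-raw.csv provides at
# least the two columns read above. A hypothetical two-row example of that
# layout (handles invented for illustration):
#
#   twitter_handle,alt_handle
#   SenExampleOne,SenExampleOneAlt
#   SenExampleTwo,
#
# Rows with an empty alt_handle are read as NaN and dropped by the filter above.
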
def printHandles(accounts):
    """returns a string listing all accounts in a readable way.

    Args:
        accounts (list): list of str with handles

    Returns:
        str: text that can be written to a txt file
    """
    txt = ["Accounts to be scraped:\n"]
    for i, acc in enumerate(accounts):  # print 5 accounts per line
        txt.append(f"{acc:^17}")  # twitter handle max length = 15 chars
        if i % 5 == 4:
            txt.append(" \n")
    txt.append(f"\n{i + 1} accounts in total.")  # i is zero-based, hence i + 1
    return ''.join(txt)

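# Usage sketch (added), with hypothetical handles; the output looks roughly like:
#
#   print(printHandles(["SenExampleOne", "SenExampleTwo"]))
#   # Accounts to be scraped:
#   #   SenExampleOne     SenExampleTwo
#   # 2 accounts in total.
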
def scrapeUsers(handle, userDFColumns, maxTweets=1):
    # sntwitter refers to snscrape.modules.twitter, which is assumed to be
    # imported at the top of this module (not shown in this hunk).
    currentTime = datetime.now()
    userList = []
    print(f'{currentTime:<30} Fetching: {handle:>15}')
    query = f'from:{handle}'

    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        if i > maxTweets:
            break
        # Get the user data of this tweet and collect it in userList; the user
        # fields are identical for every tweet of the handle, so keeping only
        # the last iteration's values is fine.
        userList = []
        for col in userDFColumns:
            singleUser = getattr(tweet.user, col)  # equivalent to the original eval(f'tweet.user.{col}'), but safer
            userList.append(singleUser)

    # Creating the dataframe from userList and userDFColumns is left to the caller:
    # df = pd.DataFrame(userList, columns=userDFColumns)
    return userList
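
# Usage sketch (added): this mirrors how collectSenData.py consumes
# scrapeUsers(); the handle is hypothetical, the column names are the ones
# defined in that script:
#
#   row = scrapeUsers("SenExampleOne",
#                     ["id", "username", "followersCount",
#                      "friendsCount", "verified", "created"])
#   # row is a flat list with one value per requested user attribute.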