adds senator data scraper

2023-06-23 23:53:31 +02:00
parent 90d5501ec8
commit 71e10a62d3
2 changed files with 218 additions and 1 deletions
@@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jun  23 21:49:11 2023
+
+@author: Michael
+
+collectSenData.py scrapes the Twitter accounts of senators for the following
+data: the number of followers, the number of users the account is following,
+and how long the account has existed.
+
+# Requirements:
+    - snscrape 0.6.2.20230321+
+    - pandas 2.0+
+# IMPORTANT:
+This script uses snscrape Version 0.6.2.20230321.dev50+g0d824ab which is 
+included in 'snscrape/' as a git repository for better reproducibility. Earlier
+versions of snscrape will most likely fail to scrape all tweets because of 
+certain rate limits or other errors that may occur.
+Install snscrape from the local git repo to make sure that it matches the version used.
+If snscrape is to be installed from the local repo, uncomment the following lines:
+
+import subprocess
+os.chdir('snscrape/')
+subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
+os.chdir(wd) 
+
+ 
+# How to use:
+"""
+
+import os
+import pandas as pd
+import glob
+import time
+import sys
+from datetime import datetime
+import concurrent.futures
+
+###################
+# Directory setup
+# Working directory on Michael's machine
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# Working directory on the server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# Input directory for datafiles
+di = "data/IN/"
+
+# Output directory for tweet datafiles
+ud = "data/OUT/"
+
+# File that the data of all senators is written to
+senCSV = "ALL-SENATORS.csv"
+
+# Full output path, derived from the settings above - don't change this one
+senCSVPath = f"{wd}{ud}{senCSV}"
+
+# Path prefix shared by all logfiles
+logfile = f"{wd}log/UserLog_"
+
+###################
+# Timespan & time formats
+# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
+ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
+ts_end = "2023-01-03T00:00:00Z"  # end of scraping
+no_slices = 24  # Number of slices / time periods.
+
+# Time format used in file names
+fTimeFormat = "%Y-%m-%d_%H-%M-%S"
+
+# Upper limit of tweets scraped by snscrape. Can be left untouched.
+maxTweets = 5000
+
+# Columns of the user dataframe. scrapeUsers reads each name as an attribute
+# of `tweet.user`, see the snscrape documentation for the available fields:
+# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
+userDFColumns = [
+    "id", "username",
+    "followersCount", "friendsCount",
+    "verified", "created",
+]
+
+#############################################################################
+################## do NOT change anything below this line ###################
+#############################################################################
+
+#############################################################################
+################## do NOT change anything below this line ###################
+#############################################################################
+
+from funs.Scrape import scrapeUsers, getHandles, printHandles
+from funs.TimeSlice import convertTime
+
+
+###################
+# Main routine
+#   Wrapped in functions and guarded by `if __name__ == "__main__"` because
+#   the worker processes of ProcessPoolExecutor may re-import this module
+#   (spawn / forkserver start methods). Without the guard every worker would
+#   run the whole script again.
+def _runScrape():
+    """Scrape all senator accounts, write the CSV and print timing info.
+
+    Reads the handles from the input directory `di`, scrapes the columns
+    listed in `userDFColumns` for every handle in parallel and writes the
+    collected rows to `senCSVPath`. Accounts whose scrape raised an error
+    or returned no data are reported on stderr and left out of the CSV.
+    """
+    ###################
+    # Senator Accounts
+    # Get accounts & alt-accounts from Senators-Datafile
+    accounts = getHandles(di)
+
+    # Print accounts to be scraped
+    print(printHandles(accounts))
+
+    ###################
+    # Scraping
+    # report time:
+    timeStartScrape = datetime.now()
+    print("Starting scraping at:")
+    print(timeStartScrape.strftime(fTimeFormat))
+    print("---")
+
+    # Scrape each Twitter account in its own task using multiprocessing
+    listUsers = []
+    with concurrent.futures.ProcessPoolExecutor() as executor:
+        # Map every scheduled task to its handle so failures can be reported
+        tasks = {
+            executor.submit(scrapeUsers, handle, userDFColumns): handle
+            for handle in accounts
+        }
+
+        # Collect the results in the order in which the tasks finish
+        for task in concurrent.futures.as_completed(tasks):
+            handle = tasks[task]
+            try:
+                result = task.result()
+            except Exception as err:
+                # One failing account must not discard all other results.
+                print(f"Scraping {handle} failed: {err!r}", file=sys.stderr)
+                continue
+            if not result:
+                # scrapeUsers returns an empty list if no tweet was found.
+                print(f"No user data found for {handle}", file=sys.stderr)
+                continue
+            listUsers.append(result)
+
+    # report time:
+    timeEndScrape = datetime.now()
+    print("---")
+    print("End of scraping at:")
+    print(timeEndScrape.strftime(fTimeFormat))
+
+    # Merge the rows of all accounts into one dataframe and save it
+    dfUsers = pd.DataFrame(listUsers, columns=userDFColumns)
+    dfUsers.to_csv(senCSVPath, encoding='utf-8')
+
+    # Report timing info.
+    timeEndMerge = datetime.now()
+    print("---")
+    print("End of merging at:")
+    print(timeEndMerge.strftime(fTimeFormat))
+    print("---")
+    # calculate times:
+    tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # total execution time
+    tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) # scraping time
+    tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) # merge time
+    print(
+        f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
+    )
+    print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
+    print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
+
+    print(listUsers)
+
+
+def main():
+    """Run the scraper with stdout and stderr redirected to logfiles.
+
+    Two logfiles are created from the `logfile` prefix: one for regular
+    output and one (suffix "_err") for errors. Both streams are restored
+    and the logfiles closed when the run ends, also if it ends with an error.
+    """
+    # One timestamp for both logfiles so that their names always match.
+    logStamp = datetime.now().strftime(fTimeFormat)
+    logfilen = logfile + logStamp + ".log"
+    logfileErrors = logfile + logStamp + "_err" + ".log"
+
+    prevStdout, prevStderr = sys.stdout, sys.stderr
+    with open(logfilen, "w") as outLog, open(logfileErrors, "w") as errLog:
+        sys.stdout, sys.stderr = outLog, errLog
+        try:
+            _runScrape()
+        finally:
+            # Restore the streams before the logfiles are closed so that
+            # nothing is ever written to a closed file.
+            sys.stdout, sys.stderr = prevStdout, prevStderr
+
+
+if __name__ == "__main__":
+    main()
@@ -63,4 +63,55 @@ def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix,
    # save short csv
    tweet_df.to_csv(csv_path, encoding='utf-8')
    # sleep 1 second to not get blocked because of excessive requests
-    time.sleep(0.5)
+    time.sleep(0.5)
+
+def getHandles(di):
+    """grabs accounts from senators-raw.csv
+
+    The file is read once. Handles are read as strings, empty fields are
+    dropped from both columns, and the alternative handles are appended
+    after the primary handles.
+
+    Args:
+        di (str): path to the directory containing senators-raw.csv,
+            including the trailing path separator
+
+    Returns:
+        list: list containing str of senator account handles
+    """
+    senators = pd.read_csv(
+        f"{di}senators-raw.csv",
+        usecols=["twitter_handle", "alt_handle"],
+        dtype=str,  # keep handles that look like numbers as text
+    )
+    # dropna() removes empty fields, which pandas reads as NaN
+    accounts = senators["twitter_handle"].dropna().tolist()
+    accounts.extend(senators["alt_handle"].dropna().tolist())
+    return accounts
+
+def printHandles(accounts):
+    """returns string with all accounts in a readable way.
+
+    The handles are laid out in rows of five, each centered in a field of
+    17 characters, followed by a line stating the total number of accounts.
+
+    Args:
+        accounts (list): list of str with handles
+
+    Returns:
+        str: containing text that can be written to txtfile
+    """
+    txt = ["Accounts to be scraped:\n"]
+    for i, acc in enumerate(accounts): # print 5 accounts per line
+        txt.append(f"{acc:^17}") # twitter handle max length = 15 chars
+        if i % 5 == 4:
+            txt.append(" \n")
+    # len() gives the real total; the last loop index is one too small and
+    # is not defined at all for an empty list.
+    txt.append(f"\n{len(accounts)} accounts in total.")
+    return ''.join(txt)
+
+def scrapeUsers(handle, userDFColumns, maxTweets=1):
+    """fetches the user data of an account from its tweets.
+
+    Searches for tweets sent from `handle` and reads the requested
+    attributes from the `user` object attached to a tweet. At most
+    `maxTweets` tweets are looked at; the values of the last one are returned.
+
+    Args:
+        handle (str): twitter handle of the account
+        userDFColumns (list): names of the attributes to read from tweet.user
+        maxTweets (int, optional): maximum number of tweets to inspect.
+            Defaults to 1.
+
+    Returns:
+        list: one value per entry of userDFColumns, in the same order;
+            empty list if no tweet was found
+    """
+    currentTime = datetime.now()
+    userList = []
+    # !s converts to str first: a format spec applied to a datetime is
+    # treated as a strftime pattern, so '<30' alone would be printed literally.
+    print(f'{currentTime!s:<30} Fetching: {handle:>15}')
+    query = f'from:{handle}'
+
+    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
+        if i >= maxTweets:
+            break
+        # Read the requested attributes by name; getattr avoids evaluating
+        # strings as code.
+        userList = [getattr(tweet.user, col) for col in userDFColumns]
+
+    return userList