# -*- coding: utf-8 -*-
"""
Created on Thu Jun 23 21:49:11 2023

@author: Michael

collectSenData.py scrapes the Twitter accounts of senators for the following
data: the number of followers, the number of users the account is following,
and how long the account has existed.

# Requirements:
- snscrape 0.6.2.20230321+
- pandas 2.0+

# IMPORTANT:
This script uses snscrape version 0.6.2.20230321.dev50+g0d824ab, which is
included in 'snscrape/' as a git repository for better reproducibility.
Earlier versions of snscrape will most likely fail to scrape all tweets
because of rate limits or other errors that may occur.
Install snscrape from the local git repository to make sure it matches the
version used here. If snscrape shall be installed from the local repository,
uncomment the following lines (they must run after wd is defined below):

import subprocess
os.chdir('snscrape/')
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)

# How to use:
Adjust the working directory (wd), the input/output directories and the
timespan below, then run the script. Results are written to data/OUT/ and
logs to log/.
"""

import os
import pandas as pd
import glob
import time
import sys
from datetime import datetime
import concurrent.futures

###################
# Setup directories

# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# Datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of the file that all senator data will be written to
senCSV = "ALL-SENATORS.csv"

# don't change this one
senCSVPath = wd + ud + senCSV

# Name of logfile
logfile = wd + "log/UserLog_"

###################
# Define timespan & time format

# Format: %Y-%m-%dT%H:%M:%SZ
# (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
ts_end = "2023-01-03T00:00:00Z"  # end of scraping
no_slices = 24  # number of slices / time periods

# file time format
fTimeFormat = "%Y-%m-%d_%H-%M-%S"

# Maximum number of tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000

# Columns for the user dataframe. Parameters for snscrape.modules.twitter.Tweet:
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
# Sub-parameters are addressed just like in 'user', where the user id can be
# obtained via user.id.
userDFColumns = [
    "id",
    "username",
    "followersCount",
    "friendsCount",
    "verified",
    "created",
]

#############################################################################
################## do NOT change anything below this line ###################
#############################################################################

from funs.Scrape import scrapeUsers, getHandles, printHandles
from funs.TimeSlice import convertTime
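
# Note on the helpers imported above (assumptions based on how they are used
# below; the actual implementations live in funs/):
# - getHandles(di) reads the senators datafile(s) in data/IN/ and returns the
#   list of Twitter handles (including alt-accounts) to scrape.
# - printHandles(accounts) returns a printable summary of those handles.
# - scrapeUsers(handle, userDFColumns) scrapes one account and returns one row
#   matching userDFColumns.
# - convertTime(delta) splits a datetime.timedelta into hours, minutes and
#   seconds, roughly like this sketch:
#
#   def convertTime(delta):
#       total = int(delta.total_seconds())
#       return total // 3600, (total % 3600) // 60, total % 60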
###################
# Create logfile & log all outputs
# There are three logfile types to be found in /log; their names should be
# self-explanatory.

logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".log"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".log"
sys.stderr = open(logfileErrors, "w")
sys.stdout = open(logfilen, "w")

###################
# Senator Accounts
# Get accounts & alt-accounts from the senators datafile
accounts = getHandles(di)

# Print accounts to be scraped
print(printHandles(accounts))

###################
# Scraping

# report time:
timeStartScrape = datetime.now()
print("Starting scraping at:")
print(timeStartScrape.strftime(fTimeFormat))
print("---")

# Iterate over each Twitter account using multiprocessing
listUsers = []
with concurrent.futures.ProcessPoolExecutor() as executor:
    # List to store the scraping tasks
    tasks = []
    for handle in accounts:
        # Schedule the scraping task
        task = executor.submit(scrapeUsers, handle, userDFColumns)
        tasks.append(task)

    # Wait for all tasks to complete and retrieve the results
    for task in concurrent.futures.as_completed(tasks):
        result = task.result()
        listUsers.append(result)

dfUsers = pd.DataFrame(listUsers, columns=userDFColumns)
dfUsers.to_csv(senCSVPath, encoding="utf-8")

# report time:
timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime(fTimeFormat))

# Report timing info.
timeEndMerge = datetime.now()
print("---")
print("End of merging at:")
print(timeEndMerge.strftime(fTimeFormat))
print("---")

# calculate times:
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)  # total execution time
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)  # scraping time
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)  # merge time
print(
    f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
)
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
print(listUsers)

# close connection to the logfiles.
sys.stdout.close()
sys.stderr.close()
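
# Optional sanity check (illustrative sketch only, not part of the pipeline).
# Run it separately after the script has finished, since stdout is redirected
# to the logfile and closed above:
#
# df = pd.read_csv(senCSVPath, index_col=0, encoding="utf-8")
# print(df.head())
# print(f"{len(df)} senator accounts scraped.")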