adds senator data scraper
This commit is contained in:
parent
90d5501ec8
commit
71e10a62d3
166
collectSenData.py
Normal file
@@ -0,0 +1,166 @@
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 23 21:49:11 2023

@author: Michael

collectSenData.py scrapes the accounts of senators for the following data: the
number of followers, the number of users the twitter account is following,
and how long the twitter account has existed.

# Requirements:
- snscrape 0.6.2.20230321+
- pandas 2.0+

# IMPORTANT:
This script uses snscrape version 0.6.2.20230321.dev50+g0d824ab, which is
included in 'snscrape/' as a git repository for better reproducibility. Earlier
versions of snscrape will most likely fail to scrape all tweets because of
certain rate limits or other errors that may occur.
Install snscrape from the local git repository to make sure it matches the
version used here. If snscrape shall be installed from the local repository,
uncomment the following lines:

import subprocess
os.chdir('snscrape/')
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)

# How to use:
Adjust the directory settings below (wd, di, ud), place 'senators-raw.csv' in
the input directory, and run the script. The collected account data is written
to ALL-SENATORS.csv in the output directory; output and errors are logged to
files in 'log/'.
"""

import os
import pandas as pd
import glob
import time
import sys
from datetime import datetime
import concurrent.futures

###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS.csv"

# don't change this one
senCSVPath = wd + ud + senCSV

# Name of logfile
logfile = wd + "log/UserLog_"

###################
# Define Timespan & time-format
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
ts_end = "2023-01-03T00:00:00Z"  # end of scraping
no_slices = 24  # Number of slices / time periods.

# file time format
fTimeFormat = "%Y-%m-%d_%H-%M-%S"

# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000

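# Note (added): ts_beg, ts_end, no_slices and the module-level maxTweets do not
# appear to be used anywhere in this script; they seem to be carried over from
# the tweet-collection setup and can likely be left untouched or removed.
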
# Columns for the user dataframe. Parameters of the snscrape user object; see
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
# Sub-parameters are accessed on the user object, e.g. the user id is obtained via user.id.
userDFColumns = [
    "id",
    "username",
    "followersCount",
    "friendsCount",
    "verified",
    "created"
]

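# Illustrative sketch (added): how these column names map onto attributes of the
# scraped user object; this mirrors what scrapeUsers() in funs/Scrape.py does.
# The handle below is hypothetical and sntwitter stands for snscrape.modules.twitter:
#
#   import snscrape.modules.twitter as sntwitter
#   tweet = next(sntwitter.TwitterSearchScraper("from:SenExampleHandle").get_items())
#   row = [getattr(tweet.user, col) for col in userDFColumns]
#   # row -> [id, username, followersCount, friendsCount, verified, created]
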
#############################################################################
################## do NOT change anything below this line ###################
#############################################################################

from funs.Scrape import scrapeUsers, getHandles, printHandles
from funs.TimeSlice import convertTime


###################
# Create logfile & log all outputs.
# There are three logfile types to be found in /log; they should be self-explanatory.
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".log"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".log"
sys.stderr = open(logfileErrors, "w")
sys.stdout = open(logfilen, "w")

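# Note (added): from this point on, print() output goes to the .log file and
# tracebacks go to the _err.log file rather than to the console. A minimal
# sketch to restore the original streams if needed (not part of the original
# script):
#
#   sys.stdout, sys.stderr = sys.__stdout__, sys.__stderr__
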
###################
# Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = getHandles(di)

# Print accounts to be scraped
print(printHandles(accounts))

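# Note (added): getHandles() is defined in funs/Scrape.py (second hunk of this
# commit); it reads both the 'twitter_handle' and 'alt_handle' columns of
# senators-raw.csv from the input directory.
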
###################
# Scraping
# report time:
timeStartScrape = datetime.now()
print("Starting scraping at:")
print(timeStartScrape.strftime(fTimeFormat))
print("---")

# Iterate over each Twitter account using multiprocessing
listUsers = []
with concurrent.futures.ProcessPoolExecutor() as executor:
    # List to store the scraping tasks
    tasks = []
    for handle in accounts:
        # Schedule the scraping task
        task = executor.submit(
            scrapeUsers, handle, userDFColumns
        )
        tasks.append(task)

    # Wait for all tasks to complete and retrieve results
    for task in concurrent.futures.as_completed(tasks):
        result = task.result()
        listUsers.append(result)

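# Note (added): as_completed() yields tasks in completion order, not submission
# order, so rows in the CSV below may not follow the order of `accounts`. If
# input order matters, a minimal alternative sketch (placed inside the `with`
# block, not part of the original script) would be:
#
#   listUsers = list(executor.map(scrapeUsers, accounts,
#                                 [userDFColumns] * len(accounts)))
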
dfUsers = pd.DataFrame(listUsers, columns=userDFColumns)
dfUsers.to_csv(senCSVPath, encoding='utf-8')

# report time:
timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime(fTimeFormat))

# Report timing info.
timeEndMerge = datetime.now()
print("---")
print("End of script at:")
print(timeEndMerge.strftime(fTimeFormat))
print("---")
# calculate times:
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)  # total execution time
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)  # scraping time
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)  # merge time
print(
    f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
)
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")

print(listUsers)
# close connection to logfiles.
sys.stdout.close()
sys.stderr.close()

@@ -63,4 +63,55 @@ def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix,
    # save short csv
    tweet_df.to_csv(csv_path, encoding='utf-8')
    # sleep 0.5 seconds to avoid being blocked because of excessive requests
    time.sleep(0.5)


def getHandles(di):
    """grabs accounts from senators-raw.csv

    Args:
        di (str): path to the directory containing senators-raw.csv

    Returns:
        list: list containing str of senator account handles
    """
    accounts = pd.read_csv(f"{di}senators-raw.csv")["twitter_handle"].tolist()
    alt_accounts = pd.read_csv(f"{di}senators-raw.csv")["alt_handle"].tolist()
    alt_accounts = [x for x in alt_accounts if str(x) != 'nan']  # remove empty alt_handle fields
    accounts.extend(alt_accounts)
    return accounts

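# Illustrative sketch (added): getHandles() assumes senators-raw.csv provides at
# least the two columns read above. A hypothetical two-row example of that
# layout (handles invented for illustration):
#
#   twitter_handle,alt_handle
#   SenExampleOne,SenExampleOneAlt
#   SenExampleTwo,
#
# Rows with an empty alt_handle are read as NaN and dropped by the filter above.
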
def printHandles(accounts):
    """returns a string listing all accounts in a readable way.

    Args:
        accounts (list): list of str with handles

    Returns:
        str: text that can be written to a txt file
    """
    txt = ["Accounts to be scraped:\n"]
    for i, acc in enumerate(accounts):  # print 5 accounts per line
        txt.append(f"{acc:^17}")  # twitter handle max length = 15 chars
        if i % 5 == 4:
            txt.append(" \n")
    txt.append(f"\n{i + 1} accounts in total.")  # i is zero-based, hence i + 1
    return ''.join(txt)

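# Usage sketch (added), with hypothetical handles; the output looks roughly like:
#
#   print(printHandles(["SenExampleOne", "SenExampleTwo"]))
#   # Accounts to be scraped:
#   #   SenExampleOne     SenExampleTwo
#   # 2 accounts in total.
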
def scrapeUsers(handle, userDFColumns, maxTweets=1):
    # sntwitter refers to snscrape.modules.twitter, which is assumed to be
    # imported at the top of this module (not shown in this hunk).
    currentTime = datetime.now()
    userList = []
    print(f'{currentTime:<30} Fetching: {handle:>15}')
    query = f'from:{handle}'

    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        if i > maxTweets:
            break
        # Get the user data of this tweet and collect it in userList; the user
        # fields are identical for every tweet of the handle, so keeping only
        # the last iteration's values is fine.
        userList = []
        for col in userDFColumns:
            singleUser = getattr(tweet.user, col)  # equivalent to the original eval(f'tweet.user.{col}'), but safer
            userList.append(singleUser)

    # Creating the dataframe from userList and userDFColumns is left to the caller:
    # df = pd.DataFrame(userList, columns=userDFColumns)
    return userList
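
# Usage sketch (added): this mirrors how collectSenData.py consumes
# scrapeUsers(); the handle is hypothetical, the column names are the ones
# defined in that script:
#
#   row = scrapeUsers("SenExampleOne",
#                     ["id", "username", "followersCount",
#                      "friendsCount", "verified", "created"])
#   # row is a flat list with one value per requested user attribute.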