adds docstrings to functions. adds several comments.

This commit is contained in:
Michael Beck 2023-06-23 20:26:16 +02:00
parent e8ba02ca0f
commit dc2e17cc2f
3 changed files with 100 additions and 44 deletions

View File

@@ -4,6 +4,8 @@ Created on Thu Jun 8 01:08:21 2023
@author: Michael
# https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
Following files are necessary:
config.py
Used to configure everything that's needed for this script.
@@ -60,7 +62,8 @@ import sys
from datetime import datetime
import concurrent.futures
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
@@ -72,9 +75,14 @@ td = "data/tweets/"
# Name of file that all tweets will be written to
file_alltweets = "ALL-SENATORS-TWEETS.csv"
# don't change this one
path_to_tweetdfs = wd + td
# Name of logfile
logfile = wd+"log/log_"
###################
# Define Timespan & time-format
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = "2020-01-01T00:00:00Z" # start of scraping
ts_end = "2023-01-03T00:00:00Z" # end of scraping
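As a quick illustration of the %Y-%m-%dT%H:%M:%SZ format codes linked above, such a timestamp parses and formats like this (a minimal sketch, not taken from the repo):

from datetime import datetime

ts = datetime.strptime(ts_beg, "%Y-%m-%dT%H:%M:%SZ")  # "2020-01-01T00:00:00Z" -> datetime(2020, 1, 1, 0, 0)
print(ts.strftime("%Y-%m-%dT%H:%M:%SZ"))              # back to "2020-01-01T00:00:00Z"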
@@ -86,10 +94,8 @@ fTimeFormat = "%Y-%m-%d_%H-%M-%S"
# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000
###################
# Install snscrape from local git repo to make sure that it fits the used version.
# If snscrape is already installed, uncomment the following lines:
"""
import subprocess
@@ -98,7 +104,9 @@ subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)
"""
# Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
# sub-parameters are accessed with dot notation, e.g. the user id is obtained via user.id
tweetDFColumns = [
    "id",
    "user.id",
@@ -135,18 +143,22 @@ tweetDFColumns = [
    "source",
]
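The dotted entries such as "user.id" name nested Tweet attributes. How the scraping code resolves them is not shown in this diff; a minimal sketch of the idea (the helper name is hypothetical) could look like this:

from functools import reduce

def resolveColumn(tweet, column):
    # hypothetical helper: walk a dotted name like "user.id" attribute by attribute,
    # i.e. getattr(getattr(tweet, "user"), "id"); returns None if a step is missing
    return reduce(lambda obj, attr: getattr(obj, attr, None) if obj is not None else None,
                  column.split("."), tweet)

# row = [resolveColumn(tweet, col) for col in tweetDFColumns]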
## Import functions
from funs.TimeSlice import *
from funs.ClearDupes import deDupe
from funs.Scrape import scrapeTweets
###################
# Create logfile & log all outputs
# There are three logfile types to be found in /log; they should be self-explanatory.
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
sys.stderr = open(logfileErrors, "w")
sys.stdout = open(logfilen, "w")
###################
# Create List of time-period-slices
time_slices = get_Tslices(ts_beg, ts_end, no_slices)
# Print slices
print("Time-period-slices:")
@@ -154,7 +166,9 @@ for slice in time_slices:
    print(slice["suffix"] + ": " + slice["beg_time"] + " - " + slice["end_time"])
print("---")
###################
# Keywords
# read keywords from a file and write to list.
keywords = []
# Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
deDupe("data/keywords-raw.txt", "data/keywords.txt")
@@ -166,7 +180,8 @@ with open("data/keywords.txt", "r") as file:
        keywords.append(keyword)
print("---")
###################
# Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
@@ -181,43 +196,50 @@ for i, acc in enumerate(accounts): # print 5 accounts per line
        print("\n")
print(f"\n{i} accounts in total.\n---")
###################
# Scraping
# report time:
timeStartScrape = datetime.now()
print("Starting scraping at:")
print(timeStartScrape.strftime(fTimeFormat))
print("---")
# Iterate over each Twitter account using multiprocessing
with concurrent.futures.ProcessPoolExecutor() as executor:
    # List to store the scraping tasks
    tasks = []
    for handle in accounts:
        # Iterate over each time slice
        for slice_data in time_slices:
            # ... Code to prepare the slice_data ...
            # Schedule the scraping task
            task = executor.submit(
                scrapeTweets, handle, keywords, td, tweetDFColumns, slice_data['beg_time'], slice_data['end_time'], slice_data['suffix']
            )
            # Store the handle and slice_data as attributes of the task
            tasks.append(task)  # keep the future so it can be waited on below
    # Wait for all tasks to complete
    concurrent.futures.wait(tasks)
# report time:
timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime(fTimeFormat))
###################
# Merge CSV-Files to file_alltweets.
# fastest way is to save the slices separately and then add every file to the
# output instead of using pandas or anything else.
os.chdir(path_to_tweetdfs) # change dir to use glob to get list of csv-files in dir
## First, check whether all slices are present.
tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv")
# Create list of all files that should be in the folder:
AllFilesList = []
for handle in accounts:
    for tslice in time_slices:
        suffix = tslice['suffix']
        AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
# report missing files to "log_*_missing.txt"
with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.txt", "w") as fout:
    for file in AllFilesList:
        if file not in tweetfiles:
@@ -225,8 +247,10 @@ with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.txt", "w"
    else:
        fout.write('all slices scraped.')
# check if file_alltweets (previously scraped tweets that have been merged
# into one file) exists in tweetfiles list, if it exists, remove from list
# to not include it in the following merge
if file_alltweets in tweetfiles:
    tweetfiles.remove(file_alltweets)
# Go through all csv files and merge them into file_alltweets
@@ -240,21 +264,24 @@ if tweetfiles:
            with open(file, "rb") as f:
                next(f)  # skip the header
                fout.write(f.read())
os.chdir(wd) # go back to wd
# Report timing info.
timeEndMerge = datetime.now()
print("---")
print("End of merging at:")
print(timeEndMerge.strftime(fTimeFormat))
print("---") print("---")
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # calulate times:
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # total execution time
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) # scraping time
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) # merge time
print( print(
f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds" f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
) )
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds") print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds") print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
# close connection to logfiles.
sys.stdout.close() sys.stdout.close()
sys.stderr.close() sys.stderr.close()

View File

@@ -3,13 +3,22 @@ import time
import pandas as pd
import snscrape.modules.twitter as sntwitter
def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix, maxTweets = 5000):
    """Scrapes tweets from a specific account in a specific time span using snscrape.modules.twitter.

    Args:
        handle (str): twitter handle of the account to be scraped
        keywords (list): list of strings containing the keywords that the tweets shall be searched for
        td (str): tweet file output path
        tweetDFColumns (list): columns for the tweet dataframe; parameters of snscrape.modules.twitter.Tweet
        ts_beg (str): scrape from this point in time, format YYYY-MM-DDTHH:MM:SSZ (%Y-%m-%dT%H:%M:%SZ, see https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
        ts_end (str): scrape until this point in time, format YYYY-MM-DDTHH:MM:SSZ (%Y-%m-%dT%H:%M:%SZ, see https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
        suffix (str): suffix added to the filename after the handle. Example: suffix "-slice1" with handle "handle" produces the file "Tweets-handle-slice1.csv"
        maxTweets (int, optional): maximum number of tweets to be scraped. Defaults to 5000.
    """
    i = 0
    currentTime = datetime.now()
    tweetDataFilePath = td + f"Tweets-{handle}{suffix}.csv"
    # create empty tweetlist that will be filled with tweets of current sen
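With the new signature, a single direct call (outside the executor) could look like this; the handle, keywords and slice values are made-up examples:

scrapeTweets("ExampleSenator", ["climate", "energy"], "data/tweets/", tweetDFColumns,
             "2020-01-01T00:00:00Z", "2020-02-15T12:00:00Z", "-slice1")
# -> writes data/tweets/Tweets-ExampleSenator-slice1.csv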

View File

@@ -8,6 +8,16 @@ Created on Wed Jun 21 13:58:42 2023
# create slices
def get_Tslices(ts_beg, ts_end, no_slices):
    """Splits the time period between two points in time into no_slices slices and returns the start and end time of each slice period.

    Args:
        ts_beg (str): start of the overall period to be sliced, format %Y-%m-%dT%H:%M:%SZ
        ts_end (str): end of the overall period to be sliced, format %Y-%m-%dT%H:%M:%SZ
        no_slices (int): number of slices, e.g. 24 will produce 24 start and end dates each.

    Returns:
        list[dict[str, datetime|str]]: one dict per slice, containing 'beg_time', 'end_time' and 'suffix' (e.g. '-slice1')
    """
    from datetime import datetime
    from datetime import timedelta
    ts_beg = datetime.strptime(ts_beg, '%Y-%m-%dT%H:%M:%SZ')
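For illustration, calling the function as the main script does returns a structure of this shape (the exact cut points, and whether the times come back as datetime objects or formatted strings, depend on the part of the implementation not shown here):

slices = get_Tslices("2020-01-01T00:00:00Z", "2023-01-03T00:00:00Z", 24)
# slices[0]  -> {'beg_time': ..., 'end_time': ..., 'suffix': '-slice1'}
# slices[23] -> {'beg_time': ..., 'end_time': ..., 'suffix': '-slice24'}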
@@ -25,6 +35,16 @@ def get_Tslices(ts_beg, ts_end, no_slices):
# For log time conversions (seconds to days, hours, minutes)
def convertTime(duration):
    """Converts a timedelta duration to hours, minutes and seconds.

    Args:
        duration (timedelta): duration to be converted

    Returns:
        int: hours
        int: minutes
        int: seconds
    """
    days, seconds = duration.days, duration.seconds
    hours = days * 24 + seconds // 3600
    minutes = (seconds % 3600) // 60
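Assuming the function goes on to return (hours, minutes, seconds) as the docstring states, it would be used like this:

from datetime import timedelta

hours, minutes, seconds = convertTime(timedelta(hours=2, minutes=5, seconds=30))
# -> 2, 5, 30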