From dc2e17cc2f4c7859cfc23098d7cd1aa05dab555f Mon Sep 17 00:00:00 2001
From: Michael Beck
Date: Fri, 23 Jun 2023 20:26:16 +0200
Subject: [PATCH] Add docstrings to functions and several explanatory comments

---
 collect.py        | 107 +++++++++++++++++++++++++++++-----------------
 funs/Scrape.py    |  17 ++++--
 funs/TimeSlice.py |  20 +++++++++
 3 files changed, 100 insertions(+), 44 deletions(-)

diff --git a/collect.py b/collect.py
index 4241891..e5ff1a1 100644
--- a/collect.py
+++ b/collect.py
@@ -4,6 +4,8 @@ Created on Thu Jun 8 01:08:21 2023
 
 @author: Michael
 
+# https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
+
 Following files are necessary:
     config.py
         Used to configure everything that's needed for this script.
@@ -60,7 +62,8 @@ import sys
 from datetime import datetime
 import concurrent.futures
 
-## Setup directories
+###################
+# Setup directories
 # WD Michael
 wd = "/home/michael/Documents/PS/Data/collectTweets/"
 # WD Server
@@ -72,9 +75,14 @@ td = "data/tweets/"
 # Name of file that all tweets will be written to
 file_alltweets = "ALL-SENATORS-TWEETS.csv"
 
+# don't change this one
 path_to_tweetdfs = wd + td
 
-## Define Timespan
+# Name of logfile
+logfile = wd+"log/log_"
+
+###################
+# Define timespan & time format
 # Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
 ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
-ts_end = "2023-01-03T00:00:00Z"  # end of straping
+ts_end = "2023-01-03T00:00:00Z"  # end of scraping
@@ -86,10 +94,8 @@ fTimeFormat = "%Y-%m-%d_%H-%M-%S"
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 
-# Name of logfile
-logfile = wd+"log/log_"
-
-## Install snscrape from local git repo to make shure that it fits the used version.
+###################
+# Install snscrape from the local git repo to make sure it fits the version in use.
 # If snscrape is already installed, uncomment the following lines:
 """
 import subprocess
@@ -98,7 +104,9 @@ subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
 os.chdir(wd)
 """
 
-# Columns for tweet dataframe
+# Columns for the tweet dataframe. Parameters of snscrape.modules.twitter.Tweet:
+# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
+# Nested attributes use dot notation, e.g. the user id is available as "user.id".
 tweetDFColumns = [
     "id",
     "user.id",
@@ -135,18 +143,22 @@ tweetDFColumns = [
     "source",
 ]
 
-## Import other files
+## Import functions
 from funs.TimeSlice import *
 from funs.ClearDupes import deDupe
 from funs.Scrape import scrapeTweets
 
-# create logfile & log all outputs
+###################
+# Create logfiles & log all outputs.
+# There are three logfile types to be found in /log;
+# their names should be self-explanatory.
 logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
 logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
 sys.stderr = open(logfileErrors, "w")
 sys.stdout = open(logfilen, "w")
 
-## Create List of time-period-slices
+###################
+# Create list of time-period slices
 time_slices = get_Tslices(ts_beg, ts_end, no_slices)
 # Print slices
 print("Time-period-slices:")
@@ -154,7 +166,9 @@ for slice in time_slices:
     print(slice["suffix"] + ": " + slice["beg_time"] + " - " + slice["end_time"])
 print("---")
 
-## Keywords
+###################
+# Keywords
+# Read keywords from a file and write them to a list.
 keywords = []
 # Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
 deDupe("data/keywords-raw.txt", "data/keywords.txt")
@@ -166,7 +180,8 @@ with open("data/keywords.txt", "r") as file:
         keywords.append(keyword)
 print("---")
 
-## Senator Accounts
+###################
+# Senator Accounts
 # Get accounts & alt-accounts from Senators-Datafile
 accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
 alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
@@ -181,43 +196,50 @@ for i, acc in enumerate(accounts):  # print 5 accounts per line
     print("\n")
 print(f"\n{i} accounts in total.\n---")
 
-## Scraping
+###################
+# Scraping
+# report time:
 timeStartScrape = datetime.now()
 print("Starting scraping at:")
 print(timeStartScrape.strftime(fTimeFormat))
 print("---")
 
 # Iterate over each Twitter account using multiprocessing
-# with concurrent.futures.ProcessPoolExecutor() as executor:
-#     # List to store the scraping tasks
-#     tasks = []
-#     for handle in accounts:
-#         # Iterate over each time slice
-#         for slice_data in time_slices:
-#             # ... Code to prepare the slice_data ...
-#             # Schedule the scraping task
-#             task = executor.submit(
-#                 scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns
-#             )
-#             # Store the handle and slice_data as attributes of the task
-#     # Wait for all tasks to complete
-#     concurrent.futures.wait(tasks)
-
+with concurrent.futures.ProcessPoolExecutor() as executor:
+    # List to store the scraping tasks
+    tasks = []
+    for handle in accounts:
+        # Iterate over each time slice
+        for slice_data in time_slices:
+            # Schedule the scraping task
+            task = executor.submit(
+                scrapeTweets, handle, keywords, td, tweetDFColumns, slice_data['beg_time'], slice_data['end_time'], slice_data['suffix']
+            )
+            # Collect the task so that it can be awaited below
+            tasks.append(task)
+    # Wait for all tasks to complete
+    concurrent.futures.wait(tasks)
+
+# report time:
 timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
 print(timeEndScrape.strftime(fTimeFormat))
 
-## Merge CSV-Files to file_alltweets.
-# fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
-os.chdir(path_to_tweetdfs)
-# At first check, whether all slices are present.
-tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv")
+###################
+# Merge CSV files into file_alltweets.
+# The fastest way is to save the slices separately and then append every file
+# to the output, instead of using pandas or anything else.
+os.chdir(path_to_tweetdfs)  # change dir so glob can list the csv files in the tweet dir
+## First, check whether all slices are present.
+tweetfiles = glob.glob("*.csv")  # get list of all csv files in the folder
+# Create list of all files that should be in the folder:
 AllFilesList = []
 for handle in accounts:
     for tslice in time_slices:
         suffix = tslice['suffix']
-        AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
+        AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
+# report missing files to "log_*_missing.txt"
 with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.txt", "w") as fout:
     for file in AllFilesList:
         if file not in tweetfiles:
@@ -225,8 +247,10 @@ with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.txt", "w"
         else:
             fout.write('all slices scraped.')
 
-
-# check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
+## Merge .csv files.
+# check if file_alltweets (previously scraped tweets that have been merged
+# into one file) exists in the tweetfiles list; if it exists, remove it from
+# the list so it is not included in the following merge
 if file_alltweets in tweetfiles:
     tweetfiles.remove(file_alltweets)
 # Go through all csv files and merge them into file_alltweets
@@ -240,21 +264,24 @@ if tweetfiles:
     with open(file_alltweets, "wb") as fout:
         # first file is copied including its header:
        with open(tweetfiles[0], "rb") as f:
             fout.write(f.read())
         # remaining files are appended without their headers:
         for file in tweetfiles[1:]:
             with open(file, "rb") as f:
                 next(f)  # skip the header
                 fout.write(f.read())
-os.chdir(wd)
+os.chdir(wd)  # go back to wd
 
+# Report timing info.
 timeEndMerge = datetime.now()
 print("---")
-print("End of scraping at:")
+print("End of merging at:")
 print(timeEndMerge.strftime(fTimeFormat))
 print("---")
-tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
-tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
-tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
+# calculate times:
+tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)  # total execution time
+tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)  # scraping time
+tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)  # merge time
 print(
     f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
 )
 print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
 print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
 
+# close connection to logfiles.
 sys.stdout.close()
 sys.stderr.close()
diff --git a/funs/Scrape.py b/funs/Scrape.py
index 75bae0c..59e1a21 100644
--- a/funs/Scrape.py
+++ b/funs/Scrape.py
@@ -3,13 +3,22 @@ import time
 import pandas as pd
 import snscrape.modules.twitter as sntwitter
 
-def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5000):
+def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix, maxTweets = 5000):
+    """Scrapes tweets of a specific account in a specific time span using snscrape.modules.twitter.
+
+    Args:
+        handle (str): twitter handle of the account to be scraped
+        keywords (list): list of strings containing the keywords the tweets shall be searched for
+        td (str): tweet file output path
+        tweetDFColumns (list): columns for the tweet dataframe; parameters of snscrape.modules.twitter.Tweet
+        ts_beg (str): scrape from this point in time; format YYYY-MM-DDTHH:MM:SSZ, i.e. %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
+        ts_end (str): scrape until this point in time; format YYYY-MM-DDTHH:MM:SSZ, i.e. %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
+        suffix (str): suffix appended to the output filename after the handle. Example: suffix "-slice1" with handle "handle" produces the file "Tweets-handle-slice1.csv".
Example: "-slice1" of handle "handle" will produce the file "Tweets-handle-slice1.csv" + maxTweets (int, optional): Maximum number of tweets to be scraped. Defaults to 5000. + """ i = 0 currentTime = datetime.now() - ts_beg = slice_data['beg_time'] - ts_end = slice_data['end_time'] - suffix = slice_data['suffix'] tweetDataFilePath = td + f"Tweets-{handle}{suffix}.csv" # create empty tweetlist that will be filled with tweets of current sen diff --git a/funs/TimeSlice.py b/funs/TimeSlice.py index 14a1ea8..ab6d78a 100644 --- a/funs/TimeSlice.py +++ b/funs/TimeSlice.py @@ -8,6 +8,16 @@ Created on Wed Jun 21 13:58:42 2023 # create slices def get_Tslices(ts_beg, ts_end, no_slices): + """Splits the time-period between two points in time into #no_slices and returns start and end time of each slice period. + + Args: + ts_beg (datetime): Datetime start of overall period to be sliced. + ts_end (datetime): Datetime end of overall period to be sliced. + no_slices (int): number of slices. 24 e.g. will produce 24 start and end dates each. + + Returns: + list[dict[str:datetime|str]]: One dict for each containing 'beg_time' 'end_time' and 'suffix' (e.g. -slice1) + """ from datetime import datetime from datetime import timedelta ts_beg = datetime.strptime(ts_beg, '%Y-%m-%dT%H:%M:%SZ') @@ -25,6 +35,16 @@ def get_Tslices(ts_beg, ts_end, no_slices): # For log time conversions (seconds to days, hours, minutes) def convertTime(duration): + """Converts seconds to hours, minutes and seconds. + + Args: + duration (int): seconds + + Returns: + int: hours + int: minutes + int: seconds + """ days, seconds = duration.days, duration.seconds hours = days * 24 + seconds // 3600 minutes = (seconds % 3600) // 60