From 599202ae4d27d6663442bfd130f445500d719cd8 Mon Sep 17 00:00:00 2001
From: Michael Beck
Date: Fri, 23 Jun 2023 13:00:23 +0200
Subject: [PATCH] adds checks & logs

---
 collect.py | 44 ++++++++++++++++++++++++++++++++++++++++++--
 config.py  |  3 +++
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/collect.py b/collect.py
index d692453..79f2d6a 100644
--- a/collect.py
+++ b/collect.py
@@ -56,19 +56,29 @@ import os
 import pandas as pd
 import glob
 import time
+import sys
+from datetime import datetime
 
 ## Import other files
 from config import *
 import snscrape.modules.twitter as sntwitter
-from funs.TimeSlice import get_Tslices
+from funs.TimeSlice import *
 from funs.ClearDupes import deDupe
 
+# create logfile & log all outputs
+logfilen = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
+logfileErrors = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '_err' + '.txt'
+sys.stderr = open(logfileErrors, 'w')
+sys.stdout = open(logfilen, 'w')
+
 ## Create List of time-period-slices
 time_slices = get_Tslices(ts_beg, ts_end, no_slices)
 # Print slices
 print('Time-period-slices:')
 for slice in time_slices:
     print(slice['suffix'] + ': ' + slice['beg_time'] + ' - ' + slice['end_time'])
+print('---')
+
 
 ## Keywords
 keywords = []
@@ -80,13 +90,23 @@ with open('data/keywords.txt', 'r') as file:
     for line in lines:
         keyword = line.strip() # Remove the newline character
         keywords.append(keyword)
+print('---')
 
 ## Senator Accounts
 # Get accounts & alt-accounts from Senators-Datafile
 accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
 alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
+print('Accounts to be scraped:')
+print(accounts)
+print(alt_accounts)
+print('---')
 
 ## Scraping
+timeStartScrape = datetime.now()
+print("Starting scraping at:")
+print(timeStartScrape.strftime('%Y-%m-%d_%H-%M-%S'))
+print('---')
+
 # Iterate over each Twitter account
 for handle in accounts:
     # Iterate over each time slice
@@ -198,6 +218,11 @@ for handle in accounts:
         # sleep 1 second to not get blocked because of excessive requests
         time.sleep(1)
 
+timeEndScrape = datetime.now()
+print("---")
+print("End of scraping at:")
+print(timeEndScrape.strftime('%Y-%m-%d_%H-%M-%S'))
+
 ## Merge CSV-Files to file_alltweets
 # fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
 os.chdir(path_to_tweetdfs)
@@ -216,4 +241,19 @@ with open(file_alltweets,"wb") as fout:
         with open(file, "rb") as f:
             next(f) # skip the header
             fout.write(f.read())
-os.chdir(wd)
\ No newline at end of file
+os.chdir(wd)
+
+timeEndMerge = datetime.now()
+print("---")
+print("End of merging at:")
+print(timeEndMerge.strftime('%Y-%m-%d_%H-%M-%S'))
+print("---")
+tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
+tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
+tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
+print(f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds")
+print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
+print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
+
+sys.stdout.close()
+sys.stderr.close()
\ No newline at end of file
diff --git a/config.py b/config.py
index 8500dca..4adbb90 100644
--- a/config.py
+++ b/config.py
@@ -29,6 +29,9 @@ no_slices = 24 # Number of slices / time periods.
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 
+# Name of logfile
+logfile = 'log/log_'
+
 ## Install snscrape from local git repo to make shure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
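
A note on the timing summary this patch appends to collect.py: it calls convertTime(), which is
pulled in via "from funs.TimeSlice import *" and is not itself part of this patch. The actual
implementation lives in funs/TimeSlice.py and may differ; a minimal sketch of the assumed
behaviour (splitting a timedelta into whole hours, minutes and seconds) is:

    # Hypothetical sketch of the convertTime helper assumed by this patch;
    # the real function in funs/TimeSlice.py may be implemented differently.
    from datetime import timedelta

    def convertTime(delta: timedelta):
        """Split a timedelta into whole hours, minutes and seconds."""
        total_seconds = int(delta.total_seconds())
        hours, remainder = divmod(total_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return hours, minutes, seconds

    # Example: convertTime(timedelta(hours=1, minutes=30, seconds=5)) returns (1, 30, 5)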