From 599202ae4d27d6663442bfd130f445500d719cd8 Mon Sep 17 00:00:00 2001
From: Michael Beck
Date: Fri, 23 Jun 2023 13:00:23 +0200
Subject: [PATCH] adds checks & logs

---
 collect.py | 44 ++++++++++++++++++++++++++++++++++++++++++--
 config.py  |  3 +++
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/collect.py b/collect.py
index d692453..79f2d6a 100644
--- a/collect.py
+++ b/collect.py
@@ -56,19 +56,29 @@ import os
 import pandas as pd
 import glob
 import time
+import sys
+from datetime import datetime
 
 ## Import other files
 from config import *
 import snscrape.modules.twitter as sntwitter
-from funs.TimeSlice import get_Tslices
+from funs.TimeSlice import *
 from funs.ClearDupes import deDupe
 
+# create logfile & log all outputs
+logfilen = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
+logfileErrors = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '_err' + '.txt'
+sys.stderr = open(logfileErrors, 'w')
+sys.stdout = open(logfilen, 'w')
+
 ## Create List of time-period-slices
 time_slices = get_Tslices(ts_beg, ts_end, no_slices)
 # Print slices
 print('Time-period-slices:')
 for slice in time_slices:
     print(slice['suffix'] + ': ' + slice['beg_time'] + ' - ' + slice['end_time'])
+print('---')
+
 
 ## Keywords
 keywords = []
@@ -80,13 +90,23 @@ with open('data/keywords.txt', 'r') as file:
     for line in lines:
         keyword = line.strip() # Remove the newline character
         keywords.append(keyword)
+print('---')
 
 ## Senator Accounts
 # Get accounts & alt-accounts from Senators-Datafile
 accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
 alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
+print('Accounts to be scraped:')
+print(accounts)
+print(alt_accounts)
+print('---')
 
 ## Scraping
+timeStartScrape = datetime.now()
+print("Starting scraping at:")
+print(timeStartScrape.strftime('%Y-%m-%d_%H-%M-%S'))
+print('---')
+
 # Iterate over each Twitter account
 for handle in accounts:
     # Iterate over each time slice
@@ -198,6 +218,11 @@ for handle in accounts:
         # sleep 1 second to not get blocked because of excessive requests
         time.sleep(1)
 
+timeEndScrape = datetime.now()
+print("---")
+print("End of scraping at:")
+print(timeEndScrape.strftime('%Y-%m-%d_%H-%M-%S'))
+
 ## Merge CSV-Files to file_alltweets
 # fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
 os.chdir(path_to_tweetdfs)
@@ -216,4 +241,19 @@ with open(file_alltweets,"wb") as fout:
         with open(file, "rb") as f:
             next(f) # skip the header
             fout.write(f.read())
-os.chdir(wd)
\ No newline at end of file
+os.chdir(wd)
+
+timeEndMerge = datetime.now()
+print("---")
+print("End of merging at:")
+print(timeEndMerge.strftime('%Y-%m-%d_%H-%M-%S'))
+print("---")
+tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
+tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
+tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
+print(f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds")
+print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
+print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
+
+sys.stdout.close()
+sys.stderr.close()
\ No newline at end of file
diff --git a/config.py b/config.py
index 8500dca..4adbb90 100644
--- a/config.py
+++ b/config.py
@@ -29,6 +29,9 @@ no_slices = 24 # Number of slices / time periods.
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 
+# Name of logfile
+logfile = 'log/log_'
+
 ## Install snscrape from local git repo to make shure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
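
A note on the timing summary this patch appends to collect.py: it calls convertTime(), which is
pulled in via "from funs.TimeSlice import *" and is not itself part of this patch. The actual
implementation lives in funs/TimeSlice.py and may differ; a minimal sketch of the assumed
behaviour (splitting a timedelta into whole hours, minutes and seconds) is:

    # Hypothetical sketch of the convertTime helper assumed by this patch;
    # the real function in funs/TimeSlice.py may be implemented differently.
    from datetime import timedelta

    def convertTime(delta: timedelta):
        """Split a timedelta into whole hours, minutes and seconds."""
        total_seconds = int(delta.total_seconds())
        hours, remainder = divmod(total_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return hours, minutes, seconds

    # Example: convertTime(timedelta(hours=1, minutes=30, seconds=5)) returns (1, 30, 5)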