From 1b43b295ce36f4301369213c81e719192b664135 Mon Sep 17 00:00:00 2001
From: Michael Beck
Date: Fri, 23 Jun 2023 17:47:23 +0200
Subject: [PATCH] adds filechecks

---
 collect.py     | 51 +++++++++++++++++++++++++++-----------------------
 funs/Scrape.py | 10 ++++++----
 2 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/collect.py b/collect.py
index 256902c..dee5f0b 100644
--- a/collect.py
+++ b/collect.py
@@ -80,12 +80,14 @@ ts_beg = "2020-01-01T00:00:00Z" # start of scraping
 ts_end = "2023-01-03T00:00:00Z" # end of straping
 no_slices = 24 # Number of slices / time periods.
 
+# file time format
+fTimeFormat = "%Y-%m-%d_%H-%M-%S"
+
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 
 # Name of logfile
-logfile = "log/log_"
-
+logfile = wd+"log/log_"
 
 ## Install snscrape from local git repo to make shure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
@@ -139,8 +141,8 @@ from funs.ClearDupes import deDupe
 from funs.Scrape import scrapeTweets
 
 # create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt"
-logfileErrors = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_err" + ".txt"
+logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
+logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
 sys.stderr = open(logfileErrors, "w")
 sys.stdout = open(logfilen, "w")
 
@@ -169,19 +171,20 @@ print("---")
 accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
 alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
 alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields
-accounts.append(alt_accounts)
+accounts.extend(alt_accounts)
+
 # Print accounts to be scraped
 print("Accounts to be scraped:")
 for i, acc in enumerate(accounts): # print 5 accounts per line
     print(f"{acc:^17}", end = "") # twitter handle max length = 15 chars
     if i % 5 == 4:
-        print "\n"
-print("\n---")
+        print("\n")
+print(f"\n{i} accounts in total.\n---")
 
 ## Scraping
 timeStartScrape = datetime.now()
 print("Starting scraping at:")
-print(timeStartScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeStartScrape.strftime(fTimeFormat))
 print("---")
 
 # Iterate over each Twitter account using multiprocessing
@@ -195,7 +198,7 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
             # ... code to prepare the slice_data ...
 
             # Schedule the scraping task
-            task = executor.submit(scrapeTweets, handle, slice_data, keywords, td)
+            task = executor.submit(scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns)
             tasks.append(task)
 
     # Wait for all tasks to complete
@@ -204,7 +207,7 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
 timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeEndScrape.strftime(fTimeFormat))
 
 ## Merge CSV-Files to file_alltweets.
 # fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
@@ -213,31 +216,33 @@ os.chdir(path_to_tweetdfs)
 tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv")
 AllFilesList = []
 for handle in accounts:
-    for suffix in time_slices:
-        AllFilesList.append("Tweets-{handle}{suffix}.csv")
-with open(f"{logfile}missing-{timeStartScrape}") as fout:
+    for tslice in time_slices:
+        suffix = tslice['suffix']
+        AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
+with open(f"{logfile}missing-"+timeStartScrape.strftime(fTimeFormat)+".txt", "w") as fout:
     for file in AllFilesList:
         if file not in tweetfiles:
-            fout.write('Missing: {file}.\n') # if file is not in tweetfiles, print error message.
+            fout.write(f'Missing: {file}.\n') # if file is not in tweetfiles, print error message.
 
 # check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
 if file_alltweets in tweetfiles:
     tweetfiles.remove(file_alltweets)
 
 # Go through all csv files and merge them into file_alltweets
-with open(file_alltweets, "wb") as fout:
-    # first file (because of the header):
-    with open(tweetfiles[0], "rb") as f:
-        fout.write(f.read())
-    # other files without the header:
-    for file in tweetfiles[1:]:
-        with open(file, "rb") as f:
-            next(f) # skip the header
+if len(tweetfiles) > 0:
+    with open(file_alltweets, "wb") as fout:
+        # first file (because of the header):
+        with open(tweetfiles[0], "rb") as f: fout.write(f.read())
+        # other files without the header:
+        for file in tweetfiles[1:]:
+            with open(file, "rb") as f:
+                next(f) # skip the header
+                fout.write(f.read())
 os.chdir(wd)
 timeEndMerge = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndMerge.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeEndMerge.strftime(fTimeFormat))
 print("---")
 tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
 tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
diff --git a/funs/Scrape.py b/funs/Scrape.py
index 1ee4edb..8891b62 100644
--- a/funs/Scrape.py
+++ b/funs/Scrape.py
@@ -1,7 +1,9 @@
-def scrapeTweets(handle, slice_data, keywords, td, maxTweets = 5000):
-    from datetime import datetime
-    currentTime = datetime.now
+def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5000):
+    from datetime import datetime, time
+    import pandas as pd
     import snscrape.modules.twitter as sntwitter
+
+    currentTime = datetime.now
     ts_beg = slice_data['beg_time']
     ts_end = slice_data['end_time']
     suffix = slice_data['suffix']
@@ -41,4 +43,4 @@ def scrapeTweets(handle, slice_data, keywords, td, maxTweets = 5000):
     # save short csv
     tweet_df.to_csv(csv_path, encoding='utf-8')
     # sleep 1 second to not get blocked because of excessive requests
-    # time.sleep(0.5)
\ No newline at end of file
+    time.sleep(0.5)
\ No newline at end of file
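
For reference, below is a minimal, self-contained sketch of the merge step that this patch guards: report any expected per-slice CSVs that were never produced, drop a previously merged output file from the input list, and only merge when at least one input file exists. The function name merge_slice_csvs and its parameters are hypothetical illustrations, not code from the repository.

    import glob
    import os

    def merge_slice_csvs(tweet_dir, expected_files, out_name):
        """Concatenate per-slice CSVs into out_name, keeping a single header.

        Returns the list of expected files that were not found on disk.
        """
        os.chdir(tweet_dir)
        found = glob.glob("*.csv")
        missing = [f for f in expected_files if f not in found]

        # Do not merge a previously created combined file into itself.
        if out_name in found:
            found.remove(out_name)

        # Guard against an empty file list, as the patch does with len(tweetfiles) > 0.
        if len(found) > 0:
            with open(out_name, "wb") as fout:
                with open(found[0], "rb") as f:   # first file: keep its header
                    fout.write(f.read())
                for name in found[1:]:            # remaining files: skip the header line
                    with open(name, "rb") as f:
                        next(f)
                        fout.write(f.read())
        return missing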