adds filechecks
collect.py
@@ -80,12 +80,14 @@ ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
ts_end = "2023-01-03T00:00:00Z"  # end of straping
no_slices = 24  # Number of slices / time periods.

+# file time format
+fTimeFormat = "%Y-%m-%d_%H-%M-%S"

# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000

# Name of logfile
-logfile = "log/log_"
+logfile = wd+"log/log_"

## Install snscrape from local git repo to make shure that it fits the used version.
# If snscrape is already installed, uncomment the following lines:
@@ -139,8 +141,8 @@ from funs.ClearDupes import deDupe
from funs.Scrape import scrapeTweets

# create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt"
-logfileErrors = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_err" + ".txt"
+logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
+logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
sys.stderr = open(logfileErrors, "w")
sys.stdout = open(logfilen, "w")
@@ -169,19 +171,20 @@ print("---")
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields
-accounts.append(alt_accounts)
+accounts.extend(alt_accounts)

# Print accounts to be scraped
print("Accounts to be scraped:")
for i, acc in enumerate(accounts): # print 5 accounts per line
    print(f"{acc:^17}", end = "") # twitter handle max length = 15 chars
    if i % 5 == 4:
-        print "\n"
-print("\n---")
+        print("\n")
+print(f"\n{i} accounts in total.\n---")

## Scraping
timeStartScrape = datetime.now()
print("Starting scraping at:")
-print(timeStartScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeStartScrape.strftime(fTimeFormat))
print("---")

# Iterate over each Twitter account using multiprocessing
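A quick standalone illustration (handles are made up) of what the append-to-extend change fixes: append nests the whole alt_accounts list as a single element, while extend adds its items individually:

accounts = ["sen_a", "sen_b"]          # hypothetical handles
alt_accounts = ["sen_a_alt"]

nested = accounts.copy()
nested.append(alt_accounts)            # -> ["sen_a", "sen_b", ["sen_a_alt"]]

flat = accounts.copy()
flat.extend(alt_accounts)              # -> ["sen_a", "sen_b", "sen_a_alt"]

print(nested, flat)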
@@ -195,7 +198,7 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
            # ... code to prepare the slice_data ...

            # Schedule the scraping task
-            task = executor.submit(scrapeTweets, handle, slice_data, keywords, td)
+            task = executor.submit(scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns)
            tasks.append(task)

    # Wait for all tasks to complete
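For context, a minimal sketch of the submit-and-wait pattern this hunk touches, using a stand-in scrape function and made-up slice data rather than the project's real scrapeTweets:

import concurrent.futures

def fake_scrape(handle, slice_data):                     # stand-in for scrapeTweets
    return f"Tweets-{handle}{slice_data['suffix']}.csv"

handles = ["sen_a", "sen_b"]                             # hypothetical handles
time_slices = [{"suffix": "-slice1"}, {"suffix": "-slice2"}]

tasks = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    for handle in handles:
        for slice_data in time_slices:
            tasks.append(executor.submit(fake_scrape, handle, slice_data))
    # wait for all scheduled tasks to finish and collect their results
    results = [t.result() for t in concurrent.futures.as_completed(tasks)]
print(results)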
@@ -204,7 +207,7 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
-print(timeEndScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeEndScrape.strftime(fTimeFormat))

## Merge CSV-Files to file_alltweets.
# fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
@@ -213,31 +216,33 @@ os.chdir(path_to_tweetdfs)
tweetfiles = glob.glob("*.csv")  # get list of all csv files in folder - before: "*.{}".format("csv")
AllFilesList = []
for handle in accounts:
-    for suffix in time_slices:
-        AllFilesList.append("Tweets-{handle}{suffix}.csv")
-with open(f"{logfile}missing-{timeStartScrape}") as fout:
+    for tslice in time_slices:
+        suffix = tslice['suffix']
+        AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
+with open(f"{logfile}missing-"+timeStartScrape.strftime(fTimeFormat)+".txt", "w") as fout:
    for file in AllFilesList:
        if file not in tweetfiles:
-            fout.write('Missing: {file}.\n') # if file is not in tweetfiles, print error message.
+            fout.write(f'Missing: {file}.\n') # if file is not in tweetfiles, print error message.
# check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
if file_alltweets in tweetfiles:
    tweetfiles.remove(file_alltweets)
# Go through all csv files and merge them into file_alltweets
-with open(file_alltweets, "wb") as fout:
-    # first file (because of the header):
-    with open(tweetfiles[0], "rb") as f:
-        fout.write(f.read())
-    # other files without the header:
-    for file in tweetfiles[1:]:
-        with open(file, "rb") as f:
-            next(f)  # skip the header
-            fout.write(f.read())
+if len(tweetfiles) > 0:
+    with open(file_alltweets, "wb") as fout:
+        # first file (because of the header):
+        with open(tweetfiles[0], "rb") as f:
+            fout.write(f.read())
+        # other files without the header:
+        for file in tweetfiles[1:]:
+            with open(file, "rb") as f:
+                next(f)  # skip the header
+                fout.write(f.read())
os.chdir(wd)

timeEndMerge = datetime.now()
print("---")
print("End of scraping at:")
-print(timeEndMerge.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeEndMerge.strftime(fTimeFormat))
print("---")
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
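The added file checks boil down to two guards, sketched below with made-up file names: log every expected slice file that was never written, and only attempt the merge when at least one csv exists, so an empty run cannot fail on tweetfiles[0]:

import glob

expected = [f"Tweets-{h}{s}.csv" for h in ["sen_a"] for s in ["-slice1", "-slice2"]]  # hypothetical
tweetfiles = glob.glob("*.csv")

# log missing slice files instead of failing silently
with open("missing.txt", "w") as fout:
    for name in expected:
        if name not in tweetfiles:
            fout.write(f"Missing: {name}.\n")

# drop a previously merged output so it is not merged into itself
if "alltweets.csv" in tweetfiles:
    tweetfiles.remove("alltweets.csv")

# merge only when there is something to merge
if len(tweetfiles) > 0:
    with open("alltweets.csv", "wb") as fout:
        with open(tweetfiles[0], "rb") as f:   # first file keeps its header row
            fout.write(f.read())
        for name in tweetfiles[1:]:            # remaining files: skip the header row
            with open(name, "rb") as f:
                next(f)
                fout.write(f.read())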
@@ -1,7 +1,9 @@
-def scrapeTweets(handle, slice_data, keywords, td, maxTweets = 5000):
-    from datetime import datetime
-    currentTime = datetime.now
+def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5000):
+    from datetime import datetime, time
+    import pandas as pd
+    import snscrape.modules.twitter as sntwitter
+
+    currentTime = datetime.now
    ts_beg = slice_data['beg_time']
    ts_end = slice_data['end_time']
    suffix = slice_data['suffix']
@@ -41,4 +43,4 @@ def scrapeTweets(handle, slice_data, keywords, td, maxTweets = 5000):
    # save short csv
    tweet_df.to_csv(csv_path, encoding='utf-8')
    # sleep 1 second to not get blocked because of excessive requests
-    # time.sleep(0.5)
+    time.sleep(0.5)
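Finally, a rough sketch of what the new tweetDFColumns argument is for, with an assumed column list and dummy rows (the project's real schema is defined in collect.py and not shown here):

import pandas as pd

tweetDFColumns = ["id", "date", "user", "rawContent"]   # assumed column names

def build_slice_df(rows, tweetDFColumns):
    # rows: one list per tweet, ordered like tweetDFColumns
    return pd.DataFrame(rows, columns=tweetDFColumns)

rows = [[1, "2021-01-06", "sen_a", "example tweet text"]]
print(build_slice_df(rows, tweetDFColumns))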