From 1b43b295ce36f4301369213c81e719192b664135 Mon Sep 17 00:00:00 2001
From: Michael Beck
Date: Fri, 23 Jun 2023 17:47:23 +0200
Subject: [PATCH] adds filechecks

---
 collect.py     | 51 +++++++++++++++++++++++++++-----------------------
 funs/Scrape.py | 10 ++++++----
 2 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/collect.py b/collect.py
index 256902c..dee5f0b 100644
--- a/collect.py
+++ b/collect.py
@@ -80,12 +80,14 @@ ts_beg = "2020-01-01T00:00:00Z" # start of scraping
 ts_end = "2023-01-03T00:00:00Z" # end of straping
 no_slices = 24 # Number of slices / time periods.
 
+# file time format
+fTimeFormat = "%Y-%m-%d_%H-%M-%S"
+
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 
 # Name of logfile
-logfile = "log/log_"
-
+logfile = wd+"log/log_"
 
 ## Install snscrape from local git repo to make shure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
@@ -139,8 +141,8 @@ from funs.ClearDupes import deDupe
 from funs.Scrape import scrapeTweets
 
 # create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt"
-logfileErrors = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_err" + ".txt"
+logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
+logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
 sys.stderr = open(logfileErrors, "w")
 sys.stdout = open(logfilen, "w")
 
@@ -169,19 +171,20 @@ print("---")
 accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
 alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
 alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields
-accounts.append(alt_accounts)
+accounts.extend(alt_accounts)
+
 # Print accounts to be scraped
 print("Accounts to be scraped:")
 for i, acc in enumerate(accounts): # print 5 accounts per line
     print(f"{acc:^17}", end = "") # twitter handle max length = 15 chars
     if i % 5 == 4:
-        print "\n"
-print("\n---")
+        print("\n")
+print(f"\n{i} accounts in total.\n---")
 
 ## Scraping
 timeStartScrape = datetime.now()
 print("Starting scraping at:")
-print(timeStartScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeStartScrape.strftime(fTimeFormat))
 print("---")
 
 # Iterate over each Twitter account using multiprocessing
@@ -195,7 +198,7 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
             # ... code to prepare the slice_data ...
 
             # Schedule the scraping task
-            task = executor.submit(scrapeTweets, handle, slice_data, keywords, td)
+            task = executor.submit(scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns)
             tasks.append(task)
 
     # Wait for all tasks to complete
@@ -204,7 +207,7 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
 timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeEndScrape.strftime(fTimeFormat))
 
 ## Merge CSV-Files to file_alltweets.
 # fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
@@ -213,31 +216,33 @@ os.chdir(path_to_tweetdfs)
 tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv")
 AllFilesList = []
 for handle in accounts:
-    for suffix in time_slices:
-        AllFilesList.append("Tweets-{handle}{suffix}.csv")
-with open(f"{logfile}missing-{timeStartScrape}") as fout:
+    for tslice in time_slices:
+        suffix = tslice['suffix']
+        AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
+with open(f"{logfile}missing-"+timeStartScrape.strftime(fTimeFormat)+".txt", "w") as fout:
     for file in AllFilesList:
         if file not in tweetfiles:
-            fout.write('Missing: {file}.\n') # if file is not in tweetfiles, print error message.
+            fout.write(f'Missing: {file}.\n') # if file is not in tweetfiles, print error message.
 
 # check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
 if file_alltweets in tweetfiles:
     tweetfiles.remove(file_alltweets)
 
 # Go through all csv files and merge them into file_alltweets
-with open(file_alltweets, "wb") as fout:
-    # first file (because of the header):
-    with open(tweetfiles[0], "rb") as f:
-        fout.write(f.read())
-    # other files without the header:
-    for file in tweetfiles[1:]:
-        with open(file, "rb") as f:
-            next(f) # skip the header
+if len(tweetfiles) > 0:
+    with open(file_alltweets, "wb") as fout:
+        # first file (because of the header):
+        with open(tweetfiles[0], "rb") as f: fout.write(f.read())
+        # other files without the header:
+        for file in tweetfiles[1:]:
+            with open(file, "rb") as f:
+                next(f) # skip the header
+                fout.write(f.read())
 os.chdir(wd)
 timeEndMerge = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndMerge.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeEndMerge.strftime(fTimeFormat))
 print("---")
 tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
 tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
diff --git a/funs/Scrape.py b/funs/Scrape.py
index 1ee4edb..8891b62 100644
--- a/funs/Scrape.py
+++ b/funs/Scrape.py
@@ -1,7 +1,9 @@
-def scrapeTweets(handle, slice_data, keywords, td, maxTweets = 5000):
-    from datetime import datetime
-    currentTime = datetime.now
+def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5000):
+    from datetime import datetime, time
+    import pandas as pd
     import snscrape.modules.twitter as sntwitter
+
+    currentTime = datetime.now
     ts_beg = slice_data['beg_time']
     ts_end = slice_data['end_time']
     suffix = slice_data['suffix']
@@ -41,4 +43,4 @@ def scrapeTweets(handle, slice_data, keywords, td, maxTweets = 5000):
     # save short csv
     tweet_df.to_csv(csv_path, encoding='utf-8')
     # sleep 1 second to not get blocked because of excessive requests
-    # time.sleep(0.5)
\ No newline at end of file
+    time.sleep(0.5)
\ No newline at end of file
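
For reference, below is a minimal, self-contained sketch of the merge step that this patch guards: report any expected per-slice CSVs that were never produced, drop a previously merged output file from the input list, and only merge when at least one input file exists. The function name merge_slice_csvs and its parameters are hypothetical illustrations, not code from the repository.

    import glob
    import os

    def merge_slice_csvs(tweet_dir, expected_files, out_name):
        """Concatenate per-slice CSVs into out_name, keeping a single header.

        Returns the list of expected files that were not found on disk.
        """
        os.chdir(tweet_dir)
        found = glob.glob("*.csv")
        missing = [f for f in expected_files if f not in found]

        # Do not merge a previously created combined file into itself.
        if out_name in found:
            found.remove(out_name)

        # Guard against an empty file list, as the patch does with len(tweetfiles) > 0.
        if len(found) > 0:
            with open(out_name, "wb") as fout:
                with open(found[0], "rb") as f:   # first file: keep its header
                    fout.write(f.read())
                for name in found[1:]:            # remaining files: skip the header line
                    with open(name, "rb") as f:
                        next(f)
                        fout.write(f.read())
        return missing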