adds filechecks

Michael Beck 2023-06-23 17:47:23 +02:00
parent fb7a70cf66
commit 1b43b295ce
2 changed files with 34 additions and 27 deletions

View File

@ -80,12 +80,14 @@ ts_beg = "2020-01-01T00:00:00Z" # start of scraping
 ts_end = "2023-01-03T00:00:00Z" # end of scraping
 no_slices = 24 # Number of slices / time periods.
+# file time format
+fTimeFormat = "%Y-%m-%d_%H-%M-%S"
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 # Name of logfile
-logfile = "log/log_"
+logfile = wd+"log/log_"
 ## Install snscrape from local git repo to make sure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
@ -139,8 +141,8 @@ from funs.ClearDupes import deDupe
 from funs.Scrape import scrapeTweets
 # create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt"
-logfileErrors = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_err" + ".txt"
+logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
+logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
 sys.stderr = open(logfileErrors, "w")
 sys.stdout = open(logfilen, "w")
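
Centralising the timestamp pattern in fTimeFormat keeps the log, error-log, and new missing-file names consistent, and the pattern avoids characters such as ":" that are awkward in file names. A small illustration; the example timestamp is arbitrary:

from datetime import datetime

fTimeFormat = "%Y-%m-%d_%H-%M-%S"
# arbitrary example timestamp, just to show the resulting file-name shape
stamp = datetime(2023, 6, 23, 17, 47, 23).strftime(fTimeFormat)
print(stamp)                          # -> 2023-06-23_17-47-23
print("log/log_" + stamp + ".txt")    # -> log/log_2023-06-23_17-47-23.txt
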
@ -169,19 +171,20 @@ print("---")
 accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
 alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
 alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields
-accounts.append(alt_accounts)
+accounts.extend(alt_accounts)
 # Print accounts to be scraped
 print("Accounts to be scraped:")
 for i, acc in enumerate(accounts): # print 5 accounts per line
     print(f"{acc:^17}", end = "") # twitter handle max length = 15 chars
     if i % 5 == 4:
-        print "\n"
-print("\n---")
+        print("\n")
+print(f"\n{i} accounts in total.\n---")
 ## Scraping
 timeStartScrape = datetime.now()
 print("Starting scraping at:")
-print(timeStartScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeStartScrape.strftime(fTimeFormat))
 print("---")
 # Iterate over each Twitter account using multiprocessing
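
The switch from append to extend fixes a real bug: list.append adds alt_accounts as one nested element, so the alternate handles would never be iterated over as individual accounts, while extend splices them into the flat list. A quick illustration with made-up handles:

accounts = ["handle_a", "handle_b"]
alt_accounts = ["alt_a", "alt_b"]

nested = accounts.copy()
nested.append(alt_accounts)   # ['handle_a', 'handle_b', ['alt_a', 'alt_b']]

flat = accounts.copy()
flat.extend(alt_accounts)     # ['handle_a', 'handle_b', 'alt_a', 'alt_b']

# the nested list would also crash the centred printing above,
# because f"{acc:^17}" rejects a list argument
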
@ -195,7 +198,7 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
         # ... code to prepare the slice_data ...
         # Schedule the scraping task
-        task = executor.submit(scrapeTweets, handle, slice_data, keywords, td)
+        task = executor.submit(scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns)
         tasks.append(task)
     # Wait for all tasks to complete
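
For context, a minimal sketch of the scheduling pattern this hunk modifies; the nested loops and the explicit wait call are assumptions based on the surrounding comments, not the exact repository code:

import concurrent.futures

tasks = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    for handle in accounts:
        for slice_data in time_slices:
            # one job per account and time slice; the commit threads the
            # tweetDFColumns list through to scrapeTweets
            task = executor.submit(scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns)
            tasks.append(task)
    # block until every scheduled scraping task has finished
    concurrent.futures.wait(tasks)
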
@ -204,7 +207,7 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
 timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeEndScrape.strftime(fTimeFormat))
 ## Merge CSV-Files to file_alltweets.
 # fastest way is to save the slices separately and then add every file to the output instead of using pandas or anything else.
@ -213,31 +216,33 @@ os.chdir(path_to_tweetdfs)
 tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv")
 AllFilesList = []
 for handle in accounts:
-    for suffix in time_slices:
-        AllFilesList.append("Tweets-{handle}{suffix}.csv")
-with open(f"{logfile}missing-{timeStartScrape}") as fout:
+    for tslice in time_slices:
+        suffix = tslice['suffix']
+        AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
+with open(f"{logfile}missing-"+timeStartScrape.strftime(fTimeFormat)+".txt", "w") as fout:
     for file in AllFilesList:
         if file not in tweetfiles:
-            fout.write('Missing: {file}.\n') # if file is not in tweetfiles, print error message.
+            fout.write(f'Missing: {file}.\n') # if file is not in tweetfiles, print error message.
 # check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
 if file_alltweets in tweetfiles:
     tweetfiles.remove(file_alltweets)
 # Go through all csv files and merge them into file_alltweets
-with open(file_alltweets, "wb") as fout:
-    # first file (because of the header):
-    with open(tweetfiles[0], "rb") as f:
-        fout.write(f.read())
-    # other files without the header:
-    for file in tweetfiles[1:]:
-        with open(file, "rb") as f:
-            next(f) # skip the header
-            fout.write(f.read())
+if len(tweetfiles) > 0:
+    with open(file_alltweets, "wb") as fout:
+        # first file (because of the header):
+        with open(tweetfiles[0], "rb") as f:
+            fout.write(f.read())
+        # other files without the header:
+        for file in tweetfiles[1:]:
+            with open(file, "rb") as f:
+                next(f) # skip the header
+                fout.write(f.read())
 os.chdir(wd)
 timeEndMerge = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndMerge.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeEndMerge.strftime(fTimeFormat))
 print("---")
 tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
 tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
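
Putting the new pieces together, the file check and the guarded merge amount to the following standalone sketch; variable names follow the diff, and everything not shown here (accounts, time_slices, paths, timestamps) is assumed to be defined earlier in the script:

import glob
import os

os.chdir(path_to_tweetdfs)                  # folder holding the per-slice CSVs
tweetfiles = glob.glob("*.csv")             # every CSV the scraper actually wrote

# every file that *should* exist: one per account and time slice
expected = [f"Tweets-{handle}{tslice['suffix']}.csv"
            for handle in accounts for tslice in time_slices]

# log every expected file the scraper failed to produce
missing_log = f"{logfile}missing-" + timeStartScrape.strftime(fTimeFormat) + ".txt"
with open(missing_log, "w") as fout:
    for file in expected:
        if file not in tweetfiles:
            fout.write(f"Missing: {file}.\n")

# never feed an older merged file back into the new merge
if file_alltweets in tweetfiles:
    tweetfiles.remove(file_alltweets)

# merge only if there is anything to merge (this is the new guard)
if len(tweetfiles) > 0:
    with open(file_alltweets, "wb") as fout:
        with open(tweetfiles[0], "rb") as f:    # first file keeps its header
            fout.write(f.read())
        for file in tweetfiles[1:]:
            with open(file, "rb") as f:
                next(f)                         # skip the duplicate header
                fout.write(f.read())
os.chdir(wd)
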

View File

@ -1,7 +1,9 @@
-def scrapeTweets(handle, slice_data, keywords, td, maxTweets = 5000):
-    from datetime import datetime
-    currentTime = datetime.now
+def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5000):
+    from datetime import datetime, time
+    import pandas as pd
     import snscrape.modules.twitter as sntwitter
+    currentTime = datetime.now
     ts_beg = slice_data['beg_time']
     ts_end = slice_data['end_time']
     suffix = slice_data['suffix']
@ -41,4 +43,4 @@ def scrapeTweets(handle, slice_data, keywords, td, maxTweets = 5000):
     # save short csv
     tweet_df.to_csv(csv_path, encoding='utf-8')
     # sleep 1 second to not get blocked because of excessive requests
-    # time.sleep(0.5)
+    time.sleep(0.5)
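
With the new signature, the caller supplies the DataFrame column list instead of the function defining it locally. A rough usage sketch; the slice dictionary, keyword list, handle, and column names below are illustrative placeholders, not values from the repository:

from datetime import timedelta

# illustrative inputs only; the real script derives these from its config section
td = timedelta(days=1)
keywords = ["senate"]                                  # placeholder keyword list
tweetDFColumns = ["id", "date", "user", "rawContent"]  # placeholder column list
slice_data = {
    "beg_time": "2020-01-01T00:00:00Z",
    "end_time": "2020-02-01T00:00:00Z",
    "suffix": "-slice1",                               # placeholder suffix
}

# scrape one account for one time slice; writes Tweets-<handle><suffix>.csv
scrapeTweets("example_handle", slice_data, keywords, td, tweetDFColumns)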