adds filechecks
This commit is contained in:
parent
fb7a70cf66
commit
1b43b295ce
35
collect.py
@@ -80,12 +80,14 @@ ts_beg = "2020-01-01T00:00:00Z" # start of scraping
 ts_end = "2023-01-03T00:00:00Z" # end of scraping
 no_slices = 24 # Number of slices / time periods.
 
+# file time format
+fTimeFormat = "%Y-%m-%d_%H-%M-%S"
+
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 
 # Name of logfile
-logfile = "log/log_"
+logfile = wd+"log/log_"
 
 ## Install snscrape from local git repo to make sure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
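Note (not part of the commit): the new fTimeFormat string produces filesystem-safe timestamps, and logfile is now anchored to the working directory wd. A minimal sketch of what these values yield, with a hypothetical wd:

from datetime import datetime

fTimeFormat = "%Y-%m-%d_%H-%M-%S"
wd = "/home/user/project/"   # hypothetical working directory, for illustration only
logfile = wd+"log/log_"

stamp = datetime.now().strftime(fTimeFormat)  # e.g. "2023-01-03_14-05-59" - no colons, safe in filenames
print(logfile + stamp + ".txt")               # e.g. "/home/user/project/log/log_2023-01-03_14-05-59.txt"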
@@ -139,8 +141,8 @@ from funs.ClearDupes import deDupe
 from funs.Scrape import scrapeTweets
 
 # create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt"
-logfileErrors = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_err" + ".txt"
+logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
+logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
 sys.stderr = open(logfileErrors, "w")
 sys.stdout = open(logfilen, "w")
 
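Note (not part of the commit): the lines above send all prints and errors to timestamped logfiles. A minimal sketch of that redirection pattern with an added restore/close step; it assumes a log/ directory already exists:

import sys
from datetime import datetime

fTimeFormat = "%Y-%m-%d_%H-%M-%S"
logfile = "log/log_"  # assumes the log/ directory exists

logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"

orig_stdout, orig_stderr = sys.stdout, sys.stderr  # keep originals to restore later
sys.stdout = open(logfilen, "w")
sys.stderr = open(logfileErrors, "w")

print("this line is written to the timestamped logfile, not the terminal")

# close to flush buffered output, then restore the original streams
sys.stdout.close()
sys.stderr.close()
sys.stdout, sys.stderr = orig_stdout, orig_stderr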
@@ -169,19 +171,20 @@ print("---")
 accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
 alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
 alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields
-accounts.append(alt_accounts)
+accounts.extend(alt_accounts)
 
 # Print accounts to be scraped
 print("Accounts to be scraped:")
 for i, acc in enumerate(accounts): # print 5 accounts per line
     print(f"{acc:^17}", end = "") # twitter handle max length = 15 chars
     if i % 5 == 4:
         print("\n")
-print("\n---")
+print("\n")
+print(f"\n{i} accounts in total.\n---")
 
 ## Scraping
 timeStartScrape = datetime.now()
 print("Starting scraping at:")
-print(timeStartScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeStartScrape.strftime(fTimeFormat))
 print("---")
 
 # Iterate over each Twitter account using multiprocessing
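Note (not part of the commit): the append -> extend change matters because list.append adds alt_accounts as one nested list element, while list.extend adds its items individually. A small illustration with placeholder handles:

accounts = ["handle_a", "handle_b"]
alt_accounts = ["alt_handle_a"]

appended = list(accounts)
appended.append(alt_accounts)   # ['handle_a', 'handle_b', ['alt_handle_a']] - nested list, breaks iteration over handles
extended = list(accounts)
extended.extend(alt_accounts)   # ['handle_a', 'handle_b', 'alt_handle_a'] - flat list of handles

print(appended)
print(extended)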
@@ -195,7 +198,7 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
         # ... code to prepare the slice_data ...
 
         # Schedule the scraping task
-        task = executor.submit(scrapeTweets, handle, slice_data, keywords, td)
+        task = executor.submit(scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns)
         tasks.append(task)
 
     # Wait for all tasks to complete
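Note (not part of the commit): a self-contained sketch of the submit-and-wait pattern used here, with a stub standing in for scrapeTweets and placeholder slice data; the real call also passes keywords, td and tweetDFColumns from the surrounding script:

import concurrent.futures

def scrape_stub(handle, slice_data):          # stand-in for scrapeTweets
    return f"{handle}{slice_data['suffix']}"

handles = ["handle_a", "handle_b"]
time_slices = [{"beg_time": "2020-01-01", "end_time": "2020-07-01", "suffix": "-slice1"}]

tasks = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    for handle in handles:
        for slice_data in time_slices:
            # Schedule one scraping task per handle and time slice
            tasks.append(executor.submit(scrape_stub, handle, slice_data))
    # Wait for all tasks to complete and collect results (exceptions would surface here)
    for task in concurrent.futures.as_completed(tasks):
        print(task.result())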
@@ -204,7 +207,7 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
 timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeEndScrape.strftime(fTimeFormat))
 
 ## Merge CSV-Files to file_alltweets.
 # fastest way is to save the slices separately and then add every file to the output instead of using pandas or anything else.
@@ -213,16 +216,18 @@ os.chdir(path_to_tweetdfs)
 tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv")
 AllFilesList = []
 for handle in accounts:
-    for suffix in time_slices:
-        AllFilesList.append("Tweets-{handle}{suffix}.csv")
-with open(f"{logfile}missing-{timeStartScrape}") as fout:
+    for tslice in time_slices:
+        suffix = tslice['suffix']
+        AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
+with open(f"{logfile}missing-"+timeStartScrape.strftime(fTimeFormat)+".txt", "w") as fout:
     for file in AllFilesList:
         if file not in tweetfiles:
-            fout.write('Missing: {file}.\n') # if file is not in tweetfiles, print error message.
+            fout.write(f'Missing: {file}.\n') # if file is not in tweetfiles, print error message.
 # check if file_alltweets (previously scraped tweets that have been merged into one file) exists; if it does, remove it from the list so it is not included in the following merge
 if file_alltweets in tweetfiles:
     tweetfiles.remove(file_alltweets)
 # Go through all csv files and merge them into file_alltweets
 if len(tweetfiles) > 0:
     with open(file_alltweets, "wb") as fout:
         # first file (because of the header):
         with open(tweetfiles[0], "rb") as f:
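Note (not part of the commit): this block is the "filechecks" addition — expected per-handle, per-slice filenames are compared against the CSVs actually on disk, missing ones are logged, and the remaining files are merged so that only the first file contributes its header row. A hedged sketch of that logic with hypothetical file names:

import glob

expected = ["Tweets-handle_a-slice1.csv", "Tweets-handle_a-slice2.csv"]  # built like AllFilesList
file_alltweets = "ALL-TWEETS.csv"                                        # hypothetical merged output name
tweetfiles = glob.glob("*.csv")

with open("missing-files.log", "w") as fout:           # stands in for the timestamped log path above
    for name in expected:
        if name not in tweetfiles:
            fout.write(f"Missing: {name}.\n")

if file_alltweets in tweetfiles:
    tweetfiles.remove(file_alltweets)                  # never merge the output file into itself

if len(tweetfiles) > 0:
    with open(file_alltweets, "wb") as fout:
        with open(tweetfiles[0], "rb") as f:
            fout.write(f.read())                       # first file: keep its header row
        for name in tweetfiles[1:]:
            with open(name, "rb") as f:
                next(f)                                # skip the header line of every later file
                fout.write(f.read())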
@@ -237,7 +242,7 @@ os.chdir(wd)
 timeEndMerge = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndMerge.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeEndMerge.strftime(fTimeFormat))
 print("---")
 tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
 tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
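Note (not part of the commit): convertTime is not shown in this diff; a plausible helper, assuming it splits a timedelta into whole hours, minutes and seconds:

from datetime import timedelta

def convertTime(delta: timedelta):
    total = int(delta.total_seconds())
    hours, rem = divmod(total, 3600)
    minutes, seconds = divmod(rem, 60)
    return hours, minutes, seconds

print(convertTime(timedelta(hours=1, minutes=2, seconds=3)))  # (1, 2, 3)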
@@ -1,7 +1,9 @@
-def scrapeTweets(handle, slice_data, keywords, td, maxTweets = 5000):
-    from datetime import datetime
-    currentTime = datetime.now
+def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5000):
+    from datetime import datetime, time
+    import pandas as pd
+    import snscrape.modules.twitter as sntwitter
+
+    currentTime = datetime.now
     ts_beg = slice_data['beg_time']
     ts_end = slice_data['end_time']
     suffix = slice_data['suffix']
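Note (not part of the commit): tweetDFColumns is defined outside this diff; a hedged sketch of how such a column list could drive DataFrame construction from scraped tweet objects. The FakeTweet stand-in and its attribute names are illustrative only and do not reflect the actual snscrape tweet fields used here:

import pandas as pd

tweetDFColumns = ["id", "date", "content"]          # assumed shape: a list of attribute names

class FakeTweet:                                     # stand-in for an snscrape tweet object
    def __init__(self, id, date, content):
        self.id, self.date, self.content = id, date, content

tweets = [FakeTweet(1, "2021-01-01", "hello"), FakeTweet(2, "2021-01-02", "world")]
rows = [[getattr(t, col) for col in tweetDFColumns] for t in tweets]
tweet_df = pd.DataFrame(rows, columns=tweetDFColumns)
print(tweet_df)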
@@ -41,4 +43,4 @@ def scrapeTweets(handle, slice_data, keywords, td, maxTweets = 5000):
     # save short csv
     tweet_df.to_csv(csv_path, encoding='utf-8')
     # sleep 1 second to not get blocked because of excessive requests
-    # time.sleep(0.5)
+    time.sleep(0.5)