adds filechecks
parent fb7a70cf66
commit 1b43b295ce
collect.py (51 changed lines)
@@ -80,12 +80,14 @@ ts_beg = "2020-01-01T00:00:00Z" # start of scraping
 ts_end = "2023-01-03T00:00:00Z" # end of straping
 no_slices = 24 # Number of slices / time periods.
 
+# file time format
+fTimeFormat = "%Y-%m-%d_%H-%M-%S"
+
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 
 # Name of logfile
-logfile = "log/log_"
+logfile = wd+"log/log_"
 
 
 ## Install snscrape from local git repo to make shure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
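
The two additions above centralize the timestamp format in fTimeFormat and anchor the log prefix to the working directory wd instead of a relative path. A minimal sketch of how the pieces combine into a log file name (the wd value here is a placeholder, not taken from the repo):

from datetime import datetime

wd = "/home/user/twitter-scraper/"    # placeholder working directory
fTimeFormat = "%Y-%m-%d_%H-%M-%S"     # file time format from the diff
logfile = wd + "log/log_"             # log prefix, now absolute instead of relative

# e.g. /home/user/twitter-scraper/log/log_2023-01-15_12-30-00.txt
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
print(logfilen)
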
@@ -139,8 +141,8 @@ from funs.ClearDupes import deDupe
 from funs.Scrape import scrapeTweets
 
 # create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt"
-logfileErrors = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_err" + ".txt"
+logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
+logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
 sys.stderr = open(logfileErrors, "w")
 sys.stdout = open(logfilen, "w")
 
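
The block above swaps the hard-coded timestamp string for fTimeFormat and keeps redirecting all console output into the timestamped logs. A standalone sketch of the same pattern, with placeholder paths and assuming the log/ directory exists; reassigning sys.stdout and sys.stderr affects every later print and traceback in the process:

import sys
from datetime import datetime

fTimeFormat = "%Y-%m-%d_%H-%M-%S"
logfile = "log/log_"                    # placeholder prefix; the script uses wd + "log/log_"

logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"

sys.stderr = open(logfileErrors, "w")   # uncaught exceptions and warnings land here
sys.stdout = open(logfilen, "w")        # print() output lands here

print("this line is written to the log file, not the terminal")

contextlib.redirect_stdout offers a scoped alternative when the redirection should not stay global for the whole run.
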
@@ -169,19 +171,20 @@ print("---")
 accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
 alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
 alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields
-accounts.append(alt_accounts)
+accounts.extend(alt_accounts)
 
 # Print accounts to be scraped
 print("Accounts to be scraped:")
 for i, acc in enumerate(accounts): # print 5 accounts per line
     print(f"{acc:^17}", end = "") # twitter handle max length = 15 chars
     if i % 5 == 4:
-        print "\n"
-print("\n---")
+        print("\n")
+print(f"\n{i} accounts in total.\n---")
 
 ## Scraping
 timeStartScrape = datetime.now()
 print("Starting scraping at:")
-print(timeStartScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeStartScrape.strftime(fTimeFormat))
 print("---")
 
 # Iterate over each Twitter account using multiprocessing
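
The append → extend change above is the substantive fix in this hunk: list.append adds alt_accounts as one nested element, so the later per-handle loop would receive a list object instead of a handle string. A short illustration with made-up handles:

accounts = ["handle_a", "handle_b"]      # made-up handles
alt_accounts = ["alt_a", "alt_b"]

broken = accounts.copy()
broken.append(alt_accounts)
print(broken)   # ['handle_a', 'handle_b', ['alt_a', 'alt_b']]  <- nested list

fixed = accounts.copy()
fixed.extend(alt_accounts)
print(fixed)    # ['handle_a', 'handle_b', 'alt_a', 'alt_b']
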
@@ -195,7 +198,7 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
         # ... code to prepare the slice_data ...
 
         # Schedule the scraping task
-        task = executor.submit(scrapeTweets, handle, slice_data, keywords, td)
+        task = executor.submit(scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns)
         tasks.append(task)
 
     # Wait for all tasks to complete
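
For context, a self-contained sketch of the submit/collect pattern used above, with a stub standing in for scrapeTweets; the loop structure and the values of keywords, td, and tweetDFColumns are assumptions for illustration, not taken from the repo:

import concurrent.futures

def scrape_stub(handle, slice_data, keywords, td, tweetDFColumns):
    # stand-in for scrapeTweets: returns the CSV name it would write
    return f"Tweets-{handle}{slice_data['suffix']}.csv"

accounts = ["handle_a", "handle_b"]                      # placeholders
time_slices = [{"suffix": "_s1"}, {"suffix": "_s2"}]     # placeholders
keywords, td, tweetDFColumns = [], "data/tweets/", ["id", "date", "content"]

tasks = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    for handle in accounts:
        for slice_data in time_slices:
            task = executor.submit(scrape_stub, handle, slice_data, keywords, td, tweetDFColumns)
            tasks.append(task)
    concurrent.futures.wait(tasks)    # wait for all tasks to complete

print([t.result() for t in tasks])
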
@@ -204,7 +207,7 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
 timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeEndScrape.strftime(fTimeFormat))
 
 ## Merge CSV-Files to file_alltweets.
 # fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
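
The comment above motivates the merge strategy in the next hunk: raw byte concatenation of the slice CSVs rather than parsing them. For comparison, the pandas route it avoids would look roughly like this (file names are placeholders); every file is parsed into a DataFrame first, which tends to be slower and more memory-hungry for many large slices:

import glob
import pandas as pd

file_alltweets = "ALL-TWEETS.csv"          # placeholder output name
tweetfiles = glob.glob("Tweets-*.csv")     # per-slice CSVs

merged = pd.concat((pd.read_csv(f) for f in tweetfiles), ignore_index=True)
merged.to_csv(file_alltweets, index=False)
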
@@ -213,31 +216,33 @@ os.chdir(path_to_tweetdfs)
 tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv")
 AllFilesList = []
 for handle in accounts:
-    for suffix in time_slices:
-        AllFilesList.append("Tweets-{handle}{suffix}.csv")
-with open(f"{logfile}missing-{timeStartScrape}") as fout:
+    for tslice in time_slices:
+        suffix = tslice['suffix']
+        AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
+with open(f"{logfile}missing-"+timeStartScrape.strftime(fTimeFormat)+".txt", "w") as fout:
     for file in AllFilesList:
         if file not in tweetfiles:
-            fout.write('Missing: {file}.\n') # if file is not in tweetfiles, print error message.
+            fout.write(f'Missing: {file}.\n') # if file is not in tweetfiles, print error message.
 # check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
 if file_alltweets in tweetfiles:
     tweetfiles.remove(file_alltweets)
 # Go through all csv files and merge them into file_alltweets
-with open(file_alltweets, "wb") as fout:
-    # first file (because of the header):
-    with open(tweetfiles[0], "rb") as f:
-        fout.write(f.read())
-    # other files without the header:
-    for file in tweetfiles[1:]:
-        with open(file, "rb") as f:
-            next(f) # skip the header
-            fout.write(f.read())
+if len(tweetfiles) > 0:
+    with open(file_alltweets, "wb") as fout:
+        # first file (because of the header):
+        with open(tweetfiles[0], "rb") as f:
+            fout.write(f.read())
+        # other files without the header:
+        for file in tweetfiles[1:]:
+            with open(file, "rb") as f:
+                next(f) # skip the header
+                fout.write(f.read())
 os.chdir(wd)
 
 timeEndMerge = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndMerge.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeEndMerge.strftime(fTimeFormat))
 print("---")
 tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
 tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
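
This hunk carries the commit's title: before merging, the script now builds the list of files it expects (one per handle and time slice), writes any that are absent to a missing-files log, and only opens file_alltweets when there is something to merge. The same check in isolation, with placeholder handles, slices, and paths:

import glob
from datetime import datetime

fTimeFormat = "%Y-%m-%d_%H-%M-%S"
logfile = "log/log_"                                     # placeholder prefix
accounts = ["handle_a", "handle_b"]                      # placeholders
time_slices = [{"suffix": "_s1"}, {"suffix": "_s2"}]     # placeholders

tweetfiles = glob.glob("*.csv")                          # files actually on disk
expected = [f"Tweets-{handle}{ts['suffix']}.csv"
            for handle in accounts for ts in time_slices]

missing_log = logfile + "missing-" + datetime.now().strftime(fTimeFormat) + ".txt"
with open(missing_log, "w") as fout:
    for file in expected:
        if file not in tweetfiles:
            fout.write(f"Missing: {file}.\n")

Wrapping the merge in if len(tweetfiles) > 0: guards against an IndexError on tweetfiles[0] (and an empty output file) when no slice CSVs are present. The final two hunks below modify the scrapeTweets function itself.
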
@@ -1,7 +1,9 @@
-def scrapeTweets(handle, slice_data, keywords, td, maxTweets = 5000):
-    from datetime import datetime
-    currentTime = datetime.now
+def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5000):
+    from datetime import datetime, time
+    import pandas as pd
     import snscrape.modules.twitter as sntwitter
+
+    currentTime = datetime.now
     ts_beg = slice_data['beg_time']
     ts_end = slice_data['end_time']
     suffix = slice_data['suffix']
@@ -41,4 +43,4 @@ def scrapeTweets(handle, slice_data, keywords, td, maxTweets = 5000):
     # save short csv
     tweet_df.to_csv(csv_path, encoding='utf-8')
     # sleep 1 second to not get blocked because of excessive requests
-    # time.sleep(0.5)
+    time.sleep(0.5)
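
One caveat with these last two hunks: time.sleep(0.5) needs the standalone time module, but the import added here is from datetime import time, which binds the datetime.time class (it has no sleep attribute). A minimal sketch of the intended throttling with the module-level import, using a stub in place of the real per-slice scraping:

import time                       # module that actually provides sleep()
from datetime import datetime

def scrape_slice(suffix):
    # stub for the per-slice scraping and CSV export
    print(f"{datetime.now():%H:%M:%S} finished slice {suffix}")

for suffix in ["_s1", "_s2"]:
    scrape_slice(suffix)
    time.sleep(0.5)               # pause between slices to avoid excessive requests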