adds checks & logs
parent 7e8666f094
commit 599202ae4d
collect.py (42 changed lines)
collect.py
@@ -56,19 +56,29 @@ import os
 import pandas as pd
 import glob
 import time
+import sys
+from datetime import datetime
+
 ## Import other files
 from config import *
 import snscrape.modules.twitter as sntwitter
-from funs.TimeSlice import get_Tslices
+from funs.TimeSlice import *
 from funs.ClearDupes import deDupe
 
+# create logfile & log all outputs
+logfilen = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
+logfileErrors = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '_err' + '.txt'
+sys.stderr = open(logfileErrors, 'w')
+sys.stdout = open(logfilen, 'w')
+
 ## Create List of time-period-slices
 time_slices = get_Tslices(ts_beg, ts_end, no_slices)
 # Print slices
 print('Time-period-slices:')
 for slice in time_slices:
     print(slice['suffix'] + ': ' + slice['beg_time'] + ' - ' + slice['end_time'])
+print('---')
 
 
 ## Keywords
 keywords = []
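The heart of this commit is the pair of sys.stdout/sys.stderr reassignments above: every subsequent print() and traceback lands in a timestamped logfile instead of the terminal. A minimal standalone sketch of that pattern, assuming a log/ directory exists (the file name here is hypothetical):

    import sys
    from datetime import datetime

    stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    log = open('log/log_' + stamp + '.txt', 'w')   # assumes log/ exists
    sys.stdout = log                               # print() now writes here
    print('this line lands in the logfile, not the terminal')
    sys.stdout = sys.__stdout__                    # restore the real stdout
    log.close()

One consequence of this design, visible at the end of the diff: the script must close the redirected streams (sys.stdout.close(), sys.stderr.close()) or buffered output can be lost.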
@@ -80,13 +90,23 @@ with open('data/keywords.txt', 'r') as file:
     for line in lines:
         keyword = line.strip()  # Remove the newline character
         keywords.append(keyword)
+print('---')
 
 ## Senator Accounts
 # Get accounts & alt-accounts from Senators-Datafile
 accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
 alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
+print('Accounts to be scraped:')
+print(accounts)
+print(alt_accounts)
+print('---')
 
 ## Scraping
+timeStartScrape = datetime.now()
+print("Starting scraping at:")
+print(timeStartScrape.strftime('%Y-%m-%d_%H-%M-%S'))
+print('---')
+
 # Iterate over each Twitter account
 for handle in accounts:
     # Iterate over each time slice
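The body of this nested loop (one snscrape query per handle per time slice) is unchanged by the commit, so the diff does not show it. For orientation, a typical snscrape search over one handle and one slice looks roughly like the sketch below; the handle, date bounds, and collected fields are hypothetical, and tweet attribute names vary between snscrape versions (newer releases use rawContent instead of content):

    import snscrape.modules.twitter as sntwitter

    maxTweets = 5000  # value from config.py
    query = 'from:example_handle since:2020-01-01 until:2020-01-15'
    tweets = []
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        if i >= maxTweets:  # cap per query, as the config comment describes
            break
        tweets.append([tweet.date, tweet.id, tweet.content])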
@@ -198,6 +218,11 @@ for handle in accounts:
         # sleep 1 second to not get blocked because of excessive requests
         time.sleep(1)
 
+timeEndScrape = datetime.now()
+print("---")
+print("End of scraping at:")
+print(timeEndScrape.strftime('%Y-%m-%d_%H-%M-%S'))
+
 ## Merge CSV-Files to file_alltweets
 # fastest way is to save the slices separately and then add every file to the output instead of using pandas or anything else.
 os.chdir(path_to_tweetdfs)
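The merge comment above describes plain byte concatenation with headers skipped, rather than a pandas concat. A self-contained sketch of that technique, assuming every per-slice CSV shares one header row and at least one file exists (file names here are hypothetical):

    import glob

    file_alltweets = 'alltweets.csv'
    csv_files = [f for f in glob.glob('*.csv') if f != file_alltweets]
    with open(file_alltweets, 'wb') as fout:
        # copy the first file whole, so the output keeps one header row
        with open(csv_files[0], 'rb') as f:
            fout.write(f.read())
        # append the remaining files without their header lines
        for name in csv_files[1:]:
            with open(name, 'rb') as f:
                next(f)  # skip the header
                fout.write(f.read())

Reading and writing in binary mode avoids newline translation and any per-row parsing, which is why this is faster than loading each slice into a DataFrame.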
@@ -217,3 +242,18 @@ with open(file_alltweets,"wb") as fout:
             next(f)  # skip the header
             fout.write(f.read())
 os.chdir(wd)
+
+timeEndMerge = datetime.now()
+print("---")
+print("End of merging at:")
+print(timeEndMerge.strftime('%Y-%m-%d_%H-%M-%S'))
+print("---")
+tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
+tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
+tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
+print(f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds")
+print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
+print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
+
+sys.stdout.close()
+sys.stderr.close()
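convertTime itself is defined elsewhere in the project and is not part of this diff; from its call sites it evidently splits a timedelta into whole hours, minutes, and seconds. A hypothetical reconstruction, labeled as an assumption rather than the project's actual helper:

    from datetime import timedelta

    def convert_time(delta):
        # hypothetical stand-in for convertTime: split a timedelta into
        # whole hours, minutes and seconds
        total = int(delta.total_seconds())
        return total // 3600, (total % 3600) // 60, total % 60

    print(convert_time(timedelta(hours=1, minutes=2, seconds=3)))  # (1, 2, 3)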
config.py
@@ -29,6 +29,9 @@ no_slices = 24 # Number of slices / time periods.
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 
+# Name of logfile
+logfile = 'log/log_'
+
 
 ## Install snscrape from local git repo to make sure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
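For reference, the configuration that collect.py star-imports would now contain roughly the following. Only maxTweets, logfile, no_slices, and their comments are confirmed by this diff; the remaining names and values are assumptions inferred from their use in collect.py:

    # config.py (sketch; values other than maxTweets, logfile and no_slices are hypothetical)
    ts_beg = '2020-01-01'   # assumed format: start of the scraped period
    ts_end = '2020-12-31'   # assumed format: end of the scraped period
    no_slices = 24          # Number of slices / time periods.
    maxTweets = 5000        # Maximum tweets to be scraped by snscrape.
    logfile = 'log/log_'    # prefix; collect.py appends a timestamp and '.txt'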