Compare commits

3 Commits

Author | SHA1 | Date
---|---|---
 | 27746cd886 |
 | 02c3d055bd |
 | dc2e17cc2f |
collect.py (126 changed lines)
@@ -4,9 +4,9 @@ Created on Thu Jun 8 01:08:21 2023
 @author: Michael
 
-
+# https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
 
 Following files are necessary:
     config.py
        Used to configure everything that's needed for this script.
     funs/TimeSlice.py
        Function get_Tslices slices the defined timespan in config.py into N
        slices. Is necessary due to possible blocking of requests by twitter.
@@ -15,6 +15,8 @@ Following files are necessary:
        Function deDupe reads each line of inFile and removes duplicate lines.
        A file outFile is saved without the duplicate lines. Generates
        "keywords.txt".
+    funs/Scrape.py
+       scrapes using snscrape.modules.twitter. See docstring.
     data/keywords-raw.txt
        Contains all keywords that are used to detect whether a tweet contains
        information about Covid19.
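The docstring above points to funs/ClearDupes.py, which this compare does not touch. A minimal sketch of the behaviour it describes (read inFile line by line, drop repeated lines, write the rest to outFile), assuming the function works purely on whole lines; the real implementation is not shown here:

```python
# Hypothetical sketch of funs/ClearDupes.deDupe; not part of this diff.
def deDupe(inFile: str, outFile: str) -> None:
    """Write only the first occurrence of each line of inFile to outFile."""
    seen = set()
    with open(inFile, "r") as fin, open(outFile, "w") as fout:
        for line in fin:
            if line not in seen:
                seen.add(line)
                fout.write(line)
```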
@@ -60,21 +62,30 @@ import sys
 from datetime import datetime
 import concurrent.futures
 
-## Setup directories
+###################
+# Setup directories
 # WD Michael
 wd = "/home/michael/Documents/PS/Data/collectTweets/"
 # WD Server
 # wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
 
+# datafile input directory
+di = "data/IN/"
+
 # Tweet-datafile output directory
-td = "data/tweets/"
+td = "data/OUT/"
 
 # Name of file that all tweets will be written to
 file_alltweets = "ALL-SENATORS-TWEETS.csv"
 
+# don't change this one
 path_to_tweetdfs = wd + td
 
-## Define Timespan
+# Name of logfile
+logfile = wd+"log/log_"
+
+###################
+# Define Timespan & time-format
 # Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
 ts_beg = "2020-01-01T00:00:00Z" # start of scraping
 ts_end = "2023-01-03T00:00:00Z" # end of straping
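The two timestamps use the strftime/strptime pattern named in the comment; a quick standalone sanity check of that format string (not part of the diff):

```python
# Check that the configured timestamps parse with the documented format.
from datetime import datetime

fmt = "%Y-%m-%dT%H:%M:%SZ"
ts_beg = "2020-01-01T00:00:00Z"
print(datetime.strptime(ts_beg, fmt))  # 2020-01-01 00:00:00
```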
@@ -86,10 +97,8 @@ fTimeFormat = "%Y-%m-%d_%H-%M-%S"
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 
-# Name of logfile
-logfile = wd+"log/log_"
-
-## Install snscrape from local git repo to make shure that it fits the used version.
+###################
+# Install snscrape from local git repo to make shure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
 """
 import subprocess
@@ -98,7 +107,9 @@ subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
 os.chdir(wd)
 """
 
-# Columns for tweet dataframe
+# Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
+# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
+# get subparams just like in user where user id can be obtained by user.id
 tweetDFColumns = [
     "id",
     "user.id",
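The new comment hints at how a dotted column name such as "user.id" maps onto nested Tweet attributes. A hedged illustration of that idea; whether funs/Scrape.py resolves the columns exactly this way is not visible in this compare:

```python
# Resolve a dotted column name like "user.id" by walking object attributes.
def resolve(obj, dotted):
    for part in dotted.split("."):
        obj = getattr(obj, part)
    return obj

# Stand-in objects instead of real snscrape Tweet/User instances:
class User:
    id = 42

class Tweet:
    id = 1
    user = User()

print([resolve(Tweet(), col) for col in ["id", "user.id"]])  # [1, 42]
```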
@@ -135,18 +146,22 @@ tweetDFColumns = [
     "source",
 ]
 
-## Import other files
+## Import functions
 from funs.TimeSlice import *
 from funs.ClearDupes import deDupe
 from funs.Scrape import scrapeTweets
 
-# create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
-logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
+###################
+# Create logfile & log all outputs
+# there are three logfile types to be found in /log.
+# should be self explanatory.
+logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".log"
+logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".log"
 sys.stderr = open(logfileErrors, "w")
 sys.stdout = open(logfilen, "w")
 
-## Create List of time-period-slices
+###################
+# Create List of time-period-slices
 time_slices = get_Tslices(ts_beg, ts_end, no_slices)
 # Print slices
 print("Time-period-slices:")
@@ -154,19 +169,22 @@ for slice in time_slices:
     print(slice["suffix"] + ": " + slice["beg_time"] + " - " + slice["end_time"])
 print("---")
 
-## Keywords
+###################
+# Keywords
+# read keywords from a file and write to list.
 keywords = []
 # Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
-deDupe("data/keywords-raw.txt", "data/keywords.txt")
+deDupe(f"{di}keywords-raw.txt", f"{di}keywords.txt")
 # Read the keywords from a file
-with open("data/keywords.txt", "r") as file:
+with open(f"{di}keywords.txt", "r") as file:
     lines = file.readlines()
 for line in lines:
     keyword = line.strip() # Remove the newline character
     keywords.append(keyword)
 print("---")
 
-## Senator Accounts
+###################
+# Senator Accounts
 # Get accounts & alt-accounts from Senators-Datafile
 accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
 alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
@@ -181,52 +199,61 @@ for i, acc in enumerate(accounts): # print 5 accounts per line
         print("\n")
 print(f"\n{i} accounts in total.\n---")
 
-## Scraping
+###################
+# Scraping
+# report time:
 timeStartScrape = datetime.now()
 print("Starting scraping at:")
 print(timeStartScrape.strftime(fTimeFormat))
 print("---")
 
 # Iterate over each Twitter account using multiprocessing
-# with concurrent.futures.ProcessPoolExecutor() as executor:
-#     # List to store the scraping tasks
-#     tasks = []
-#     for handle in accounts:
-#         # Iterate over each time slice
-#         for slice_data in time_slices:
-#             # ... Code to prepare the slice_data ...
-#             # Schedule the scraping task
-#             task = executor.submit(
-#                 scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns
-#             )
-#             # Store the handle and slice_data as attributes of the task
-#     # Wait for all tasks to complete
-#     concurrent.futures.wait(tasks)
+with concurrent.futures.ProcessPoolExecutor() as executor:
+    # List to store the scraping tasks
+    tasks = []
+    for handle in accounts:
+        # Iterate over each time slice
+        for slice_data in time_slices:
+            # ... Code to prepare the slice_data ...
+            # Schedule the scraping task
+            task = executor.submit(
+                scrapeTweets, handle, keywords, td, tweetDFColumns, slice_data['beg_time'], slice_data['end_time'], slice_data['suffix']
+            )
+            # Store the handle and slice_data as attributes of the task
+    # Wait for all tasks to complete
+    concurrent.futures.wait(tasks)
 
+# report time:
 timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
 print(timeEndScrape.strftime(fTimeFormat))
 
-## Merge CSV-Files to file_alltweets.
-# fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
-os.chdir(path_to_tweetdfs)
-# At first check, whether all slices are present.
-tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv")
+###################
+# Merge CSV-Files to file_alltweets.
+# fastest way is to save the slices seperately and then add every file to the
+# output instead of using pandas or anything else.
+os.chdir(path_to_tweetdfs) # change dir to use glob to get list of csv-files in dir
+## At first check, whether all slices are present.
+tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv") ???
+# Create list of all files that should be in the folder:
 AllFilesList = []
 for handle in accounts:
     for tslice in time_slices:
         suffix = tslice['suffix']
         AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
-with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.txt", "w") as fout:
+# report missing files to "log_*_missing.txt"
+with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.log", "w") as fout:
     for file in AllFilesList:
         if file not in tweetfiles:
             fout.write(f'Missing: {file}.\n') # if file is not in tweetfiles, print error message.
         else:
             fout.write('all slices scraped.')
 
-# check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
+## Merge .csv files.
+# check if file_alltweets (previously scraped tweets that have been merged
+# into one file) exists in tweetfiles list, if it exists, remove from list
+# to not include it in the following merge
 if file_alltweets in tweetfiles:
     tweetfiles.remove(file_alltweets)
 # Go through all csv files and merge them into file_alltweets
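The previously commented-out process pool is now active. Note that, as shown in this hunk, nothing is ever appended to `tasks`, so `concurrent.futures.wait(tasks)` waits on an empty list; it is the `with` block's implicit `executor.shutdown(wait=True)` that actually joins the workers. A minimal standalone sketch of the submit-collect-wait pattern the block aims for:

```python
# Standalone sketch: keep each Future so concurrent.futures.wait() has
# something to wait on.
import concurrent.futures

def work(x):
    return x * x

if __name__ == "__main__":
    with concurrent.futures.ProcessPoolExecutor() as executor:
        tasks = []
        for x in range(10):
            tasks.append(executor.submit(work, x))  # keep the Future
        concurrent.futures.wait(tasks)
        print([t.result() for t in tasks])
```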
@@ -240,21 +267,24 @@ if tweetfiles:
         with open(file, "rb") as f:
             next(f) # skip the header
             fout.write(f.read())
-os.chdir(wd)
+os.chdir(wd) # go back to wd
 
+# Report timing info.
 timeEndMerge = datetime.now()
 print("---")
 print("End of scraping at:")
 print(timeEndMerge.strftime(fTimeFormat))
 print("---")
-tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
-tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
-tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
+# calulate times:
+tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # total execution time
+tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) # scraping time
+tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) # merge time
 print(
     f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
 )
 print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
 print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
 
+# close connection to logfiles.
 sys.stdout.close()
 sys.stderr.close()
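Only the inner lines of the merge loop fall inside this hunk; the opening of the `if tweetfiles:` block and of `file_alltweets` sits above it. A self-contained sketch of the header-skipping concatenation the comments describe (the surrounding lines are assumed, not shown in this compare):

```python
# Assumed shape of the merge step: write the first CSV whole (keeping its
# header), then append every further CSV with its header line skipped.
import glob

def merge_csvs(out_path="ALL-SENATORS-TWEETS.csv"):
    files = [f for f in glob.glob("*.csv") if f != out_path]
    if not files:
        return
    with open(out_path, "wb") as fout:
        with open(files[0], "rb") as f:
            fout.write(f.read())   # first file: keep the header
        for path in files[1:]:
            with open(path, "rb") as f:
                next(f)            # skip this file's header row
                fout.write(f.read())
```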
@@ -1,140 +0,0 @@
-Coronavirus
-Koronavirus
-Corona
-CDC
-Wuhancoronavirus
-Wuhanlockdown
-Ncov
-Wuhan
-N95
-Kungflu
-Epidemic
-outbreak
-Sinophobia
-China
-covid-19
-corona virus
-covid
-covid19
-sars-cov-2
-COVIDー19
-COVD
-pandemic
-coronapocalypse
-canceleverything
-Coronials
-SocialDistancingNow
-Social Distancing
-SocialDistancing
-panicbuy
-panic buy
-panicbuying
-panic buying
-14DayQuarantine
-DuringMy14DayQuarantine
-panic shop
-panic shopping
-panicshop
-InMyQuarantineSurvivalKit
-panic-buy
-panic-shop
-coronakindness
-quarantinelife
-chinese virus
-chinesevirus
-stayhomechallenge
-stay home challenge
-sflockdown
-DontBeASpreader
-lockdown
-lock down
-shelteringinplace
-sheltering in place
-staysafestayhome
-stay safe stay home
-trumppandemic
-trump pandemic
-flattenthecurve
-flatten the curve
-china virus
-chinavirus
-quarentinelife
-PPEshortage
-saferathome
-stayathome
-stay at home
-stay home
-stayhome
-GetMePPE
-covidiot
-epitwitter
-pandemie
-wear a mask
-wearamask
-kung flu
-covididiot
-COVID__19
-omicron
-variant
-vaccine
-travel ban
-corona
-coronavirus
-sarscov2
-sars cov2
-sars cov 2
-covid_19
-ncov
-ncov2019
-2019-ncov
-pandemic 2019ncov
-2019ncov
-quarantine
-flattening the curve
-flatteningthecurve
-flattenthecurve
-hand sanitizer
-handsanitizer
-social distancing
-socialdistancing
-work from home
-workfromhome
-working from home
-workingfromhome
-ppe
-n95
-covidiots
-herd immunity
-herdimmunity
-pneumonia
-wuhan virus
-wuhanvirus
-kungflu
-vaccines
-corona vaccine
-corona vaccines
-coronavaccine
-coronavaccines
-face shield
-faceshield
-face shields
-faceshields
-health worker
-healthworker
-health workers
-healthworkers
-stayhomestaysafe
-coronaupdate
-frontlineheroes
-coronawarriors
-homeschool
-homeschooling
-hometasking
-masks4all
-wfh
-wash ur hands
-wash your hands
-washurhands
-washyourhands
-selfisolating
-self isolating
data/tweets/.gitignore (vendored, 24 changed lines)
@@ -1,24 +0,0 @@
-/ALL-SENATORS-LONG-LONG.csv
-/ALL-SENATORS.csv
-/CoryGardner-LONG.csv
-/CoryGardner.csv
-/DavidPerdueGA-LONG.csv
-/DavidPerdueGA.csv
-/DougJones-LONG.csv
-/DougJones.csv
-/KLoeffler-LONG.csv
-/KLoeffler.csv
-/MarthaMcSallyAZ-LONG.csv
-/MarthaMcSallyAZ.csv
-/SenAlexander-LONG.csv
-/SenAlexander.csv
-/SenPatRoberts-LONG.csv
-/SenPatRoberts.csv
-/SenatorEnzi-LONG.csv
-/SenatorEnzi.csv
-/SenatorIsakson-LONG.csv
-/SenatorIsakson.csv
-/SenatorTomUdall-LONG.csv
-/SenatorTomUdall.csv
-/VP-LONG.csv
-/VP.csv
funs/Scrape.py

@@ -3,13 +3,22 @@ import time
 import pandas as pd
 import snscrape.modules.twitter as sntwitter
 
-def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5000):
+def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix, maxTweets = 5000):
+    """Scrapes tweets from a specific account in a specific time span using snscrape.modules.twitter.
+
+    Args:
+        handle (str): twitter handle of account to be scraped
+        keywords (list): list of strings containing the keywords that the tweets shall be searched for
+        td (str): tweet file output path
+        tweetDFColumns (list): Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet
+        ts_beg (str): scrape from ... YYYY-MM-DDTHH:MM:SSZ from datetime: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
+        ts_end (_type_): scrape until ... YYYY-MM-DDTHH:MM:SSZ from datetime: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
+        suffix (str): suffix that shall be added to filename after the handle. Example: "-slice1" of handle "handle" will produce the file "Tweets-handle-slice1.csv"
+        maxTweets (int, optional): Maximum number of tweets to be scraped. Defaults to 5000.
+    """
     i = 0
 
     currentTime = datetime.now()
-    ts_beg = slice_data['beg_time']
-    ts_end = slice_data['end_time']
-    suffix = slice_data['suffix']
     tweetDataFilePath = td + f"Tweets-{handle}{suffix}.csv"
 
     # create empty tweetlist that will be filled with tweets of current sen
funs/TimeSlice.py

@@ -8,6 +8,16 @@ Created on Wed Jun 21 13:58:42 2023
 
 # create slices
 def get_Tslices(ts_beg, ts_end, no_slices):
+    """Splits the time-period between two points in time into #no_slices and returns start and end time of each slice period.
+
+    Args:
+        ts_beg (datetime): Datetime start of overall period to be sliced.
+        ts_end (datetime): Datetime end of overall period to be sliced.
+        no_slices (int): number of slices. 24 e.g. will produce 24 start and end dates each.
+
+    Returns:
+        list[dict[str:datetime|str]]: One dict for each containing 'beg_time' 'end_time' and 'suffix' (e.g. -slice1)
+    """
     from datetime import datetime
     from datetime import timedelta
     ts_beg = datetime.strptime(ts_beg, '%Y-%m-%dT%H:%M:%SZ')
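Only the first lines of the function body are visible in this hunk. A hypothetical completion consistent with the docstring above (equal-width slices, string timestamps, suffixes -slice1 through -sliceN); the real implementation may differ:

```python
# Hypothetical sketch of get_Tslices; the actual body lies mostly outside this hunk.
from datetime import datetime

def get_Tslices_sketch(ts_beg, ts_end, no_slices):
    fmt = '%Y-%m-%dT%H:%M:%SZ'
    beg = datetime.strptime(ts_beg, fmt)
    end = datetime.strptime(ts_end, fmt)
    step = (end - beg) / no_slices
    return [
        {
            "beg_time": (beg + i * step).strftime(fmt),
            "end_time": (beg + (i + 1) * step).strftime(fmt),
            "suffix": f"-slice{i + 1}",
        }
        for i in range(no_slices)
    ]

print(get_Tslices_sketch("2020-01-01T00:00:00Z", "2023-01-03T00:00:00Z", 24)[0])
```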
@@ -25,6 +35,16 @@ def get_Tslices(ts_beg, ts_end, no_slices):
 
 # For log time conversions (seconds to days, hours, minutes)
 def convertTime(duration):
+    """Converts seconds to hours, minutes and seconds.
+
+    Args:
+        duration (int): seconds
+
+    Returns:
+        int: hours
+        int: minutes
+        int: seconds
+    """
     days, seconds = duration.days, duration.seconds
     hours = days * 24 + seconds // 3600
     minutes = (seconds % 3600) // 60
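As collect.py calls it, convertTime actually receives a datetime.timedelta rather than an int of seconds (the body reads duration.days and duration.seconds), so the new docstring's "duration (int): seconds" is slightly off. A small usage sketch; the final seconds step and the return statement lie outside this hunk and are assumed:

```python
# Usage sketch: convertTime-style arithmetic on a timedelta.
from datetime import timedelta

duration = timedelta(days=1, hours=2, minutes=30, seconds=15)
days, seconds = duration.days, duration.seconds
hours = days * 24 + seconds // 3600   # 26
minutes = (seconds % 3600) // 60      # 30
seconds = seconds % 60                # 15 (assumed final step, not shown in the hunk)
print(hours, minutes, seconds)
```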