3 Commits
0.1.0 ... 0.1.2

7 changed files with 111 additions and 216 deletions

View File

@@ -4,9 +4,9 @@ Created on Thu Jun 8 01:08:21 2023
 @author: Michael
+# https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
 Following files are necessary:
+config.py
+    Used to configure everything that's needed for this script.
 funs/TimeSlice.py
     Function get_Tslices slices the defined timespan in config.py into N
     slices. Is necessary due to possible blocking of requests by twitter.
@@ -15,6 +15,8 @@ Following files are necessary:
     Function deDupe reads each line of inFile and removes duplicate lines.
     A file outFile is saved without the duplicate lines. Generates
     "keywords.txt".
+funs/Scrape.py
+    scrapes using snscrape.modules.twitter. See docstring.
 data/keywords-raw.txt
     Contains all keywords that are used to detect whether a tweet contains
     information about Covid19.
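
funs/ClearDupes.py itself is not among the files shown in this diff; based purely on the description above (read inFile line by line, drop duplicate lines, write the rest to outFile and use it as "keywords.txt"), a minimal sketch of deDupe might look like this — the body is an assumption, only the name and the (inFile, outFile) call pattern come from the repo:

def deDupe(inFile, outFile):
    # assumed implementation, not the repository's actual code:
    # keep the first occurrence of every line, drop later duplicates
    seen = set()
    with open(inFile, "r") as fin, open(outFile, "w") as fout:
        for line in fin:
            if line not in seen:
                seen.add(line)
                fout.write(line)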
@@ -60,21 +62,30 @@ import sys
 from datetime import datetime
 import concurrent.futures
-## Setup directories
+###################
+# Setup directories
 # WD Michael
 wd = "/home/michael/Documents/PS/Data/collectTweets/"
 # WD Server
 # wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+# datafile input directory
+di = "data/IN/"
 # Tweet-datafile output directory
-td = "data/tweets/"
+td = "data/OUT/"
 # Name of file that all tweets will be written to
 file_alltweets = "ALL-SENATORS-TWEETS.csv"
+# don't change this one
 path_to_tweetdfs = wd + td
-## Define Timespan
+# Name of logfile
+logfile = wd+"log/log_"
+###################
+# Define Timespan & time-format
 # Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
 ts_beg = "2020-01-01T00:00:00Z" # start of scraping
 ts_end = "2023-01-03T00:00:00Z" # end of scraping
@@ -86,10 +97,8 @@ fTimeFormat = "%Y-%m-%d_%H-%M-%S"
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
-# Name of logfile
-logfile = wd+"log/log_"
-## Install snscrape from local git repo to make sure that it fits the used version.
+###################
+# Install snscrape from local git repo to make sure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
 """
 import subprocess
@@ -98,7 +107,9 @@ subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
 os.chdir(wd)
 """
-# Columns for tweet dataframe
+# Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
+# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
+# sub-parameters use dot notation, e.g. the user id is accessed via user.id
 tweetDFColumns = [
     "id",
     "user.id",
@@ -135,18 +146,22 @@ tweetDFColumns = [
     "source",
 ]
-## Import other files
+## Import functions
 from funs.TimeSlice import *
 from funs.ClearDupes import deDupe
+from funs.Scrape import scrapeTweets
-# create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
-logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
+###################
+# Create logfile & log all outputs
+# there are three logfile types to be found in /log.
+# should be self explanatory.
+logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".log"
+logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".log"
 sys.stderr = open(logfileErrors, "w")
 sys.stdout = open(logfilen, "w")
-## Create List of time-period-slices
+###################
+# Create List of time-period-slices
 time_slices = get_Tslices(ts_beg, ts_end, no_slices)
 # Print slices
 print("Time-period-slices:")
@@ -154,19 +169,22 @@ for slice in time_slices:
     print(slice["suffix"] + ": " + slice["beg_time"] + " - " + slice["end_time"])
 print("---")
-## Keywords
+###################
+# Keywords
+# read keywords from a file and write to list.
 keywords = []
 # Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
-deDupe("data/keywords-raw.txt", "data/keywords.txt")
+deDupe(f"{di}keywords-raw.txt", f"{di}keywords.txt")
 # Read the keywords from a file
-with open("data/keywords.txt", "r") as file:
+with open(f"{di}keywords.txt", "r") as file:
     lines = file.readlines()
     for line in lines:
         keyword = line.strip() # Remove the newline character
         keywords.append(keyword)
 print("---")
-## Senator Accounts
+###################
+# Senator Accounts
 # Get accounts & alt-accounts from Senators-Datafile
 accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
 alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
@@ -181,52 +199,61 @@ for i, acc in enumerate(accounts): # print 5 accounts per line
     print("\n")
 print(f"\n{i} accounts in total.\n---")
-## Scraping
+###################
+# Scraping
+# report time:
 timeStartScrape = datetime.now()
 print("Starting scraping at:")
 print(timeStartScrape.strftime(fTimeFormat))
 print("---")
 # Iterate over each Twitter account using multiprocessing
-# with concurrent.futures.ProcessPoolExecutor() as executor:
-# # List to store the scraping tasks
-# tasks = []
-# for handle in accounts:
-# # Iterate over each time slice
-# for slice_data in time_slices:
-# # ... Code to prepare the slice_data ...
-# # Schedule the scraping task
-# task = executor.submit(
-# scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns
-# )
-# # Store the handle and slice_data as attributes of the task
-# # Wait for all tasks to complete
-# concurrent.futures.wait(tasks)
+with concurrent.futures.ProcessPoolExecutor() as executor:
+    # List to store the scraping tasks
+    tasks = []
+    for handle in accounts:
+        # Iterate over each time slice
+        for slice_data in time_slices:
+            # ... Code to prepare the slice_data ...
+            # Schedule the scraping task
+            task = executor.submit(
+                scrapeTweets, handle, keywords, td, tweetDFColumns, slice_data['beg_time'], slice_data['end_time'], slice_data['suffix']
+            )
+            # Store the handle and slice_data as attributes of the task
+    # Wait for all tasks to complete
+    concurrent.futures.wait(tasks)
+# report time:
 timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
 print(timeEndScrape.strftime(fTimeFormat))
-## Merge CSV-Files to file_alltweets.
-# fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
-os.chdir(path_to_tweetdfs)
-# At first check, whether all slices are present.
-tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv")
+###################
+# Merge CSV-Files to file_alltweets.
+# fastest way is to save the slices separately and then add every file to the
+# output instead of using pandas or anything else.
+os.chdir(path_to_tweetdfs) # change dir to use glob to get list of csv-files in dir
+## At first check, whether all slices are present.
+tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv") ???
+# Create list of all files that should be in the folder:
 AllFilesList = []
 for handle in accounts:
     for tslice in time_slices:
         suffix = tslice['suffix']
         AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
-with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.txt", "w") as fout:
+# report missing files to "log_*_missing.txt"
+with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.log", "w") as fout:
     for file in AllFilesList:
         if file not in tweetfiles:
             fout.write(f'Missing: {file}.\n') # if file is not in tweetfiles, print error message.
         else:
             fout.write('all slices scraped.')
-## Merge .csv files.
-# check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
+# check if file_alltweets (previously scraped tweets that have been merged
+# into one file) exists in tweetfiles list, if it exists, remove from list
+# to not include it in the following merge
 if file_alltweets in tweetfiles:
     tweetfiles.remove(file_alltweets)
 # Go through all csv files and merge them into file_alltweets
@@ -240,21 +267,24 @@ if tweetfiles:
         with open(file, "rb") as f:
             next(f) # skip the header
             fout.write(f.read())
-os.chdir(wd)
+os.chdir(wd) # go back to wd
+# Report timing info.
 timeEndMerge = datetime.now()
 print("---")
 print("End of scraping at:")
 print(timeEndMerge.strftime(fTimeFormat))
 print("---")
-tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
-tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
-tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
+# calculate times:
+tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # total execution time
+tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) # scraping time
+tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) # merge time
 print(
     f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
 )
 print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
 print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
+# close connection to logfiles.
 sys.stdout.close()
 sys.stderr.close()

View File

@@ -1,140 +0,0 @@
Coronavirus
Koronavirus
Corona
CDC
Wuhancoronavirus
Wuhanlockdown
Ncov
Wuhan
N95
Kungflu
Epidemic
outbreak
Sinophobia
China
covid-19
corona virus
covid
covid19
sars-cov-2
COVIDー19
COVD
pandemic
coronapocalypse
canceleverything
Coronials
SocialDistancingNow
Social Distancing
SocialDistancing
panicbuy
panic buy
panicbuying
panic buying
14DayQuarantine
DuringMy14DayQuarantine
panic shop
panic shopping
panicshop
InMyQuarantineSurvivalKit
panic-buy
panic-shop
coronakindness
quarantinelife
chinese virus
chinesevirus
stayhomechallenge
stay home challenge
sflockdown
DontBeASpreader
lockdown
lock down
shelteringinplace
sheltering in place
staysafestayhome
stay safe stay home
trumppandemic
trump pandemic
flattenthecurve
flatten the curve
china virus
chinavirus
quarentinelife
PPEshortage
saferathome
stayathome
stay at home
stay home
stayhome
GetMePPE
covidiot
epitwitter
pandemie
wear a mask
wearamask
kung flu
covididiot
COVID__19
omicron
variant
vaccine
travel ban
corona
coronavirus
sarscov2
sars cov2
sars cov 2
covid_19
ncov
ncov2019
2019-ncov
pandemic 2019ncov
2019ncov
quarantine
flattening the curve
flatteningthecurve
flattenthecurve
hand sanitizer
handsanitizer
social distancing
socialdistancing
work from home
workfromhome
working from home
workingfromhome
ppe
n95
covidiots
herd immunity
herdimmunity
pneumonia
wuhan virus
wuhanvirus
kungflu
vaccines
corona vaccine
corona vaccines
coronavaccine
coronavaccines
face shield
faceshield
face shields
faceshields
health worker
healthworker
health workers
healthworkers
stayhomestaysafe
coronaupdate
frontlineheroes
coronawarriors
homeschool
homeschooling
hometasking
masks4all
wfh
wash ur hands
wash your hands
washurhands
washyourhands
selfisolating
self isolating

View File

@@ -1,24 +0,0 @@
/ALL-SENATORS-LONG-LONG.csv
/ALL-SENATORS.csv
/CoryGardner-LONG.csv
/CoryGardner.csv
/DavidPerdueGA-LONG.csv
/DavidPerdueGA.csv
/DougJones-LONG.csv
/DougJones.csv
/KLoeffler-LONG.csv
/KLoeffler.csv
/MarthaMcSallyAZ-LONG.csv
/MarthaMcSallyAZ.csv
/SenAlexander-LONG.csv
/SenAlexander.csv
/SenPatRoberts-LONG.csv
/SenPatRoberts.csv
/SenatorEnzi-LONG.csv
/SenatorEnzi.csv
/SenatorIsakson-LONG.csv
/SenatorIsakson.csv
/SenatorTomUdall-LONG.csv
/SenatorTomUdall.csv
/VP-LONG.csv
/VP.csv

View File

@@ -3,13 +3,22 @@ import time
 import pandas as pd
 import snscrape.modules.twitter as sntwitter
-def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5000):
+def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix, maxTweets = 5000):
+    """Scrapes tweets from a specific account in a specific time span using snscrape.modules.twitter.
+
+    Args:
+        handle (str): twitter handle of the account to be scraped
+        keywords (list): list of strings containing the keywords that the tweets shall be searched for
+        td (str): tweet file output path
+        tweetDFColumns (list): columns for the tweet dataframe; attributes of snscrape.modules.twitter.Tweet
+        ts_beg (str): scrape from ... in the format %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
+        ts_end (str): scrape until ... in the format %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
+        suffix (str): suffix added to the filename after the handle. Example: suffix "-slice1" with handle "handle" produces the file "Tweets-handle-slice1.csv"
+        maxTweets (int, optional): maximum number of tweets to be scraped. Defaults to 5000.
+    """
     i = 0
     currentTime = datetime.now()
-    ts_beg = slice_data['beg_time']
-    ts_end = slice_data['end_time']
-    suffix = slice_data['suffix']
     tweetDataFilePath = td + f"Tweets-{handle}{suffix}.csv"
     # create empty tweetlist that will be filled with tweets of current sen
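
For reference, a single direct call with the new signature would look roughly like this; the handle, time span and suffix are illustrative values, and the main script passes the same arguments via executor.submit:

scrapeTweets(
    "SenAlexander",            # handle (taken from the senators file list above)
    keywords,                  # list of keyword strings
    "data/OUT/",               # td: output directory
    tweetDFColumns,            # columns / Tweet attributes
    "2020-01-01T00:00:00Z",    # ts_beg
    "2020-02-15T00:00:00Z",    # ts_end
    "-slice1",                 # suffix
)
# expected output file: data/OUT/Tweets-SenAlexander-slice1.csv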

View File

@@ -8,6 +8,16 @@ Created on Wed Jun 21 13:58:42 2023
 # create slices
 def get_Tslices(ts_beg, ts_end, no_slices):
+    """Splits the time period between two points in time into no_slices slices and returns start and end time of each slice period.
+
+    Args:
+        ts_beg (str): start of the overall period to be sliced, format %Y-%m-%dT%H:%M:%SZ
+        ts_end (str): end of the overall period to be sliced, format %Y-%m-%dT%H:%M:%SZ
+        no_slices (int): number of slices; e.g. 24 will produce 24 start and end dates each.
+
+    Returns:
+        list[dict[str, str]]: one dict per slice containing 'beg_time', 'end_time' and 'suffix' (e.g. "-slice1")
+    """
     from datetime import datetime
     from datetime import timedelta
     ts_beg = datetime.strptime(ts_beg, '%Y-%m-%dT%H:%M:%SZ')
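
Going by the docstring and by how the main script consumes the result, a call and its return value might look like the following; the exact timestamp formatting of 'beg_time' and 'end_time' is an assumption, not shown in this diff:

time_slices = get_Tslices("2020-01-01T00:00:00Z", "2023-01-03T00:00:00Z", 24)
# time_slices[0] would then be something like:
# {'beg_time': '2020-01-01T00:00:00Z', 'end_time': '2020-02-15T18:30:00Z', 'suffix': '-slice1'}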
@@ -25,6 +35,16 @@ def get_Tslices(ts_beg, ts_end, no_slices):
 # For log time conversions (seconds to days, hours, minutes)
 def convertTime(duration):
+    """Converts a duration to hours, minutes and seconds.
+
+    Args:
+        duration (datetime.timedelta): duration to be converted
+
+    Returns:
+        int: hours
+        int: minutes
+        int: seconds
+    """
     days, seconds = duration.days, duration.seconds
     hours = days * 24 + seconds // 3600
     minutes = (seconds % 3600) // 60
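
convertTime receives a timedelta in the main script (e.g. timeEndMerge - timeStartScrape) and its result is unpacked into three values there, so a usage example looks like:

from datetime import timedelta

hours, minutes, seconds = convertTime(timedelta(days=1, seconds=3725))
# 1 day + 3725 s -> 25 hours, 2 minutes, 5 seconds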