3 Commits
0.1.0 ... 0.1.2

7 changed files with 111 additions and 216 deletions

View File

@@ -4,9 +4,9 @@ Created on Thu Jun 8 01:08:21 2023
@author: Michael
# https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
Following files are necessary:
config.py
Used to configure everything that's needed for this script.
funs/TimeSlice.py
Function get_Tslices slices the timespan defined in config.py into N
slices. This is necessary because Twitter may otherwise block requests.
@@ -15,6 +15,8 @@ Following files are necessary:
Function deDupe reads each line of inFile, removes duplicate lines, and
saves the result to outFile. Generates "keywords.txt".
funs/Scrape.py
scrapes using snscrape.modules.twitter. See docstring.
data/keywords-raw.txt
Contains all keywords that are used to detect whether a tweet contains
information about Covid19.
@@ -60,21 +62,30 @@ import sys
from datetime import datetime
import concurrent.futures
## Setup directories
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
td = "data/tweets/"
td = "data/OUT/"
# Name of file that all tweets will be written to
file_alltweets = "ALL-SENATORS-TWEETS.csv"
# don't change this one
path_to_tweetdfs = wd + td
## Define Timespan
# Name of logfile
logfile = wd+"log/log_"
###################
# Define Timespan & time-format
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = "2020-01-01T00:00:00Z" # start of scraping
ts_end = "2023-01-03T00:00:00Z" # end of scraping
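# For reference: funs/TimeSlice.get_Tslices parses these strings with
# datetime.strptime(ts_beg, '%Y-%m-%dT%H:%M:%SZ'), so both timestamps must match
# the format above exactly.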
@@ -86,10 +97,8 @@ fTimeFormat = "%Y-%m-%d_%H-%M-%S"
# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000
# Name of logfile
logfile = wd+"log/log_"
## Install snscrape from local git repo to make sure that it fits the used version.
###################
# Install snscrape from the local git repo to make sure that it fits the version used.
# If snscrape is already installed, uncomment the following lines:
"""
import subprocess
@@ -98,7 +107,9 @@ subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)
"""
# Columns for tweet dataframe
# Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
# Nested attributes are addressed with dotted names, e.g. the user id is obtained via user.id.
tweetDFColumns = [
"id",
"user.id",
@@ -135,18 +146,22 @@ tweetDFColumns = [
"source",
]
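# Illustrative sketch only (an assumption about how funs/Scrape.py consumes this
# list, not code taken from it): a dotted column name such as "user.id" can be
# resolved from a Tweet object by chained attribute access, e.g.
#   value = tweet
#   for part in "user.id".split("."):
#       value = getattr(value, part)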
## Import other files
## Import functions
from funs.TimeSlice import *
from funs.ClearDupes import deDupe
from funs.Scrape import scrapeTweets
# create logfile & log all outputs
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
###################
# Create logfile & log all outputs
# There are three logfile types to be found in /log;
# they should be self-explanatory.
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".log"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".log"
sys.stderr = open(logfileErrors, "w")
sys.stdout = open(logfilen, "w")
## Create List of time-period-slices
###################
# Create List of time-period-slices
time_slices = get_Tslices(ts_beg, ts_end, no_slices)
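# Each slice is a dict with the keys 'beg_time', 'end_time' and 'suffix'
# (e.g. '-slice1'); see the docstring of funs/TimeSlice.get_Tslices.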
# Print slices
print("Time-period-slices:")
@@ -154,19 +169,22 @@ for slice in time_slices:
print(slice["suffix"] + ": " + slice["beg_time"] + " - " + slice["end_time"])
print("---")
## Keywords
###################
# Keywords
# Read keywords from a file and write them to a list.
keywords = []
# Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
deDupe("data/keywords-raw.txt", "data/keywords.txt")
deDupe(f"{di}keywords-raw.txt", f"{di}keywords.txt")
# Read the keywords from a file
with open("data/keywords.txt", "r") as file:
with open(f"{di}keywords.txt", "r") as file:
lines = file.readlines()
for line in lines:
keyword = line.strip() # Remove the newline character
keywords.append(keyword)
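# (keywords.txt is expected to hold one keyword per line; blank lines would be
# appended as empty strings, so the file should not contain any.)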
print("---")
## Senator Accounts
###################
# Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
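# (data/senators-raw.csv is expected to provide at least the columns
# "twitter_handle" and "alt_handle".)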
@@ -181,52 +199,61 @@ for i, acc in enumerate(accounts): # print 5 accounts per line
print("\n")
print(f"\n{i} accounts in total.\n---")
## Scraping
###################
# Scraping
# report time:
timeStartScrape = datetime.now()
print("Starting scraping at:")
print(timeStartScrape.strftime(fTimeFormat))
print("---")
# Iterate over each Twitter account using multiprocessing
# with concurrent.futures.ProcessPoolExecutor() as executor:
# # List to store the scraping tasks
# tasks = []
# for handle in accounts:
# # Iterate over each time slice
# for slice_data in time_slices:
# # ... Code to prepare the slice_data ...
# # Schedule the scraping task
# task = executor.submit(
# scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns
# )
# # Store the handle and slice_data as attributes of the task
# # Wait for all tasks to complete
# concurrent.futures.wait(tasks)
with concurrent.futures.ProcessPoolExecutor() as executor:
# List to store the scraping tasks
tasks = []
for handle in accounts:
# Iterate over each time slice
for slice_data in time_slices:
# ... Code to prepare the slice_data ...
# Schedule the scraping task
task = executor.submit(
scrapeTweets, handle, keywords, td, tweetDFColumns, slice_data['beg_time'], slice_data['end_time'], slice_data['suffix']
)
            tasks.append(task)  # keep the future so it can be waited on below
    # Wait for all tasks to complete
    concurrent.futures.wait(tasks)
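    # Note: the executor's context manager also blocks on exit until all
    # submitted futures have finished (shutdown(wait=True)), so the explicit
    # wait() above mainly makes the synchronisation point visible.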
# report time:
timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime(fTimeFormat))
## Merge CSV-Files to file_alltweets.
# fastest way is to save the slices separately and then add every file to the output instead of using pandas or anything else.
os.chdir(path_to_tweetdfs)
# First, check whether all slices are present.
tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv")
###################
# Merge CSV-Files to file_alltweets.
# The fastest way is to save the slices separately and then append every file to
# the output instead of using pandas or anything else.
os.chdir(path_to_tweetdfs)  # change dir so glob can list the csv-files in that dir
## First, check whether all slices are present.
tweetfiles = glob.glob("*.csv")  # get list of all csv files in the folder
# Create list of all files that should be in the folder:
AllFilesList = []
for handle in accounts:
for tslice in time_slices:
suffix = tslice['suffix']
AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.txt", "w") as fout:
AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
# report missing files to "log_*_missing.log"
with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.log", "w") as fout:
    missing = [file for file in AllFilesList if file not in tweetfiles]
    if missing:
        for file in missing:
            fout.write(f'Missing: {file}.\n')  # log every expected slice file that was not found
    else:
        fout.write('All slices scraped.\n')
# check if file_alltweets (previously scraped tweets that have been merged into one file) exists; if it does, remove it from the list so it is not included in the following merge
## Merge .csv files.
# Check whether file_alltweets (previously scraped tweets that have been merged
# into one file) is in the tweetfiles list; if it is, remove it so it is not
# included in the following merge.
if file_alltweets in tweetfiles:
tweetfiles.remove(file_alltweets)
# Go through all csv files and merge them into file_alltweets
@@ -240,21 +267,24 @@ if tweetfiles:
with open(file, "rb") as f:
next(f) # skip the header
fout.write(f.read())
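# Note on the merge above: next(f) advances past the first line (the csv header)
# of each slice file before its remaining bytes are appended, so file_alltweets
# should end up with a single header row (assuming the header is written once
# before the per-file loop).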
os.chdir(wd)
os.chdir(wd) # go back to wd
# Report timing info.
timeEndMerge = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndMerge.strftime(fTimeFormat))
print("---")
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
# calculate times:
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # total execution time
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) # scraping time
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) # merge time
print(
f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
)
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
# close the logfiles.
sys.stdout.close()
sys.stderr.close()

View File

@@ -1,140 +0,0 @@
Coronavirus
Koronavirus
Corona
CDC
Wuhancoronavirus
Wuhanlockdown
Ncov
Wuhan
N95
Kungflu
Epidemic
outbreak
Sinophobia
China
covid-19
corona virus
covid
covid19
sars-cov-2
COVIDー19
COVD
pandemic
coronapocalypse
canceleverything
Coronials
SocialDistancingNow
Social Distancing
SocialDistancing
panicbuy
panic buy
panicbuying
panic buying
14DayQuarantine
DuringMy14DayQuarantine
panic shop
panic shopping
panicshop
InMyQuarantineSurvivalKit
panic-buy
panic-shop
coronakindness
quarantinelife
chinese virus
chinesevirus
stayhomechallenge
stay home challenge
sflockdown
DontBeASpreader
lockdown
lock down
shelteringinplace
sheltering in place
staysafestayhome
stay safe stay home
trumppandemic
trump pandemic
flattenthecurve
flatten the curve
china virus
chinavirus
quarentinelife
PPEshortage
saferathome
stayathome
stay at home
stay home
stayhome
GetMePPE
covidiot
epitwitter
pandemie
wear a mask
wearamask
kung flu
covididiot
COVID__19
omicron
variant
vaccine
travel ban
corona
coronavirus
sarscov2
sars cov2
sars cov 2
covid_19
ncov
ncov2019
2019-ncov
pandemic 2019ncov
2019ncov
quarantine
flattening the curve
flatteningthecurve
flattenthecurve
hand sanitizer
handsanitizer
social distancing
socialdistancing
work from home
workfromhome
working from home
workingfromhome
ppe
n95
covidiots
herd immunity
herdimmunity
pneumonia
wuhan virus
wuhanvirus
kungflu
vaccines
corona vaccine
corona vaccines
coronavaccine
coronavaccines
face shield
faceshield
face shields
faceshields
health worker
healthworker
health workers
healthworkers
stayhomestaysafe
coronaupdate
frontlineheroes
coronawarriors
homeschool
homeschooling
hometasking
masks4all
wfh
wash ur hands
wash your hands
washurhands
washyourhands
selfisolating
self isolating

View File

@@ -1,24 +0,0 @@
/ALL-SENATORS-LONG-LONG.csv
/ALL-SENATORS.csv
/CoryGardner-LONG.csv
/CoryGardner.csv
/DavidPerdueGA-LONG.csv
/DavidPerdueGA.csv
/DougJones-LONG.csv
/DougJones.csv
/KLoeffler-LONG.csv
/KLoeffler.csv
/MarthaMcSallyAZ-LONG.csv
/MarthaMcSallyAZ.csv
/SenAlexander-LONG.csv
/SenAlexander.csv
/SenPatRoberts-LONG.csv
/SenPatRoberts.csv
/SenatorEnzi-LONG.csv
/SenatorEnzi.csv
/SenatorIsakson-LONG.csv
/SenatorIsakson.csv
/SenatorTomUdall-LONG.csv
/SenatorTomUdall.csv
/VP-LONG.csv
/VP.csv

View File

@@ -3,13 +3,22 @@ import time
import pandas as pd
import snscrape.modules.twitter as sntwitter
def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5000):
def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix, maxTweets = 5000):
"""Scrapes tweets from a specific account in a specific time span using snscrape.modules.twitter.
Args:
handle (str): twitter handle of account to be scraped
keywords (list): list of strings containing the keywords that the tweets shall be searched for
td (str): tweet file output path
tweetDFColumns (list): Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet
        ts_beg (str): scrape from this point in time; format YYYY-MM-DDTHH:MM:SSZ, i.e. %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
        ts_end (str): scrape until this point in time; same format as ts_beg
        suffix (str): suffix added to the filename after the handle. Example: suffix "-slice1" with handle "handle" produces the file "Tweets-handle-slice1.csv"
maxTweets (int, optional): Maximum number of tweets to be scraped. Defaults to 5000.
"""
i = 0
currentTime = datetime.now()
ts_beg = slice_data['beg_time']
ts_end = slice_data['end_time']
suffix = slice_data['suffix']
tweetDataFilePath = td + f"Tweets-{handle}{suffix}.csv"
    # create empty tweet list that will be filled with the tweets of the current senator
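    # Illustrative sketch of a typical snscrape search loop (the query operators
    # "from:"/"since:"/"until:" used here are an assumption, not taken from this
    # function):
    #   query = f"from:{handle} since:{ts_beg} until:{ts_end}"
    #   for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
    #       if i >= maxTweets:
    #           break
    #       ...  # collect the values listed in tweetDFColumns for this tweet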

View File

@@ -8,6 +8,16 @@ Created on Wed Jun 21 13:58:42 2023
# create slices
def get_Tslices(ts_beg, ts_end, no_slices):
"""Splits the time-period between two points in time into #no_slices and returns start and end time of each slice period.
Args:
        ts_beg (str): start of the overall period to be sliced, format %Y-%m-%dT%H:%M:%SZ.
        ts_end (str): end of the overall period to be sliced, same format.
        no_slices (int): number of slices; e.g. 24 produces 24 start and end dates.
    Returns:
        list[dict[str, str]]: one dict per slice with the keys 'beg_time', 'end_time' and 'suffix' (e.g. '-slice1').
"""
from datetime import datetime
from datetime import timedelta
ts_beg = datetime.strptime(ts_beg, '%Y-%m-%dT%H:%M:%SZ')
@@ -25,6 +35,16 @@ def get_Tslices(ts_beg, ts_end, no_slices):
# For log time conversions (seconds to days, hours, minutes)
def convertTime(duration):
"""Converts seconds to hours, minutes and seconds.
Args:
duration (int): seconds
Returns:
int: hours
int: minutes
int: seconds
"""
days, seconds = duration.days, duration.seconds
hours = days * 24 + seconds // 3600
minutes = (seconds % 3600) // 60