diff --git a/collect.py b/collect.py
index a8e7db9..57466b8 100644
--- a/collect.py
+++ b/collect.py
@@ -4,9 +4,12 @@ Created on Thu Jun 8 01:08:21 2023
 @author: Michael
 
+collect.py scrapes tweets from US senators who were in office between
+2020 and the beginning of 2023.
+
 # https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
-Following files are necessary:
+# Following files are necessary:
     funs/TimeSlice.py
         Function get_Tslices slices the defined timespan in config.py into N slices.
         Is necessary due to possible blocking of requests by twitter.
@@ -17,26 +20,32 @@ Following files are necessary:
         "keywords.txt".
     funs/Scrape.py
         scrapes using snscrape.modules.twitter. See docstring.
-    data/keywords-raw.txt
+    data/IN/keywords-raw.txt
         Contains all keywords that are used to detect whether a tweet contains
         information about Covid19.
-    data/senators-raw.csv
+    data/IN/senators-raw.csv
         Contains the senator dataset converted to csv. Is used to get the
         account-names of all senators twitter accounts.
 
-Requirements:
+# Requirements:
 - snscrape 0.6.2.20230321+
 - pandas 2.0+
 
-The script will first import needed libraries.
+# IMPORTANT: This script uses snscrape version 0.6.2.20230321.dev50+g0d824ab, which is included in 'snscrape/' as a git repository for better reproducibility. Earlier versions of snscrape will most likely fail to scrape all tweets because of rate limits or other errors.
 
-config.py will check whether snscrape is already installed. If not, it will try
-to install the included version automatically.
+Install snscrape from the local git repo to make sure it matches the version used.
+If snscrape shall be installed from the local repo, uncomment the following lines:
 
-How to use:
-- To run the script, first adjust the config.py file.
+import subprocess
+os.chdir('snscrape/')
+subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
+os.chdir(wd)
+
+
+# How to use:
+- To run the script, first adjust the options found in the following lines.
 - config.py will check whether snscrape is already installed. If not, it will try to install the included version automatically.
 - run the script
 
@@ -97,16 +106,6 @@ fTimeFormat = "%Y-%m-%d_%H-%M-%S"
 
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
-###################
-# Install snscrape from local git repo to make shure that it fits the used version.
-# If snscrape is already installed, uncomment the following lines:
-"""
-import subprocess
-os.chdir('snscrape/')
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
-os.chdir(wd)
-"""
-
 # Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
 # https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
 # get subparams just like in user where user id can be obtained by user.id
@@ -146,6 +145,10 @@ tweetDFColumns = [
     "source",
 ]
 
+#############################################################################
+################## do NOT change anything below this line ###################
+#############################################################################
+
 ## Import functions
 from funs.TimeSlice import *
 from funs.ClearDupes import deDupe
@@ -186,8 +189,8 @@ print("---")
 
 ###################
 # Senator Accounts
 # Get accounts & alt-accounts from Senators-Datafile
-accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
-alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
+accounts = pd.read_csv(f"{di}senators-raw.csv")["twitter_handle"].tolist()
+alt_accounts = pd.read_csv(f"{di}senators-raw.csv")["alt_handle"].tolist()
 alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields
 accounts.extend(alt_accounts)
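Note (editor): the pip-install block that this patch moves into the module docstring assumes that os, sys, and a working-directory variable wd are already defined in collect.py. A minimal self-contained sketch of the same step, assuming the bundled snscrape checkout sits in a snscrape/ folder next to the script, could look like this:

    import os
    import subprocess
    import sys

    # Directory of collect.py; the pinned snscrape checkout is assumed to live
    # in a sibling folder named "snscrape/".
    wd = os.path.dirname(os.path.abspath(__file__))

    # Install the bundled snscrape in editable mode so the scraper runs against
    # exactly the pinned version (0.6.2.20230321.dev50+g0d824ab).
    os.chdir(os.path.join(wd, "snscrape"))
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", "."])
    os.chdir(wd)

Running this once installs the pinned checkout into the active environment, taking precedence over any snscrape version installed from PyPI.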
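Note (editor): the docstring describes funs/TimeSlice.py and its get_Tslices function, but the diff does not show its signature. A hypothetical sketch of such a slicer follows; only the function name and its purpose (splitting the configured timespan into N windows to work around request blocking) come from the docstring, while the parameters and return format are assumptions:

    from datetime import datetime

    def get_Tslices(ts_beg: str, ts_end: str, no_slices: int):
        """Split the span [ts_beg, ts_end] into no_slices equal windows.

        Parameters and return format are illustrative assumptions; only the
        function name is taken from the docstring above.
        """
        beg = datetime.fromisoformat(ts_beg)
        end = datetime.fromisoformat(ts_end)
        step = (end - beg) / no_slices
        return [
            {
                "beg_time": (beg + i * step).isoformat(),
                "end_time": (beg + (i + 1) * step).isoformat(),
                "suffix": f"-slice{i + 1}",
            }
            for i in range(no_slices)
        ]

    # Example: six windows covering the 2020 to early-2023 timespan mentioned above.
    time_slices = get_Tslices("2020-01-01", "2023-01-03", 6)

Scraping each window in a separate snscrape run keeps individual requests small, which is the blocking workaround the docstring refers to.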