corrects comments
This commit is contained in:
parent
791cebc297
commit
340cca017c
45
collect.py
45
collect.py
@ -4,9 +4,12 @@ Created on Thu Jun 8 01:08:21 2023
|
|||||||
|
|
||||||
@author: Michael
|
@author: Michael
|
||||||
|
|
||||||
|
collect.py scrapes tweets from US senators who were in office between
|
||||||
|
2020 and the beginning of 2023.
|
||||||
|
|
||||||
# https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
|
# https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
|
||||||
|
|
||||||
Following files are necessary:
|
# Following files are necessary:
|
||||||
funs/TimeSlice.py
|
funs/TimeSlice.py
|
||||||
Function get_Tslices slices the defined timespan in config.py into N
|
Function get_Tslices slices the defined timespan in config.py into N
|
||||||
slices. Is necessary due to possible blocking of requests by twitter.
|
slices. Is necessary due to possible blocking of requests by twitter.
|
||||||
@ -17,26 +20,32 @@ Following files are necessary:
|
|||||||
"keywords.txt".
|
"keywords.txt".
|
||||||
funs/Scrape.py
|
funs/Scrape.py
|
||||||
scrapes using snscrape.modules.twitter. See docstring.
|
scrapes using snscrape.modules.twitter. See docstring.
|
||||||
data/keywords-raw.txt
|
data/IN/keywords-raw.txt
|
||||||
Contains all keywords that are used to detect whether a tweet contains
|
Contains all keywords that are used to detect whether a tweet contains
|
||||||
information about Covid19.
|
information about Covid19.
|
||||||
data/senators-raw.csv
|
data/IN/senators-raw.csv
|
||||||
Contains the senator dataset converted to csv. Is used to get the
|
Contains the senator dataset converted to csv. Is used to get the
|
||||||
account-names of all senators twitter accounts.
|
account-names of all senators twitter accounts.
|
||||||
|
|
||||||
Requirements:
|
# Requirements:
|
||||||
- snscrape 0.6.2.20230321+
|
- snscrape 0.6.2.20230321+
|
||||||
- pandas 2.0+
|
- pandas 2.0+
|
||||||
The script will first import needed libraries.
|
# IMPORTANT:
|
||||||
This script uses snscrape Version 0.6.2.20230321.dev50+g0d824ab which is
|
This script uses snscrape Version 0.6.2.20230321.dev50+g0d824ab which is
|
||||||
included in 'snscrape/' as a git repository for better reproducibility. Earlier
|
included in 'snscrape/' as a git repository for better reproducibility. Earlier
|
||||||
versions of snscrape will most likely fail to scrape all tweets because of
|
versions of snscrape will most likely fail to scrape all tweets because of
|
||||||
certain rate limits or other errors that may occur.
|
certain rate limits or other errors that may occur.
|
||||||
config.py will check whether snscrape is already installed. If not, it will try
|
Install snscrape from the local git repo to make sure that it matches the version used.
|
||||||
to install the included version automatically.
|
If snscrape should be installed from the local repo, uncomment the following lines:
|
||||||
|
|
||||||
How to use:
|
import subprocess
|
||||||
- To run the script, first adjust the config.py file.
|
os.chdir('snscrape/')
|
||||||
|
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
|
||||||
|
os.chdir(wd)
|
||||||
|
|
||||||
|
|
||||||
|
# How to use:
|
||||||
|
- To run the script, first adjust the options found in the following lines.
|
||||||
- config.py will check whether snscrape is already installed. If not, it will try
|
- config.py will check whether snscrape is already installed. If not, it will try
|
||||||
to install the included version automatically.
|
to install the included version automatically.
|
||||||
- run the script
|
- run the script
|
||||||
@ -97,16 +106,6 @@ fTimeFormat = "%Y-%m-%d_%H-%M-%S"
|
|||||||
# Maximum tweets to be scraped by snscrape. Can be left untouched.
|
# Maximum tweets to be scraped by snscrape. Can be left untouched.
|
||||||
maxTweets = 5000
|
maxTweets = 5000
|
||||||
|
|
||||||
###################
|
|
||||||
# Install snscrape from the local git repo to make sure that it matches the version used.
|
|
||||||
# If snscrape is already installed, uncomment the following lines:
|
|
||||||
"""
|
|
||||||
import subprocess
|
|
||||||
os.chdir('snscrape/')
|
|
||||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
|
|
||||||
os.chdir(wd)
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
|
# Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
|
||||||
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
|
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
|
||||||
# get subparams just like in user where user id can be obtained by user.id
|
# get subparams just like in user where user id can be obtained by user.id
|
||||||
@ -146,6 +145,10 @@ tweetDFColumns = [
|
|||||||
"source",
|
"source",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
################## do NOT change anything below this line ###################
|
||||||
|
#############################################################################
|
||||||
|
|
||||||
## Import functions
|
## Import functions
|
||||||
from funs.TimeSlice import *
|
from funs.TimeSlice import *
|
||||||
from funs.ClearDupes import deDupe
|
from funs.ClearDupes import deDupe
|
||||||
@ -186,8 +189,8 @@ print("---")
|
|||||||
###################
|
###################
|
||||||
# Senator Accounts
|
# Senator Accounts
|
||||||
# Get accounts & alt-accounts from Senators-Datafile
|
# Get accounts & alt-accounts from Senators-Datafile
|
||||||
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
|
accounts = pd.read_csv(f"{di}senators-raw.csv")["twitter_handle"].tolist()
|
||||||
alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
|
alt_accounts = pd.read_csv(f"{di}senators-raw.csv")["alt_handle"].tolist()
|
||||||
alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields
|
alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields
|
||||||
accounts.extend(alt_accounts)
|
accounts.extend(alt_accounts)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user