corrects comments

2023-06-23 20:59:14 +02:00
parent 791cebc297
commit 340cca017c
1 changed files with 24 additions and 21 deletions
--- a/collect.py
+++ b/collect.py
@@ -4,9 +4,12 @@ Created on Thu Jun  8 01:08:21 2023

@author: Michael

+collect.py scrapes tweets from US senators who were in office between
+2020 and the beginning of 2023.
+
 # https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html

-Following files are necessary:
+# Following files are necessary:
    funs/TimeSlice.py
        Function get_Tslices slices the defined timespan in config.py into N 
        slices. Is necessary due to possible blocking of requests by twitter. 
@@ -17,26 +20,32 @@ Following files are necessary:
        "keywords.txt".
    funs/Scrape.py
        scrapes using snscrape.modules.twitter. See docstring.
-    data/keywords-raw.txt
+    data/IN/keywords-raw.txt
        Contains all keywords that are used to detect whether a tweet contains
        information about Covid19.
-    data/senators-raw.csv
+    data/IN/senators-raw.csv
        Contains the senator dataset converted to csv. Is used to get the 
        account-names of all senators twitter accounts.

-Requirements:
+# Requirements:
    - snscrape 0.6.2.20230321+
    - pandas 2.0+
-The script will first import needed libraries. 
+# IMPORTANT:
 This script uses snscrape Version 0.6.2.20230321.dev50+g0d824ab which is 
 included in 'snscrape/' as a git repository for better reproducibility. Earlier
 versions of snscrape will most likely fail to scrape all tweets because of 
 certain rate limits or other errors that may occur.
-config.py will check whether snscrape is already installed. If not, it will try
-to install the included version automatically.
+Install snscrape from the local git repo to make sure that it matches the version used.
+If snscrape shall be installed from the local repo, uncomment the following lines:

-How to use:
- To run the script, first adjust the config.py file. 
+import subprocess
+os.chdir('snscrape/')
+subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
+os.chdir(wd) 
+
+ 
+# How to use:
+- To run the script, first adjust the options found in the following lines.
 - config.py will check whether snscrape is already installed. If not, it will try
 to install the included version automatically. 
 - run the script
@@ -97,16 +106,6 @@ fTimeFormat = "%Y-%m-%d_%H-%M-%S"
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000

-###################
-# Install snscrape from local git repo to make shure that it fits the used version.
-# If snscrape is already installed, uncomment the following lines:
-""" 
-import subprocess
-os.chdir('snscrape/')
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
-os.chdir(wd) 
-"""
-
 # Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
 # https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
 #   get subparams just like in user where user id can be obtained by user.id 
@@ -146,6 +145,10 @@ tweetDFColumns = [
    "source",
 ]

+#############################################################################
+################## do NOT change anything below this line ###################
+#############################################################################
+
 ## Import functions
 from funs.TimeSlice import *
 from funs.ClearDupes import deDupe
@@ -186,8 +189,8 @@ print("---")
 ###################
 # Senator Accounts
 # Get accounts & alt-accounts from Senators-Datafile
-accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
-alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
+accounts = pd.read_csv(f"{di}senators-raw.csv")["twitter_handle"].tolist()
+alt_accounts = pd.read_csv(f"{di}senators-raw.csv")["alt_handle"].tolist()
 alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields
 accounts.extend(alt_accounts)