corrects comments
This commit is contained in:
		
							
								
								
									
										45
									
								
								collect.py
									
									
									
									
									
								
							
							
						
						
									
										45
									
								
								collect.py
									
									
									
									
									
								
							| @@ -4,9 +4,12 @@ Created on Thu Jun  8 01:08:21 2023 | ||||
|  | ||||
| @author: Michael | ||||
|  | ||||
| collect.py scrapes tweets from senators of the US that were in office between  | ||||
| 2020 and the beginning of 2023. | ||||
|  | ||||
| # https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html | ||||
|  | ||||
| Following files are necessary: | ||||
| # Following files are necessary: | ||||
|     funs/TimeSlice.py | ||||
|         Function get_Tslices slices the defined timespan in config.py into N  | ||||
|         slices. Is necessary due to possible blocking of requests by twitter.  | ||||
| @@ -17,26 +20,32 @@ Following files are necessary: | ||||
|         "keywords.txt". | ||||
|     funs/Scrape.py | ||||
|         scrapes using snscrape.modules.twitter. See docstring. | ||||
|     data/keywords-raw.txt | ||||
|     data/IN/keywords-raw.txt | ||||
|         Contains all keywords that are used to detect whether a tweet contains | ||||
|         information about Covid19. | ||||
|     data/senators-raw.csv | ||||
|     data/IN/senators-raw.csv | ||||
|         Contains the senator dataset converted to csv. Is used to get the  | ||||
|         account-names of all senators twitter accounts. | ||||
|  | ||||
| Requirements: | ||||
| # Requirements: | ||||
|     - snscrape 0.6.2.20230321+ | ||||
|     - pandas 2.0+ | ||||
| The script will first import needed libraries.  | ||||
| # IMPORTANT: | ||||
| This script uses snscrape Version 0.6.2.20230321.dev50+g0d824ab which is  | ||||
| included in 'snscrape/' as a git repository for better reproducibility. Earlier | ||||
| versions of snscrape will most likely fail to scrape all tweets because of  | ||||
| certain rate limits or other errors that may occur. | ||||
| config.py will check whether snscrape is already installed. If not, it will try | ||||
| to install the included version automatically. | ||||
| Install snscrape from local git repo to make sure that it fits the used version. | ||||
| If snscrape shall be installed from the local repo, uncomment the following lines: | ||||
|  | ||||
| How to use: | ||||
| - To run the script, first adjust the config.py file.  | ||||
| import subprocess | ||||
| os.chdir('snscrape/') | ||||
| subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.']) | ||||
| os.chdir(wd)  | ||||
|  | ||||
|   | ||||
| # How to use: | ||||
| - To run the script, first adjust the options found in the following lines. | ||||
| - config.py will check whether snscrape is already installed. If not, it will try | ||||
| to install the included version automatically.  | ||||
| - run the script | ||||
| @@ -97,16 +106,6 @@ fTimeFormat = "%Y-%m-%d_%H-%M-%S" | ||||
| # Maximum tweets to be scraped by snscrape. Can be left untouched. | ||||
| maxTweets = 5000 | ||||
|  | ||||
| ################### | ||||
| # Install snscrape from local git repo to make sure that it fits the used version. | ||||
| # If snscrape is already installed, uncomment the following lines: | ||||
| """  | ||||
| import subprocess | ||||
| os.chdir('snscrape/') | ||||
| subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.']) | ||||
| os.chdir(wd)  | ||||
| """ | ||||
|  | ||||
| # Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet: | ||||
| # https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html | ||||
| #   get subparams just like in user where user id can be obtained by user.id  | ||||
| @@ -146,6 +145,10 @@ tweetDFColumns = [ | ||||
|     "source", | ||||
| ] | ||||
|  | ||||
| ############################################################################# | ||||
| ################## do NOT change anything below this line ################### | ||||
| ############################################################################# | ||||
|  | ||||
| ## Import functions | ||||
| from funs.TimeSlice import * | ||||
| from funs.ClearDupes import deDupe | ||||
| @@ -186,8 +189,8 @@ print("---") | ||||
| ################### | ||||
| # Senator Accounts | ||||
| # Get accounts & alt-accounts from Senators-Datafile | ||||
| accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist() | ||||
| alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist() | ||||
| accounts = pd.read_csv(f"{di}senators-raw.csv")["twitter_handle"].tolist() | ||||
| alt_accounts = pd.read_csv(f"{di}senators-raw.csv")["alt_handle"].tolist() | ||||
| alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields | ||||
| accounts.extend(alt_accounts) | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Michael Beck
					Michael Beck