adds docstrings to functions. adds several comments.
collect.py

@@ -4,6 +4,8 @@ Created on Thu Jun  8 01:08:21 2023

@author: Michael

# https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html

Following files are necessary:
    config.py
        Used to configure everything that's needed for this script.
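
For reference, the Google docstring style described in the linked Napoleon guide (the style used for the docstrings added in this commit) looks roughly like this; the function below is only an illustrative placeholder, not code from the repository:

def build_filename(handle, suffix="-slice1"):
    """Builds the per-slice tweet file name for a handle.

    Args:
        handle (str): twitter handle the file belongs to.
        suffix (str, optional): slice suffix. Defaults to "-slice1".

    Returns:
        str: file name, e.g. "Tweets-handle-slice1.csv".
    """
    return f"Tweets-{handle}{suffix}.csv"
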
@@ -60,7 +62,8 @@ import sys
from datetime import datetime
import concurrent.futures

## Setup directories
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
@@ -72,9 +75,14 @@ td = "data/tweets/"
# Name of file that all tweets will be written to
file_alltweets = "ALL-SENATORS-TWEETS.csv"

# don't change this one
path_to_tweetdfs = wd + td

## Define Timespan
# Name of logfile
logfile = wd+"log/log_"

###################
# Define Timespan & time-format
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
ts_end = "2023-01-03T00:00:00Z"  # end of scraping
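
The timestamp format referenced above can be sanity-checked with datetime.strptime; a minimal sketch (not part of the script):

from datetime import datetime

fmt = "%Y-%m-%dT%H:%M:%SZ"
beg = datetime.strptime("2020-01-01T00:00:00Z", fmt)
end = datetime.strptime("2023-01-03T00:00:00Z", fmt)
print(end - beg)  # 1098 days, 0:00:00
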
@@ -86,10 +94,8 @@ fTimeFormat = "%Y-%m-%d_%H-%M-%S"
# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000

# Name of logfile
logfile = wd+"log/log_"

## Install snscrape from local git repo to make sure that it fits the used version.
###################
# Install snscrape from local git repo to make sure that it fits the used version.
# If snscrape is already installed, uncomment the following lines:
"""
import subprocess
@@ -98,7 +104,9 @@ subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)
"""

# Columns for tweet dataframe
# Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
#   get subparams just like in user where user id can be obtained by user.id
tweetDFColumns = [
    "id",
    "user.id",
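
The dotted column names such as "user.id" are presumably resolved by walking the attributes of the snscrape Tweet object; a hypothetical helper illustrating that idea (getAttrChain is not from the repository):

from functools import reduce

def getAttrChain(obj, dotted_name):
    # "user.id" -> getattr(getattr(tweet, "user"), "id"), returning None if any link is missing
    return reduce(lambda o, attr: getattr(o, attr, None) if o is not None else None,
                  dotted_name.split("."), obj)

# row = [getAttrChain(tweet, col) for col in tweetDFColumns]
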
@@ -135,18 +143,22 @@ tweetDFColumns = [
    "source",
]

## Import other files
## Import functions
from funs.TimeSlice import *
from funs.ClearDupes import deDupe
from funs.Scrape import scrapeTweets

# create logfile & log all outputs
###################
# Create logfile & log all outputs
#   there are three logfile types to be found in /log.
#   should be self-explanatory.
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
sys.stderr = open(logfileErrors, "w")
sys.stdout = open(logfilen, "w")

## Create List of time-period-slices
###################
# Create List of time-period-slices
time_slices = get_Tslices(ts_beg, ts_end, no_slices)
# Print slices
print("Time-period-slices:")
@@ -154,7 +166,9 @@ for slice in time_slices:
    print(slice["suffix"] + ": " + slice["beg_time"] + " - " + slice["end_time"])
print("---")

## Keywords
###################
# Keywords
# read keywords from a file and write to list.
keywords = []
# Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
deDupe("data/keywords-raw.txt", "data/keywords.txt")
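
deDupe lives in funs/ClearDupes.py and its body is not part of this diff; judging from the comment above, it reads the raw keyword file and writes only unique entries back out. A rough, order-preserving sketch of that assumed behaviour:

def deDupe(infile, outfile):
    seen = set()
    with open(infile, "r") as fin, open(outfile, "w") as fout:
        for line in fin:
            keyword = line.strip()
            if keyword and keyword not in seen:
                seen.add(keyword)
                fout.write(keyword + "\n")
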
@@ -166,7 +180,8 @@ with open("data/keywords.txt", "r") as file:
        keywords.append(keyword)
print("---")

## Senator Accounts
###################
# Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
@@ -181,43 +196,50 @@ for i, acc in enumerate(accounts): # print 5 accounts per line
        print("\n")
print(f"\n{i} accounts in total.\n---")

## Scraping
###################
# Scraping
# report time:
timeStartScrape = datetime.now()
print("Starting scraping at:")
print(timeStartScrape.strftime(fTimeFormat))
print("---")

# Iterate over each Twitter account using multiprocessing
# with concurrent.futures.ProcessPoolExecutor() as executor:
#     # List to store the scraping tasks
#     tasks = []
#     for handle in accounts:
#         # Iterate over each time slice
#         for slice_data in time_slices:
#             # ... Code to prepare the slice_data ...
#             # Schedule the scraping task
#             task = executor.submit(
#                 scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns
#             )
#             # Store the handle and slice_data as attributes of the task
#     # Wait for all tasks to complete
#     concurrent.futures.wait(tasks)

with concurrent.futures.ProcessPoolExecutor() as executor:
    # List to store the scraping tasks
    tasks = []
    for handle in accounts:
        # Iterate over each time slice
        for slice_data in time_slices:
            # ... Code to prepare the slice_data ...
            # Schedule the scraping task
            task = executor.submit(
                scrapeTweets, handle, keywords, td, tweetDFColumns, slice_data['beg_time'], slice_data['end_time'], slice_data['suffix']
            )
            # Store the handle and slice_data as attributes of the task
    # Wait for all tasks to complete
    concurrent.futures.wait(tasks)

# report time:
timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime(fTimeFormat))

## Merge CSV-Files to file_alltweets.
# fastest way is to save the slices separately and then add every file to the output instead of using pandas or anything else.
os.chdir(path_to_tweetdfs)
# At first check, whether all slices are present.
tweetfiles = glob.glob("*.csv")  # get list of all csv files in folder - before: "*.{}".format("csv")
###################
# Merge CSV-Files to file_alltweets.
# fastest way is to save the slices separately and then add every file to the
# output instead of using pandas or anything else.
os.chdir(path_to_tweetdfs) # change dir to use glob to get list of csv-files in dir
## At first check, whether all slices are present.
tweetfiles = glob.glob("*.csv")  # get list of all csv files in folder - before: "*.{}".format("csv") ???
# Create list of all files that should be in the folder:
AllFilesList = []
for handle in accounts:
    for tslice in time_slices:
        suffix = tslice['suffix']
        AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
        AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
# report missing files to "log_*_missing.txt"
with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.txt", "w") as fout:
    for file in AllFilesList:
        if file not in tweetfiles:
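
Note that in the executor block above the tasks list is created but never appended to, so concurrent.futures.wait(tasks) waits on an empty list; the with-block itself still blocks until all submitted futures finish. A sketch of a variant that actually collects the futures (not the committed code):

with concurrent.futures.ProcessPoolExecutor() as executor:
    tasks = []
    for handle in accounts:
        for slice_data in time_slices:
            task = executor.submit(
                scrapeTweets, handle, keywords, td, tweetDFColumns,
                slice_data['beg_time'], slice_data['end_time'], slice_data['suffix']
            )
            tasks.append(task)  # keep a handle on each future
    concurrent.futures.wait(tasks)
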
@@ -225,8 +247,10 @@ with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.txt", "w"
        else:
            fout.write('all slices scraped.')


# check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
## Merge .csv files.
# check if file_alltweets (previously scraped tweets that have been merged
# into one file) exists in tweetfiles list, if it exists, remove from list
# to not include it in the following merge
if file_alltweets in tweetfiles:
    tweetfiles.remove(file_alltweets)
# Go through all csv files and merge them into file_alltweets
@@ -240,21 +264,24 @@ if tweetfiles:
            with open(file, "rb") as f:
                next(f)  # skip the header
                fout.write(f.read())
os.chdir(wd)
os.chdir(wd) # go back to wd

# Report timing info.
timeEndMerge = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndMerge.strftime(fTimeFormat))
print("---")
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
# calculate times:
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # total execution time
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) # scraping time
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) # merge time
print(
    f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
)
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")

# close connection to logfiles.
sys.stdout.close()
sys.stderr.close()

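The merge step is only partially visible here (the enclosing "if tweetfiles:" and the opening of file_alltweets fall outside the diff context). A hedged sketch of the full header-plus-append pattern it implements, assuming the first file's header row is reused for the combined CSV:

if tweetfiles:
    with open(file_alltweets, "wb") as fout:
        # copy the first file completely, including its header row
        with open(tweetfiles[0], "rb") as f:
            fout.write(f.read())
        # append the remaining files without their header rows
        for file in tweetfiles[1:]:
            with open(file, "rb") as f:
                next(f)  # skip the header
                fout.write(f.read())
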
funs/Scrape.py

@@ -3,13 +3,22 @@ import time
import pandas as pd
import snscrape.modules.twitter as sntwitter

def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5000):
def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix,  maxTweets = 5000):
    """Scrapes tweets from a specific account in a specific time span using snscrape.modules.twitter.

    Args:
        handle (str): twitter handle of account to be scraped
        keywords (list): list of strings containing the keywords that the tweets shall be searched for
        td (str): tweet file output path
        tweetDFColumns (list): Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet
        ts_beg (str): scrape from ... YYYY-MM-DDTHH:MM:SSZ, datetime format %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
        ts_end (str): scrape until ... YYYY-MM-DDTHH:MM:SSZ, datetime format %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
        suffix (str): suffix that shall be added to the filename after the handle. Example: suffix "-slice1" with handle "handle" will produce the file "Tweets-handle-slice1.csv"
        maxTweets (int, optional): Maximum number of tweets to be scraped. Defaults to 5000.
    """
    i = 0

    currentTime = datetime.now()
    ts_beg = slice_data['beg_time']
    ts_end = slice_data['end_time']
    suffix = slice_data['suffix']
    tweetDataFilePath = td + f"Tweets-{handle}{suffix}.csv"

    # create empty tweetlist that will be filled with tweets of current sen

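Based on the new signature and docstring, a single slice for one account would be scraped roughly like this (illustrative call with a hypothetical handle, mirroring how collect.py submits the function to the executor):

scrapeTweets(
    "exampleSenator",            # hypothetical twitter handle
    keywords,                    # list of keyword strings
    "data/tweets/",              # output directory td
    tweetDFColumns,              # columns for the tweet dataframe
    "2020-01-01T00:00:00Z",      # ts_beg
    "2020-02-15T12:00:00Z",      # ts_end
    "-slice1",                   # suffix -> Tweets-exampleSenator-slice1.csv
)
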
funs/TimeSlice.py

@@ -8,6 +8,16 @@ Created on Wed Jun 21 13:58:42 2023

# create slices
def get_Tslices(ts_beg, ts_end, no_slices):
    """Splits the time-period between two points in time into no_slices slices and returns start and end time of each slice period.

    Args:
        ts_beg (str): start of the overall period to be sliced, format %Y-%m-%dT%H:%M:%SZ.
        ts_end (str): end of the overall period to be sliced, format %Y-%m-%dT%H:%M:%SZ.
        no_slices (int): number of slices. 24, e.g., will produce 24 start and end dates each.

    Returns:
        list[dict]: one dict per slice containing 'beg_time', 'end_time' and 'suffix' (e.g. '-slice1').
    """
    from datetime import datetime
    from datetime import timedelta
    ts_beg = datetime.strptime(ts_beg, '%Y-%m-%dT%H:%M:%SZ')
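
Given the docstring, a call with two slices should return something along these lines (illustrative values; the exact contents of 'beg_time' and 'end_time' depend on the implementation, which is cut off here):

slices = get_Tslices("2020-01-01T00:00:00Z", "2020-01-03T00:00:00Z", 2)
# e.g. [{'beg_time': '2020-01-01T00:00:00Z', 'end_time': '2020-01-02T00:00:00Z', 'suffix': '-slice1'},
#       {'beg_time': '2020-01-02T00:00:00Z', 'end_time': '2020-01-03T00:00:00Z', 'suffix': '-slice2'}]
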
@@ -25,6 +35,16 @@ def get_Tslices(ts_beg, ts_end, no_slices):

# For log time conversions (seconds to days, hours, minutes)
def convertTime(duration):
    """Converts a timedelta duration to hours, minutes and seconds.

    Args:
        duration (timedelta): duration to convert, e.g. the difference of two datetime objects.

    Returns:
        int: hours
        int: minutes
        int: seconds
    """
    days, seconds = duration.days, duration.seconds
    hours = days * 24 + seconds // 3600
    minutes = (seconds % 3600) // 60

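A quick illustration of the conversion, using the lines visible above and assuming the cut-off remainder of the function returns seconds % 60:

from datetime import timedelta

d = timedelta(hours=26, minutes=5, seconds=30)  # e.g. timeEndMerge - timeStartScrape
# normalizes to days=1, seconds=7530 -> hours = 1*24 + 7530//3600 = 26, minutes = (7530 % 3600)//60 = 5
print(convertTime(d))  # expected: (26, 5, 30)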