adds filechecks
collect.py
@@ -80,12 +80,14 @@ ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
ts_end = "2023-01-03T00:00:00Z"  # end of straping
no_slices = 24  # Number of slices / time periods.

+# file time format
+fTimeFormat = "%Y-%m-%d_%H-%M-%S"

# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000

# Name of logfile
-logfile = "log/log_"
+logfile = wd+"log/log_"

## Install snscrape from local git repo to make shure that it fits the used version.
# If snscrape is already installed, uncomment the following lines:
@@ -139,8 +141,8 @@ from funs.ClearDupes import deDupe
from funs.Scrape import scrapeTweets

# create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt"
-logfileErrors = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_err" + ".txt"
+logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
+logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
sys.stderr = open(logfileErrors, "w")
sys.stdout = open(logfilen, "w")
@@ -169,19 +171,20 @@ print("---")
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields
-accounts.append(alt_accounts)
+accounts.extend(alt_accounts)

# Print accounts to be scraped
print("Accounts to be scraped:")
for i, acc in enumerate(accounts): # print 5 accounts per line
    print(f"{acc:^17}", end = "") # twitter handle max length = 15 chars
    if i % 5 == 4:
-        print "\n"
-print("\n---")
+        print("\n")
+print(f"\n{i} accounts in total.\n---")

## Scraping
timeStartScrape = datetime.now()
print("Starting scraping at:")
-print(timeStartScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeStartScrape.strftime(fTimeFormat))
print("---")

# Iterate over each Twitter account using multiprocessing
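A quick standalone illustration (handles are made up) of what the append-to-extend change fixes: append nests the whole alt_accounts list as a single element, while extend adds its items individually:

accounts = ["sen_a", "sen_b"]          # hypothetical handles
alt_accounts = ["sen_a_alt"]

nested = accounts.copy()
nested.append(alt_accounts)            # -> ["sen_a", "sen_b", ["sen_a_alt"]]

flat = accounts.copy()
flat.extend(alt_accounts)              # -> ["sen_a", "sen_b", "sen_a_alt"]

print(nested, flat)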
@@ -195,7 +198,7 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
            # ... code to prepare the slice_data ...

            # Schedule the scraping task
-            task = executor.submit(scrapeTweets, handle, slice_data, keywords, td)
+            task = executor.submit(scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns)
            tasks.append(task)

    # Wait for all tasks to complete
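For context, a minimal sketch of the submit-and-wait pattern this hunk touches, using a stand-in scrape function and made-up slice data rather than the project's real scrapeTweets:

import concurrent.futures

def fake_scrape(handle, slice_data):                     # stand-in for scrapeTweets
    return f"Tweets-{handle}{slice_data['suffix']}.csv"

handles = ["sen_a", "sen_b"]                             # hypothetical handles
time_slices = [{"suffix": "-slice1"}, {"suffix": "-slice2"}]

tasks = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    for handle in handles:
        for slice_data in time_slices:
            tasks.append(executor.submit(fake_scrape, handle, slice_data))
    # wait for all scheduled tasks to finish and collect their results
    results = [t.result() for t in concurrent.futures.as_completed(tasks)]
print(results)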
@@ -204,7 +207,7 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
-print(timeEndScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeEndScrape.strftime(fTimeFormat))

## Merge CSV-Files to file_alltweets.
# fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
@@ -213,31 +216,33 @@ os.chdir(path_to_tweetdfs)
tweetfiles = glob.glob("*.csv")  # get list of all csv files in folder - before: "*.{}".format("csv")
AllFilesList = []
for handle in accounts:
-    for suffix in time_slices:
-        AllFilesList.append("Tweets-{handle}{suffix}.csv")
-with open(f"{logfile}missing-{timeStartScrape}") as fout:
+    for tslice in time_slices:
+        suffix = tslice['suffix']
+        AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
+with open(f"{logfile}missing-"+timeStartScrape.strftime(fTimeFormat)+".txt", "w") as fout:
    for file in AllFilesList:
        if file not in tweetfiles:
-            fout.write('Missing: {file}.\n') # if file is not in tweetfiles, print error message.
+            fout.write(f'Missing: {file}.\n') # if file is not in tweetfiles, print error message.
# check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
if file_alltweets in tweetfiles:
    tweetfiles.remove(file_alltweets)
# Go through all csv files and merge them into file_alltweets
-with open(file_alltweets, "wb") as fout:
-    # first file (because of the header):
-    with open(tweetfiles[0], "rb") as f:
-        fout.write(f.read())
-    # other files without the header:
-    for file in tweetfiles[1:]:
-        with open(file, "rb") as f:
-            next(f)  # skip the header
-            fout.write(f.read())
+if len(tweetfiles) > 0:
+    with open(file_alltweets, "wb") as fout:
+        # first file (because of the header):
+        with open(tweetfiles[0], "rb") as f:
+            fout.write(f.read())
+        # other files without the header:
+        for file in tweetfiles[1:]:
+            with open(file, "rb") as f:
+                next(f)  # skip the header
+                fout.write(f.read())
os.chdir(wd)

timeEndMerge = datetime.now()
print("---")
print("End of scraping at:")
-print(timeEndMerge.strftime("%Y-%m-%d_%H-%M-%S"))
+print(timeEndMerge.strftime(fTimeFormat))
print("---")
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
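The added file checks boil down to two guards, sketched below with made-up file names: log every expected slice file that was never written, and only attempt the merge when at least one csv exists, so an empty run cannot fail on tweetfiles[0]:

import glob

expected = [f"Tweets-{h}{s}.csv" for h in ["sen_a"] for s in ["-slice1", "-slice2"]]  # hypothetical
tweetfiles = glob.glob("*.csv")

# log missing slice files instead of failing silently
with open("missing.txt", "w") as fout:
    for name in expected:
        if name not in tweetfiles:
            fout.write(f"Missing: {name}.\n")

# drop a previously merged output so it is not merged into itself
if "alltweets.csv" in tweetfiles:
    tweetfiles.remove("alltweets.csv")

# merge only when there is something to merge
if len(tweetfiles) > 0:
    with open("alltweets.csv", "wb") as fout:
        with open(tweetfiles[0], "rb") as f:   # first file keeps its header row
            fout.write(f.read())
        for name in tweetfiles[1:]:            # remaining files: skip the header row
            with open(name, "rb") as f:
                next(f)
                fout.write(f.read())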
@@ -1,7 +1,9 @@
-def scrapeTweets(handle, slice_data, keywords, td, maxTweets = 5000):
-    from datetime import datetime
-    currentTime = datetime.now
+def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5000):
+    from datetime import datetime, time
+    import pandas as pd
+    import snscrape.modules.twitter as sntwitter
+
+    currentTime = datetime.now
    ts_beg = slice_data['beg_time']
    ts_end = slice_data['end_time']
    suffix = slice_data['suffix']
@@ -41,4 +43,4 @@ def scrapeTweets(handle, slice_data, keywords, td, maxTweets = 5000):
    # save short csv
    tweet_df.to_csv(csv_path, encoding='utf-8')
    # sleep 1 second to not get blocked because of excessive requests
-    # time.sleep(0.5)
+    time.sleep(0.5)
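Finally, a rough sketch of what the new tweetDFColumns argument is for, with an assumed column list and dummy rows (the project's real schema is defined in collect.py and not shown here):

import pandas as pd

tweetDFColumns = ["id", "date", "user", "rawContent"]   # assumed column names

def build_slice_df(rows, tweetDFColumns):
    # rows: one list per tweet, ordered like tweetDFColumns
    return pd.DataFrame(rows, columns=tweetDFColumns)

rows = [[1, "2021-01-06", "sen_a", "example tweet text"]]
print(build_slice_df(rows, tweetDFColumns))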