From 1a19fd407aaa6c613341293ece43e9ca6af15fd6 Mon Sep 17 00:00:00 2001
From: Michael Beck
Date: Fri, 23 Jun 2023 16:54:57 +0200
Subject: [PATCH] adds alt_accounts check and removes NaNs from alt_accounts.
 Prints accounts to output in a more readable format.

---
 collect.py | 164 ++++++++++++++++++++++++++++------------------------
 1 file changed, 87 insertions(+), 77 deletions(-)

diff --git a/collect.py b/collect.py
index ae030b1..0f70226 100644
--- a/collect.py
+++ b/collect.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-'''
+"""
 Created on Thu Jun 8 01:08:21 2023
 
 @author: Michael
@@ -50,7 +50,7 @@ sliced in 6 time periods (to bypass twitters limitations). It will check
 whether a tweet contains any of the keywords in 'data/keywords.txt' and
 add an indicator in the datafile. It will then join all slices and create
 'ALL-SENATORS.csv' which is the final output.
-'''
+"""
 
 import os
 import pandas as pd
@@ -62,77 +62,76 @@ import concurrent.futures
 
 ## Setup directories
 # WD Michael
-wd = '/home/michael/Documents/PS/Data/collectTweets/'
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
 # WD Server
 # wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
 
 # Tweet-datafile output directory
-td = 'data/tweets/'
+td = "data/tweets/"
 
 # Name of file that all tweets will be written to
-file_alltweets = 'ALL-SENATORS-TWEETS.csv'
+file_alltweets = "ALL-SENATORS-TWEETS.csv"
 
 path_to_tweetdfs = wd + td
 
-## Define Timespan 
+## Define Timespan
 # Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
-ts_beg = '2020-01-01T00:00:00Z' # start of scraping
-ts_end = '2023-01-03T00:00:00Z' # end of straping
-no_slices = 24 # Number of slices / time periods.
+ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
+ts_end = "2023-01-03T00:00:00Z"  # end of scraping
+no_slices = 24  # Number of slices / time periods.
 
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 
 # Name of logfile
-logfile = 'log/log_'
+logfile = "log/log_"
 
 
 ## Install snscrape from local git repo to make shure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
-'''
+"""
 import subprocess
 os.chdir('snscrape/')
 subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
 os.chdir(wd)
-'''
+"""
 
 # Columns for tweet dataframe
 tweetDFColumns = [
-    'id',
-    'user.id',
-    'user.username',
-    'user.verified',
-    'user.created',
-    'user.favouritesCount',
-    'user.followersCount',
-    'user.friendsCount',
-    'user.url',
-    'rawContent',
-    'renderedContent',
-    'cashtags',
-    'coordinates',
-    'hashtags',
-    'inReplyToTweetId',
-    'inReplyToUser',
-    'media',
-    'mentionedUsers',
-    'links',
-    'place',
-    'quotedTweet',
-    'retweetedTweet',
-    'sourceLabel',
-    'sourceUrl',
-    'url',
-    'date',
-    'replyCount',
-    'retweetCount',
-    'likeCount',
-    'quoteCount',
-    'conversationId',
-    'lang',
-    'source']
-
-##
+    "id",
+    "user.id",
+    "user.username",
+    "user.verified",
+    "user.created",
+    "user.favouritesCount",
+    "user.followersCount",
+    "user.friendsCount",
+    "user.url",
+    "rawContent",
+    "renderedContent",
+    "cashtags",
+    "coordinates",
+    "hashtags",
+    "inReplyToTweetId",
+    "inReplyToUser",
+    "media",
+    "mentionedUsers",
+    "links",
+    "place",
+    "quotedTweet",
+    "retweetedTweet",
+    "sourceLabel",
+    "sourceUrl",
+    "url",
+    "date",
+    "replyCount",
+    "retweetCount",
+    "likeCount",
+    "quoteCount",
+    "conversationId",
+    "lang",
+    "source",
+]
 
 ## Import other files
 from funs.TimeSlice import *
@@ -140,99 +139,110 @@ from funs.ClearDupes import deDupe
 from funs.Scrape import scrapeTweets
 
 # create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
-logfileErrors = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '_err' + '.txt'
-sys.stderr = open(logfileErrors, 'w')
-sys.stdout = open(logfilen, 'w')
+logfilen = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt"
+logfileErrors = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_err" + ".txt"
+sys.stderr = open(logfileErrors, "w")
+sys.stdout = open(logfilen, "w")
 
 ## Create List of time-period-slices
 time_slices = get_Tslices(ts_beg, ts_end, no_slices)
 # Print slices
-print('Time-period-slices:')
+print("Time-period-slices:")
 for slice in time_slices:
-    print(slice['suffix'] + ': ' + slice['beg_time'] + ' - ' + slice['end_time'])
-print('---')
+    print(slice["suffix"] + ": " + slice["beg_time"] + " - " + slice["end_time"])
+print("---")
 
 ## Keywords
 keywords = []
 # Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
-deDupe('data/keywords-raw.txt', 'data/keywords.txt')
+deDupe("data/keywords-raw.txt", "data/keywords.txt")
 # Read the keywords from a file
-with open('data/keywords.txt', 'r') as file:
+with open("data/keywords.txt", "r") as file:
     lines = file.readlines()
 for line in lines:
     keyword = line.strip()  # Remove the newline character
    keywords.append(keyword)
-print('---')
+print("---")
 
 ## Senator Accounts
 # Get accounts & alt-accounts from Senators-Datafile
-accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
-alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
-print('Accounts to be scraped:')
-print(accounts)
-print(alt_accounts)
-print('---')
+accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
+alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
+alt_accounts = [x for x in alt_accounts if str(x) != "nan"]  # drop empty alt_handle fields (NaN)
+accounts.extend(alt_accounts)  # extend, not append: keep a flat list of handles
+# Print accounts to be scraped
+print("Accounts to be scraped:")
+for i, acc in enumerate(accounts):  # print 5 accounts per line
+    print(f"{acc:^17}", end="")  # twitter handle max length = 15 chars
+    if i % 5 == 4:
+        print()  # line break after every 5 handles
+print("\n---")
 
 ## Scraping
 timeStartScrape = datetime.now()
 print("Starting scraping at:")
-print(timeStartScrape.strftime('%Y-%m-%d_%H-%M-%S'))
-print('---')
+print(timeStartScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print("---")
 
 # Iterate over each Twitter account using multiprocessing
 with concurrent.futures.ThreadPoolExecutor() as executor:
     # List to store the scraping tasks
     tasks = []
-    
+
     for handle in accounts:
         # Iterate over each time slice
        for slice_data in time_slices:
             # ... code to prepare the slice_data ...
-            
+
             # Schedule the scraping task
            task = executor.submit(scrapeTweets, handle, slice_data, keywords, td)
            tasks.append(task)
-    
+
     # Wait for all tasks to complete
    concurrent.futures.wait(tasks)
 
 timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndScrape.strftime('%Y-%m-%d_%H-%M-%S'))
+print(timeEndScrape.strftime("%Y-%m-%d_%H-%M-%S"))
 
-## Merge CSV-Files to file_alltweets
+## Merge CSV-Files to file_alltweets.
 # fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
 os.chdir(path_to_tweetdfs)
-tweetfiles = glob.glob('*.{}'.format('csv')) # get list of all csv files in folder
+# TODO: check whether all slices are present before merging.
+tweetfiles = glob.glob("*.csv")  # get list of all csv files in folder
+# If file_alltweets (the merged output of a previous run) already exists,
+# drop it from the list so it is not merged into itself.
 if file_alltweets in tweetfiles:
    tweetfiles.remove(file_alltweets)
 # Go through all csv files and merge them into file_alltweets
-with open(file_alltweets,"wb") as fout:
+with open(file_alltweets, "wb") as fout:
     # first file (because of the header):
     with open(tweetfiles[0], "rb") as f:
        fout.write(f.read())
-    # other files without the header: 
+    # other files without the header:
     for file in tweetfiles[1:]:
        with open(file, "rb") as f:
-            next(f) # skip the header
+            next(f)  # skip the header
            fout.write(f.read())
 os.chdir(wd)
 
 timeEndMerge = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndMerge.strftime('%Y-%m-%d_%H-%M-%S'))
+print(timeEndMerge.strftime("%Y-%m-%d_%H-%M-%S"))
 print("---")
 tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
 tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
 tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
-print(f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds")
+print(
+    f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
+)
 print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
 print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
 
 sys.stdout.close()
-sys.stderr.close()
\ No newline at end of file
+sys.stderr.close()
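
For reference, the following is a minimal, self-contained sketch of the handle-list preparation this patch introduces, runnable outside the full script. The file name data/senators-raw.csv and the column names twitter_handle and alt_handle are taken from the patch itself; pd.notna() is used in place of the str(x) != "nan" comparison as an equivalent check. Variable names and the five-per-line layout mirror the patched code but are illustrative only, not the authoritative implementation.

# Sketch only: assumes data/senators-raw.csv with "twitter_handle" and
# "alt_handle" columns, as referenced in the patch.
import pandas as pd

senators = pd.read_csv("data/senators-raw.csv")
accounts = senators["twitter_handle"].dropna().tolist()
# Keep only alt accounts that are actually present (drop NaN cells).
alt_accounts = [x for x in senators["alt_handle"].tolist() if pd.notna(x)]
accounts.extend(alt_accounts)  # extend, not append: keep a flat list of handles

print("Accounts to be scraped:")
for i, acc in enumerate(accounts):
    # Twitter handles are at most 15 characters; center each in a 17-char column.
    print(f"{acc:^17}", end="")
    if i % 5 == 4:
        print()  # start a new row after every five handles
print("\n---")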