Adds an alt_accounts check and removes NaNs from alt_accounts. Prints accounts to the output more readably.
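In short, the commit drops the NaN placeholders that pandas returns for empty alt_handle cells, merges the alternative handles into the main account list, and prints the handles five per line instead of dumping raw Python lists. A minimal standalone sketch of that logic (column and file names taken from the diff below; the rest of collect.py is omitted):

    import pandas as pd

    # Read main and alternative handles from the senators data file
    accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
    alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()

    # Empty alt_handle cells come back as float NaN; drop them before merging
    alt_accounts = [x for x in alt_accounts if str(x) != "nan"]
    accounts.extend(alt_accounts)

    # Print five handles per line, centered in 17-character columns
    print("Accounts to be scraped:")
    for i, acc in enumerate(accounts):
        print(f"{acc:^17}", end="")  # Twitter handles are at most 15 characters
        if i % 5 == 4:
            print()
    print("\n---")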
collect.py: 164 lines changed
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-'''
+"""
 Created on Thu Jun  8 01:08:21 2023
 
 @author: Michael
@@ -50,7 +50,7 @@ sliced in 6 time periods (to bypass twitters limitations). It will check whether
 a tweet contains any of the keywords in 'data/keywords.txt' and add an indicator 
 in the datafile. It will then join all slices and create 'ALL-SENATORS.csv' 
 which is the final output.
-'''
+"""
 
 import os
 import pandas as pd
@@ -62,77 +62,76 @@ import concurrent.futures
 
 ## Setup directories
 # WD Michael
-wd = '/home/michael/Documents/PS/Data/collectTweets/'
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
 # WD Server
 # wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
 
 # Tweet-datafile output directory
-td = 'data/tweets/'
+td = "data/tweets/"
 
 # Name of file that all tweets will be written to
-file_alltweets = 'ALL-SENATORS-TWEETS.csv'
+file_alltweets = "ALL-SENATORS-TWEETS.csv"
 
 path_to_tweetdfs = wd + td
 
-## Define Timespan 
+## Define Timespan
 # Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
-ts_beg = '2020-01-01T00:00:00Z' # start of scraping
-ts_end = '2023-01-03T00:00:00Z' # end of straping
-no_slices = 24 # Number of slices / time periods.
+ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
+ts_end = "2023-01-03T00:00:00Z"  # end of scraping
+no_slices = 24  # Number of slices / time periods.
 
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 
 # Name of logfile
-logfile = 'log/log_'
+logfile = "log/log_"
 
 
 ## Install snscrape from local git repo to make shure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
-'''
+"""
 import subprocess
 os.chdir('snscrape/')
 subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
 os.chdir(wd) 
-'''
+"""
 
 # Columns for tweet dataframe
 tweetDFColumns = [
-            'id', 
-            'user.id', 
-            'user.username',
-            'user.verified',
-            'user.created',
-            'user.favouritesCount',
-            'user.followersCount',
-            'user.friendsCount',
-            'user.url',
-            'rawContent', 
-            'renderedContent', 
-            'cashtags', 
-            'coordinates', 
-            'hashtags', 
-            'inReplyToTweetId', 
-            'inReplyToUser', 
-            'media', 
-            'mentionedUsers', 
-            'links', 
-            'place', 
-            'quotedTweet', 
-            'retweetedTweet', 
-            'sourceLabel', 
-            'sourceUrl', 
-            'url', 
-            'date', 
-            'replyCount', 
-            'retweetCount', 
-            'likeCount', 
-            'quoteCount', 
-            'conversationId', 
-            'lang', 
-            'source']
-##
+    "id",
+    "user.id",
+    "user.username",
+    "user.verified",
+    "user.created",
+    "user.favouritesCount",
+    "user.followersCount",
+    "user.friendsCount",
+    "user.url",
+    "rawContent",
+    "renderedContent",
+    "cashtags",
+    "coordinates",
+    "hashtags",
+    "inReplyToTweetId",
+    "inReplyToUser",
+    "media",
+    "mentionedUsers",
+    "links",
+    "place",
+    "quotedTweet",
+    "retweetedTweet",
+    "sourceLabel",
+    "sourceUrl",
+    "url",
+    "date",
+    "replyCount",
+    "retweetCount",
+    "likeCount",
+    "quoteCount",
+    "conversationId",
+    "lang",
+    "source",
+]
 
 ## Import other files
 from funs.TimeSlice import *
@@ -140,99 +139,110 @@ from funs.ClearDupes import deDupe
 from funs.Scrape import scrapeTweets
 
 # create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
-logfileErrors = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '_err' + '.txt'
-sys.stderr = open(logfileErrors, 'w')
-sys.stdout = open(logfilen, 'w')
+logfilen = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt"
+logfileErrors = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_err" + ".txt"
+sys.stderr = open(logfileErrors, "w")
+sys.stdout = open(logfilen, "w")
 
 ## Create List of time-period-slices
 time_slices = get_Tslices(ts_beg, ts_end, no_slices)
 # Print slices
-print('Time-period-slices:')
+print("Time-period-slices:")
 for slice in time_slices:
-    print(slice['suffix'] + ': ' + slice['beg_time'] + ' - ' + slice['end_time'])
-print('---')
+    print(slice["suffix"] + ": " + slice["beg_time"] + " - " + slice["end_time"])
+print("---")
 
 ## Keywords
 keywords = []
 # Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
-deDupe('data/keywords-raw.txt', 'data/keywords.txt')
+deDupe("data/keywords-raw.txt", "data/keywords.txt")
 # Read the keywords from a file
-with open('data/keywords.txt', 'r') as file:
+with open("data/keywords.txt", "r") as file:
     lines = file.readlines()
     for line in lines:
         keyword = line.strip()  # Remove the newline character
         keywords.append(keyword)
-print('---')
+print("---")
 
 ## Senator Accounts
 # Get accounts & alt-accounts from Senators-Datafile
-accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
-alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
-print('Accounts to be scraped:')
-print(accounts)
-print(alt_accounts)
-print('---')
+accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
+alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
+alt_accounts = [x for x in alt_accounts if str(x) != 'nan']  # remove empty alt_accounts fields
+accounts.extend(alt_accounts)
+# Print accounts to be scraped
+print("Accounts to be scraped:")
+for i, acc in enumerate(accounts):  # print 5 accounts per line
+    print(f"{acc:^17}", end="")  # twitter handle max length = 15 chars
+    if i % 5 == 4:
+        print("\n")
+print("\n---")
 
 ## Scraping
 timeStartScrape = datetime.now()
 print("Starting scraping at:")
-print(timeStartScrape.strftime('%Y-%m-%d_%H-%M-%S'))
-print('---')
+print(timeStartScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print("---")
 
 # Iterate over each Twitter account using multiprocessing
 with concurrent.futures.ThreadPoolExecutor() as executor:
     # List to store the scraping tasks
     tasks = []
-    
+
     for handle in accounts:
         # Iterate over each time slice
         for slice_data in time_slices:
             # ... code to prepare the slice_data ...
-            
+
             # Schedule the scraping task
             task = executor.submit(scrapeTweets, handle, slice_data, keywords, td)
             tasks.append(task)
-    
+
     # Wait for all tasks to complete
     concurrent.futures.wait(tasks)
 
 timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndScrape.strftime('%Y-%m-%d_%H-%M-%S'))
+print(timeEndScrape.strftime("%Y-%m-%d_%H-%M-%S"))
 
-## Merge CSV-Files to file_alltweets
+## Merge CSV-Files to file_alltweets.
 # fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
 os.chdir(path_to_tweetdfs)
-tweetfiles = glob.glob('*.{}'.format('csv')) # get list of all csv files in folder
+# At first check, whether all slices are present.
+tweetfiles = glob.glob("*.csv")  # get list of all csv files in folder - before: "*.{}".format("csv")
+# TODO: loop over handles and tweetfiles here to verify that every slice file exists (see sketch below)
 # check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
 if file_alltweets in tweetfiles:
     tweetfiles.remove(file_alltweets)
 
 # Go through all csv files and merge them into file_alltweets
-with open(file_alltweets,"wb") as fout:
+with open(file_alltweets, "wb") as fout:
     # first file (because of the header):
     with open(tweetfiles[0], "rb") as f:
         fout.write(f.read())
-    # other files without the header:    
+    # other files without the header:
     for file in tweetfiles[1:]:
         with open(file, "rb") as f:
-            next(f) # skip the header
+            next(f)  # skip the header
             fout.write(f.read())
 os.chdir(wd)
 
 timeEndMerge = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndMerge.strftime('%Y-%m-%d_%H-%M-%S'))
+print(timeEndMerge.strftime("%Y-%m-%d_%H-%M-%S"))
 print("---")
 tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
 tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
 tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
-print(f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds")
+print(
+    f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
+)
 print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
 print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
 
 sys.stdout.close()
 sys.stderr.close()
- 
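The new merge section only gestures at the "check whether all slices are present" step (the loop headers in the diff are left unfinished). One possible completion, assuming scrapeTweets() writes one csv per handle and time slice named "<handle>-<suffix>.csv" (the real naming lives in funs/Scrape.py, so the pattern here is hypothetical), and reusing the accounts and time_slices variables defined earlier in collect.py:

    import glob

    # Collect every csv in the tweet output directory
    tweetfiles = glob.glob("*.csv")

    # Check that each handle has one file per time slice before merging
    missing = []
    for handle in accounts:
        for slice_data in time_slices:
            expected = f"{handle}-{slice_data['suffix']}.csv"  # assumed naming scheme
            if expected not in tweetfiles:
                missing.append(expected)

    if missing:
        print(f"{len(missing)} slice files are missing:")
        for name in missing:
            print(name)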