Adds alt_accounts check and removes NaNs from alt_accounts; prints accounts to the output more readably.
This commit is contained in:
parent 5d0c41407e
commit 1a19fd407a

Changed file: collect.py (164)
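The substance of the change is easiest to see in isolation. Below is a minimal standalone sketch of the new account handling; the sample handles are invented, while in collect.py the lists come from data/senators-raw.csv via the twitter_handle and alt_handle columns. It drops the NaN placeholders pandas produces for empty cells and prints the merged handle list five per line. The full diff follows.

import pandas as pd

# Stand-in for data/senators-raw.csv; empty alt_handle cells read in as NaN.
df = pd.DataFrame(
    {
        "twitter_handle": ["SenExampleA", "SenExampleB", "SenExampleC"],
        "alt_handle": ["SenExampleA_alt", float("nan"), "SenExampleC_alt"],
    }
)

accounts = df["twitter_handle"].tolist()
alt_accounts = df["alt_handle"].tolist()

# Drop the NaN placeholders pandas produces for empty cells.
alt_accounts = [x for x in alt_accounts if str(x) != "nan"]
accounts.extend(alt_accounts)  # extend, not append, to keep a flat list of handles

# Print five handles per line, centered in 17-character columns
# (Twitter handles are at most 15 characters long).
for i, acc in enumerate(accounts):
    print(f"{acc:^17}", end="")
    if i % 5 == 4:
        print()
print("\n---")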
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-'''
+"""
 Created on Thu Jun 8 01:08:21 2023
 
 @author: Michael
@@ -50,7 +50,7 @@ sliced in 6 time periods (to bypass twitters limitations). It will check whether
 a tweet contains any of the keywords in 'data/keywords.txt' and add an indicator
 in the datafile. It will then join all slices and create 'ALL-SENATORS.csv'
 which is the final output.
-'''
+"""
 
 import os
 import pandas as pd
@@ -62,77 +62,76 @@ import concurrent.futures
 
 ## Setup directories
 # WD Michael
-wd = '/home/michael/Documents/PS/Data/collectTweets/'
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
 # WD Server
 # wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
 
 # Tweet-datafile output directory
-td = 'data/tweets/'
+td = "data/tweets/"
 
 # Name of file that all tweets will be written to
-file_alltweets = 'ALL-SENATORS-TWEETS.csv'
+file_alltweets = "ALL-SENATORS-TWEETS.csv"
 
 path_to_tweetdfs = wd + td
 
 ## Define Timespan
 # Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
-ts_beg = '2020-01-01T00:00:00Z' # start of scraping
-ts_end = '2023-01-03T00:00:00Z' # end of straping
-no_slices = 24 # Number of slices / time periods.
+ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
+ts_end = "2023-01-03T00:00:00Z"  # end of scraping
+no_slices = 24  # Number of slices / time periods.
 
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 
 # Name of logfile
-logfile = 'log/log_'
+logfile = "log/log_"
 
 
 ## Install snscrape from local git repo to make shure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
-'''
+"""
 import subprocess
 os.chdir('snscrape/')
 subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
 os.chdir(wd)
-'''
+"""
 
 # Columns for tweet dataframe
 tweetDFColumns = [
-    'id',
-    'user.id',
-    'user.username',
-    'user.verified',
-    'user.created',
-    'user.favouritesCount',
-    'user.followersCount',
-    'user.friendsCount',
-    'user.url',
-    'rawContent',
-    'renderedContent',
-    'cashtags',
-    'coordinates',
-    'hashtags',
-    'inReplyToTweetId',
-    'inReplyToUser',
-    'media',
-    'mentionedUsers',
-    'links',
-    'place',
-    'quotedTweet',
-    'retweetedTweet',
-    'sourceLabel',
-    'sourceUrl',
-    'url',
-    'date',
-    'replyCount',
-    'retweetCount',
-    'likeCount',
-    'quoteCount',
-    'conversationId',
-    'lang',
-    'source']
-
-##
+    "id",
+    "user.id",
+    "user.username",
+    "user.verified",
+    "user.created",
+    "user.favouritesCount",
+    "user.followersCount",
+    "user.friendsCount",
+    "user.url",
+    "rawContent",
+    "renderedContent",
+    "cashtags",
+    "coordinates",
+    "hashtags",
+    "inReplyToTweetId",
+    "inReplyToUser",
+    "media",
+    "mentionedUsers",
+    "links",
+    "place",
+    "quotedTweet",
+    "retweetedTweet",
+    "sourceLabel",
+    "sourceUrl",
+    "url",
+    "date",
+    "replyCount",
+    "retweetCount",
+    "likeCount",
+    "quoteCount",
+    "conversationId",
+    "lang",
+    "source",
+]
 
 ## Import other files
 from funs.TimeSlice import *
@@ -140,99 +139,110 @@ from funs.ClearDupes import deDupe
 from funs.Scrape import scrapeTweets
 
 # create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
-logfileErrors = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '_err' + '.txt'
-sys.stderr = open(logfileErrors, 'w')
-sys.stdout = open(logfilen, 'w')
+logfilen = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt"
+logfileErrors = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_err" + ".txt"
+sys.stderr = open(logfileErrors, "w")
+sys.stdout = open(logfilen, "w")
 
 ## Create List of time-period-slices
 time_slices = get_Tslices(ts_beg, ts_end, no_slices)
 # Print slices
-print('Time-period-slices:')
+print("Time-period-slices:")
 for slice in time_slices:
-    print(slice['suffix'] + ': ' + slice['beg_time'] + ' - ' + slice['end_time'])
-print('---')
+    print(slice["suffix"] + ": " + slice["beg_time"] + " - " + slice["end_time"])
+print("---")
 
 ## Keywords
 keywords = []
 # Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
-deDupe('data/keywords-raw.txt', 'data/keywords.txt')
+deDupe("data/keywords-raw.txt", "data/keywords.txt")
 # Read the keywords from a file
-with open('data/keywords.txt', 'r') as file:
+with open("data/keywords.txt", "r") as file:
     lines = file.readlines()
     for line in lines:
         keyword = line.strip()  # Remove the newline character
         keywords.append(keyword)
-print('---')
+print("---")
 
 ## Senator Accounts
 # Get accounts & alt-accounts from Senators-Datafile
-accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
-alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
-print('Accounts to be scraped:')
-print(accounts)
-print(alt_accounts)
-print('---')
+accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
+alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
+alt_accounts = [x for x in alt_accounts if str(x) != 'nan']  # remove empty alt_accounts fields
+accounts.extend(alt_accounts)
+# Print accounts to be scraped
+print("Accounts to be scraped:")
+for i, acc in enumerate(accounts):  # print 5 accounts per line
+    print(f"{acc:^17}", end="")  # twitter handle max length = 15 chars
+    if i % 5 == 4:
+        print("\n")
+print("\n---")
 
 ## Scraping
 timeStartScrape = datetime.now()
 print("Starting scraping at:")
-print(timeStartScrape.strftime('%Y-%m-%d_%H-%M-%S'))
-print('---')
+print(timeStartScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print("---")
 
 # Iterate over each Twitter account using multiprocessing
 with concurrent.futures.ThreadPoolExecutor() as executor:
     # List to store the scraping tasks
     tasks = []
 
     for handle in accounts:
         # Iterate over each time slice
         for slice_data in time_slices:
             # ... code to prepare the slice_data ...
 
             # Schedule the scraping task
             task = executor.submit(scrapeTweets, handle, slice_data, keywords, td)
             tasks.append(task)
 
     # Wait for all tasks to complete
     concurrent.futures.wait(tasks)
 
 timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndScrape.strftime('%Y-%m-%d_%H-%M-%S'))
+print(timeEndScrape.strftime("%Y-%m-%d_%H-%M-%S"))
 
-## Merge CSV-Files to file_alltweets
+## Merge CSV-Files to file_alltweets.
 # fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
 os.chdir(path_to_tweetdfs)
-tweetfiles = glob.glob('*.{}'.format('csv'))  # get list of all csv files in folder
+# First, check whether all slices are present.
+tweetfiles = glob.glob("*.csv")  # get list of all csv files in folder - before: "*.{}".format("csv")
+for handle in accounts:
+    for tweetfile in tweetfiles:
+        pass  # placeholder: slice-completeness check not implemented yet
 
 # check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
 if file_alltweets in tweetfiles:
     tweetfiles.remove(file_alltweets)
 
 # Go through all csv files and merge them into file_alltweets
-with open(file_alltweets,"wb") as fout:
+with open(file_alltweets, "wb") as fout:
     # first file (because of the header):
     with open(tweetfiles[0], "rb") as f:
         fout.write(f.read())
     # other files without the header:
     for file in tweetfiles[1:]:
         with open(file, "rb") as f:
             next(f)  # skip the header
             fout.write(f.read())
 os.chdir(wd)
 
 timeEndMerge = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndMerge.strftime('%Y-%m-%d_%H-%M-%S'))
+print(timeEndMerge.strftime("%Y-%m-%d_%H-%M-%S"))
 print("---")
 tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
 tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
 tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
-print(f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds")
+print(
+    f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
+)
 print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
 print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
 
 sys.stdout.close()
 sys.stderr.close()