Adds an alt_accounts check and removes NaNs from alt_accounts. Prints accounts to the output more readably.
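In short, the commit drops the NaN placeholders that pandas returns for empty alt_handle cells, merges the alternative handles into the main account list, and prints the handles five per line instead of dumping raw Python lists. A minimal standalone sketch of that logic (column and file names taken from the diff below; the rest of collect.py is omitted):

    import pandas as pd

    # Read main and alternative handles from the senators data file
    accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
    alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()

    # Empty alt_handle cells come back as float NaN; drop them before merging
    alt_accounts = [x for x in alt_accounts if str(x) != "nan"]
    accounts.extend(alt_accounts)

    # Print five handles per line, centered in 17-character columns
    print("Accounts to be scraped:")
    for i, acc in enumerate(accounts):
        print(f"{acc:^17}", end="")  # Twitter handles are at most 15 characters
        if i % 5 == 4:
            print()
    print("\n---")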
collect.py: 164 lines changed
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-'''
+"""
 Created on Thu Jun  8 01:08:21 2023
 
 @author: Michael
@@ -50,7 +50,7 @@ sliced in 6 time periods (to bypass twitters limitations). It will check whether
 a tweet contains any of the keywords in 'data/keywords.txt' and add an indicator 
 in the datafile. It will then join all slices and create 'ALL-SENATORS.csv' 
 which is the final output.
-'''
+"""
 
 import os
 import pandas as pd
@@ -62,77 +62,76 @@ import concurrent.futures
 
 ## Setup directories
 # WD Michael
-wd = '/home/michael/Documents/PS/Data/collectTweets/'
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
 # WD Server
 # wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
 
 # Tweet-datafile output directory
-td = 'data/tweets/'
+td = "data/tweets/"
 
 # Name of file that all tweets will be written to
-file_alltweets = 'ALL-SENATORS-TWEETS.csv'
+file_alltweets = "ALL-SENATORS-TWEETS.csv"
 
 path_to_tweetdfs = wd + td
 
-## Define Timespan 
+## Define Timespan
 # Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
-ts_beg = '2020-01-01T00:00:00Z' # start of scraping
-ts_end = '2023-01-03T00:00:00Z' # end of straping
-no_slices = 24 # Number of slices / time periods.
+ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
+ts_end = "2023-01-03T00:00:00Z"  # end of scraping
+no_slices = 24  # Number of slices / time periods.
 
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 
 # Name of logfile
-logfile = 'log/log_'
+logfile = "log/log_"
 
 
 ## Install snscrape from local git repo to make shure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
-'''
+"""
 import subprocess
 os.chdir('snscrape/')
 subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
 os.chdir(wd) 
-'''
+"""
 
 # Columns for tweet dataframe
 tweetDFColumns = [
-            'id', 
-            'user.id', 
-            'user.username',
-            'user.verified',
-            'user.created',
-            'user.favouritesCount',
-            'user.followersCount',
-            'user.friendsCount',
-            'user.url',
-            'rawContent', 
-            'renderedContent', 
-            'cashtags', 
-            'coordinates', 
-            'hashtags', 
-            'inReplyToTweetId', 
-            'inReplyToUser', 
-            'media', 
-            'mentionedUsers', 
-            'links', 
-            'place', 
-            'quotedTweet', 
-            'retweetedTweet', 
-            'sourceLabel', 
-            'sourceUrl', 
-            'url', 
-            'date', 
-            'replyCount', 
-            'retweetCount', 
-            'likeCount', 
-            'quoteCount', 
-            'conversationId', 
-            'lang', 
-            'source']
-##
+    "id",
+    "user.id",
+    "user.username",
+    "user.verified",
+    "user.created",
+    "user.favouritesCount",
+    "user.followersCount",
+    "user.friendsCount",
+    "user.url",
+    "rawContent",
+    "renderedContent",
+    "cashtags",
+    "coordinates",
+    "hashtags",
+    "inReplyToTweetId",
+    "inReplyToUser",
+    "media",
+    "mentionedUsers",
+    "links",
+    "place",
+    "quotedTweet",
+    "retweetedTweet",
+    "sourceLabel",
+    "sourceUrl",
+    "url",
+    "date",
+    "replyCount",
+    "retweetCount",
+    "likeCount",
+    "quoteCount",
+    "conversationId",
+    "lang",
+    "source",
+]
 
 ## Import other files
 from funs.TimeSlice import *
@@ -140,99 +139,110 @@ from funs.ClearDupes import deDupe
 from funs.Scrape import scrapeTweets
 
 # create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
-logfileErrors = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '_err' + '.txt'
-sys.stderr = open(logfileErrors, 'w')
-sys.stdout = open(logfilen, 'w')
+logfilen = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt"
+logfileErrors = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_err" + ".txt"
+sys.stderr = open(logfileErrors, "w")
+sys.stdout = open(logfilen, "w")
 
 ## Create List of time-period-slices
 time_slices = get_Tslices(ts_beg, ts_end, no_slices)
 # Print slices
-print('Time-period-slices:')
+print("Time-period-slices:")
 for slice in time_slices:
-    print(slice['suffix'] + ': ' + slice['beg_time'] + ' - ' + slice['end_time'])
-print('---')
+    print(slice["suffix"] + ": " + slice["beg_time"] + " - " + slice["end_time"])
+print("---")
 
 ## Keywords
 keywords = []
 # Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
-deDupe('data/keywords-raw.txt', 'data/keywords.txt')
+deDupe("data/keywords-raw.txt", "data/keywords.txt")
 # Read the keywords from a file
-with open('data/keywords.txt', 'r') as file:
+with open("data/keywords.txt", "r") as file:
     lines = file.readlines()
     for line in lines:
         keyword = line.strip()  # Remove the newline character
         keywords.append(keyword)
-print('---')
+print("---")
 
 ## Senator Accounts
 # Get accounts & alt-accounts from Senators-Datafile
-accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
-alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
-print('Accounts to be scraped:')
-print(accounts)
-print(alt_accounts)
-print('---')
+accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
+alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
+alt_accounts = [x for x in alt_accounts if str(x) != 'nan']  # remove empty alt_accounts fields
+accounts.extend(alt_accounts)
+# Print accounts to be scraped
+print("Accounts to be scraped:")
+for i, acc in enumerate(accounts):  # print 5 accounts per line
+    print(f"{acc:^17}", end="")  # twitter handle max length = 15 chars
+    if i % 5 == 4:
+        print("\n")
+print("\n---")
 
 ## Scraping
 timeStartScrape = datetime.now()
 print("Starting scraping at:")
-print(timeStartScrape.strftime('%Y-%m-%d_%H-%M-%S'))
-print('---')
+print(timeStartScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print("---")
 
 # Iterate over each Twitter account using multiprocessing
 with concurrent.futures.ThreadPoolExecutor() as executor:
     # List to store the scraping tasks
     tasks = []
-    
+
     for handle in accounts:
         # Iterate over each time slice
         for slice_data in time_slices:
             # ... code to prepare the slice_data ...
-            
+
             # Schedule the scraping task
             task = executor.submit(scrapeTweets, handle, slice_data, keywords, td)
             tasks.append(task)
-    
+
     # Wait for all tasks to complete
     concurrent.futures.wait(tasks)
 
 timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndScrape.strftime('%Y-%m-%d_%H-%M-%S'))
+print(timeEndScrape.strftime("%Y-%m-%d_%H-%M-%S"))
 
-## Merge CSV-Files to file_alltweets
+## Merge CSV-Files to file_alltweets.
 # fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
 os.chdir(path_to_tweetdfs)
-tweetfiles = glob.glob('*.{}'.format('csv')) # get list of all csv files in folder
+# At first check, whether all slices are present.
+tweetfiles = glob.glob("*.csv")  # get list of all csv files in folder - before: "*.{}".format("csv")
+# TODO: loop over handles and tweetfiles here to verify that every slice file exists (see sketch below)
 # check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
 if file_alltweets in tweetfiles:
     tweetfiles.remove(file_alltweets)
 
 # Go through all csv files and merge them into file_alltweets
-with open(file_alltweets,"wb") as fout:
+with open(file_alltweets, "wb") as fout:
     # first file (because of the header):
     with open(tweetfiles[0], "rb") as f:
         fout.write(f.read())
-    # other files without the header:    
+    # other files without the header:
     for file in tweetfiles[1:]:
         with open(file, "rb") as f:
-            next(f) # skip the header
+            next(f)  # skip the header
             fout.write(f.read())
 os.chdir(wd)
 
 timeEndMerge = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndMerge.strftime('%Y-%m-%d_%H-%M-%S'))
+print(timeEndMerge.strftime("%Y-%m-%d_%H-%M-%S"))
 print("---")
 tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
 tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
 tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
-print(f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds")
+print(
+    f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
+)
 print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
 print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
 
 sys.stdout.close()
 sys.stderr.close()
- 
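The new merge section only gestures at the "check whether all slices are present" step (the loop headers in the diff are left unfinished). One possible completion, assuming scrapeTweets() writes one csv per handle and time slice named "<handle>-<suffix>.csv" (the real naming lives in funs/Scrape.py, so the pattern here is hypothetical), and reusing the accounts and time_slices variables defined earlier in collect.py:

    import glob

    # Collect every csv in the tweet output directory
    tweetfiles = glob.glob("*.csv")

    # Check that each handle has one file per time slice before merging
    missing = []
    for handle in accounts:
        for slice_data in time_slices:
            expected = f"{handle}-{slice_data['suffix']}.csv"  # assumed naming scheme
            if expected not in tweetfiles:
                missing.append(expected)

    if missing:
        print(f"{len(missing)} slice files are missing:")
        for name in missing:
            print(name)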