adds alt_accounts check, removes NaN entries from alt_accounts, and prints the accounts to be scraped more neatly.
collect.py (148 changed lines)
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-'''
+"""
 Created on Thu Jun  8 01:08:21 2023
 
 @author: Michael
@@ -50,7 +50,7 @@ sliced in 6 time periods (to bypass twitters limitations). It will check whether
 a tweet contains any of the keywords in 'data/keywords.txt' and add an indicator
 in the datafile. It will then join all slices and create 'ALL-SENATORS.csv'
 which is the final output.
-'''
+"""
 
 import os
 import pandas as pd
@@ -62,77 +62,76 @@ import concurrent.futures
 
 ## Setup directories
 # WD Michael
-wd = '/home/michael/Documents/PS/Data/collectTweets/'
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
 # WD Server
 # wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
 
 # Tweet-datafile output directory
-td = 'data/tweets/'
+td = "data/tweets/"
 
 # Name of file that all tweets will be written to
-file_alltweets = 'ALL-SENATORS-TWEETS.csv'
+file_alltweets = "ALL-SENATORS-TWEETS.csv"
 
 path_to_tweetdfs = wd + td
 
 ## Define Timespan
 # Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
-ts_beg = '2020-01-01T00:00:00Z' # start of scraping
-ts_end = '2023-01-03T00:00:00Z' # end of straping
+ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
+ts_end = "2023-01-03T00:00:00Z"  # end of scraping
 no_slices = 24  # Number of slices / time periods.
 
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 
 # Name of logfile
-logfile = 'log/log_'
+logfile = "log/log_"
 
 
 ## Install snscrape from local git repo to make sure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
-'''
+"""
 import subprocess
 os.chdir('snscrape/')
 subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
 os.chdir(wd)
-'''
+"""
 
 # Columns for tweet dataframe
 tweetDFColumns = [
-            'id',
-            'user.id',
-            'user.username',
-            'user.verified',
-            'user.created',
-            'user.favouritesCount',
-            'user.followersCount',
-            'user.friendsCount',
-            'user.url',
-            'rawContent',
-            'renderedContent',
-            'cashtags',
-            'coordinates',
-            'hashtags',
-            'inReplyToTweetId',
-            'inReplyToUser',
-            'media',
-            'mentionedUsers',
-            'links',
-            'place',
-            'quotedTweet',
-            'retweetedTweet',
-            'sourceLabel',
-            'sourceUrl',
-            'url',
-            'date',
-            'replyCount',
-            'retweetCount',
-            'likeCount',
-            'quoteCount',
-            'conversationId',
-            'lang',
-            'source']
-
-##
+    "id",
+    "user.id",
+    "user.username",
+    "user.verified",
+    "user.created",
+    "user.favouritesCount",
+    "user.followersCount",
+    "user.friendsCount",
+    "user.url",
+    "rawContent",
+    "renderedContent",
+    "cashtags",
+    "coordinates",
+    "hashtags",
+    "inReplyToTweetId",
+    "inReplyToUser",
+    "media",
+    "mentionedUsers",
+    "links",
+    "place",
+    "quotedTweet",
+    "retweetedTweet",
+    "sourceLabel",
+    "sourceUrl",
+    "url",
+    "date",
+    "replyCount",
+    "retweetCount",
+    "likeCount",
+    "quoteCount",
+    "conversationId",
+    "lang",
+    "source",
+]
 
 ## Import other files
 from funs.TimeSlice import *
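Note: the timespan configured above is later split into no_slices windows by get_Tslices (imported from funs.TimeSlice, which is not part of this diff). Purely as an illustration of the shape the rest of the script expects — a list of dicts with "beg_time", "end_time" and "suffix" keys — such a helper could look roughly like the sketch below; the real implementation may differ, and the suffix format is a guess.

from datetime import datetime

def get_time_slices(ts_beg, ts_end, no_slices, fmt="%Y-%m-%dT%H:%M:%SZ"):
    """Illustrative sketch: split [ts_beg, ts_end] into no_slices equal windows."""
    beg = datetime.strptime(ts_beg, fmt)
    end = datetime.strptime(ts_end, fmt)
    step = (end - beg) / no_slices
    slices = []
    for i in range(no_slices):
        slices.append(
            {
                "beg_time": (beg + i * step).strftime(fmt),
                "end_time": (beg + (i + 1) * step).strftime(fmt),
                "suffix": f"-slice{i + 1}",  # hypothetical suffix used in per-slice file names
            }
        )
    return slices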
@@ -140,45 +139,50 @@ from funs.ClearDupes import deDupe
 from funs.Scrape import scrapeTweets
 
 # create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
-logfileErrors = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '_err' + '.txt'
-sys.stderr = open(logfileErrors, 'w')
-sys.stdout = open(logfilen, 'w')
+logfilen = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt"
+logfileErrors = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_err" + ".txt"
+sys.stderr = open(logfileErrors, "w")
+sys.stdout = open(logfilen, "w")
 
 ## Create List of time-period-slices
 time_slices = get_Tslices(ts_beg, ts_end, no_slices)
 # Print slices
-print('Time-period-slices:')
+print("Time-period-slices:")
 for slice in time_slices:
-    print(slice['suffix'] + ': ' + slice['beg_time'] + ' - ' + slice['end_time'])
-print('---')
+    print(slice["suffix"] + ": " + slice["beg_time"] + " - " + slice["end_time"])
+print("---")
 
 ## Keywords
 keywords = []
 # Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
-deDupe('data/keywords-raw.txt', 'data/keywords.txt')
+deDupe("data/keywords-raw.txt", "data/keywords.txt")
 # Read the keywords from a file
-with open('data/keywords.txt', 'r') as file:
+with open("data/keywords.txt", "r") as file:
     lines = file.readlines()
     for line in lines:
        keyword = line.strip()  # Remove the newline character
        keywords.append(keyword)
-print('---')
+print("---")
 
 ## Senator Accounts
 # Get accounts & alt-accounts from Senators-Datafile
-accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
-alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
-print('Accounts to be scraped:')
-print(accounts)
-print(alt_accounts)
-print('---')
+accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
+alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
+alt_accounts = [x for x in alt_accounts if str(x) != 'nan']  # remove empty alt_accounts fields
+accounts.extend(alt_accounts)  # extend keeps the list flat (append would nest the alt list)
+# Print accounts to be scraped
+print("Accounts to be scraped:")
+for i, acc in enumerate(accounts):  # print 5 accounts per line
+    print(f"{acc:^17}", end="")  # twitter handle max length = 15 chars
+    if i % 5 == 4:
+        print("\n")
+print("\n---")
 
 ## Scraping
 timeStartScrape = datetime.now()
 print("Starting scraping at:")
-print(timeStartScrape.strftime('%Y-%m-%d_%H-%M-%S'))
-print('---')
+print(timeStartScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print("---")
 
 # Iterate over each Twitter account using multiprocessing
 with concurrent.futures.ThreadPoolExecutor() as executor:
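Note: the NaN filtering and account printing added in this hunk can also be expressed with pandas' own dropna(), which avoids the str(x) != 'nan' comparison and the manual list flattening. A minimal sketch, assuming data/senators-raw.csv has the twitter_handle and alt_handle columns the script already reads:

import pandas as pd

senators = pd.read_csv("data/senators-raw.csv")
accounts = senators["twitter_handle"].dropna().tolist()
alt_accounts = senators["alt_handle"].dropna().tolist()  # dropna() drops the NaN/empty fields
accounts = accounts + alt_accounts  # one flat list of handles

print("Accounts to be scraped:")
for i, acc in enumerate(accounts):  # five handles per line
    print(f"{acc:^17}", end="")     # handles are at most 15 characters wide
    if i % 5 == 4:
        print()
print("\n---")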
@@ -200,18 +204,22 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
 timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndScrape.strftime('%Y-%m-%d_%H-%M-%S'))
+print(timeEndScrape.strftime("%Y-%m-%d_%H-%M-%S"))
 
-## Merge CSV-Files to file_alltweets
+## Merge CSV-Files to file_alltweets.
 # fastest way is to save the slices separately and then add every file to the output instead of using pandas or anything else.
 os.chdir(path_to_tweetdfs)
-tweetfiles = glob.glob('*.{}'.format('csv')) # get list of all csv files in folder
+# At first check, whether all slices are present.
+tweetfiles = glob.glob("*.csv")  # get list of all csv files in folder - before: "*.{}".format("csv")
+# TODO: loop over the handles and time slices and verify that every expected
+# slice file is contained in tweetfiles before merging.
 # check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
 if file_alltweets in tweetfiles:
     tweetfiles.remove(file_alltweets)
 
 # Go through all csv files and merge them into file_alltweets
-with open(file_alltweets,"wb") as fout:
+with open(file_alltweets, "wb") as fout:
     # first file (because of the header):
     with open(tweetfiles[0], "rb") as f:
         fout.write(f.read())
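Note: the "# At first check, whether all slices are present." comment added above hints at verifying that every account/time-slice combination produced a CSV before the merge. A rough sketch of such a check is shown below; the per-slice file naming is not shown in this diff, so the pattern handle + slice suffix + ".csv" is purely a hypothetical placeholder.

missing = []
for handle in accounts:
    for t_slice in time_slices:
        expected = f"{handle}{t_slice['suffix']}.csv"  # hypothetical file name pattern
        if expected not in tweetfiles:
            missing.append(expected)
if missing:
    print(f"Warning: {len(missing)} expected slice files are missing, e.g. {missing[:5]}")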
@@ -225,12 +233,14 @@ os.chdir(wd)
 timeEndMerge = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndMerge.strftime('%Y-%m-%d_%H-%M-%S'))
+print(timeEndMerge.strftime("%Y-%m-%d_%H-%M-%S"))
 print("---")
 tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
 tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
 tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
-print(f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds")
+print(
+    f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
+)
 print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
 print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
 
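Note: the timing summary in the last hunk relies on a convertTime helper that is imported elsewhere in the script and not shown in this diff. The unpacking into three variables suggests it returns (hours, minutes, seconds) from a datetime.timedelta; a plausible minimal stand-in, under that assumption:

def convert_time(delta):
    """Illustrative stand-in for convertTime: split a timedelta into h, m, s."""
    total_seconds = int(delta.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return hours, minutes, seconds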