Adds alt_accounts check and removes NaNs from alt_accounts; prints accounts to the output more readably.
This commit is contained in:
parent 5d0c41407e
commit 1a19fd407a

Changed file: collect.py (164)
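The substance of the change is easiest to see in isolation. Below is a minimal standalone sketch of the new account handling; the sample handles are invented, while in collect.py the lists come from data/senators-raw.csv via the twitter_handle and alt_handle columns. It drops the NaN placeholders pandas produces for empty cells and prints the merged handle list five per line. The full diff follows.

import pandas as pd

# Stand-in for data/senators-raw.csv; empty alt_handle cells read in as NaN.
df = pd.DataFrame(
    {
        "twitter_handle": ["SenExampleA", "SenExampleB", "SenExampleC"],
        "alt_handle": ["SenExampleA_alt", float("nan"), "SenExampleC_alt"],
    }
)

accounts = df["twitter_handle"].tolist()
alt_accounts = df["alt_handle"].tolist()

# Drop the NaN placeholders pandas produces for empty cells.
alt_accounts = [x for x in alt_accounts if str(x) != "nan"]
accounts.extend(alt_accounts)  # extend, not append, to keep a flat list of handles

# Print five handles per line, centered in 17-character columns
# (Twitter handles are at most 15 characters long).
for i, acc in enumerate(accounts):
    print(f"{acc:^17}", end="")
    if i % 5 == 4:
        print()
print("\n---")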
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-'''
+"""
 Created on Thu Jun 8 01:08:21 2023
 
 @author: Michael
@@ -50,7 +50,7 @@ sliced in 6 time periods (to bypass twitters limitations). It will check whether
 a tweet contains any of the keywords in 'data/keywords.txt' and add an indicator
 in the datafile. It will then join all slices and create 'ALL-SENATORS.csv'
 which is the final output.
-'''
+"""
 
 import os
 import pandas as pd
@@ -62,77 +62,76 @@ import concurrent.futures
 
 ## Setup directories
 # WD Michael
-wd = '/home/michael/Documents/PS/Data/collectTweets/'
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
 # WD Server
 # wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
 
 # Tweet-datafile output directory
-td = 'data/tweets/'
+td = "data/tweets/"
 
 # Name of file that all tweets will be written to
-file_alltweets = 'ALL-SENATORS-TWEETS.csv'
+file_alltweets = "ALL-SENATORS-TWEETS.csv"
 
 path_to_tweetdfs = wd + td
 
 ## Define Timespan
 # Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
-ts_beg = '2020-01-01T00:00:00Z' # start of scraping
-ts_end = '2023-01-03T00:00:00Z' # end of straping
-no_slices = 24 # Number of slices / time periods.
+ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
+ts_end = "2023-01-03T00:00:00Z"  # end of scraping
+no_slices = 24  # Number of slices / time periods.
 
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
 
 # Name of logfile
-logfile = 'log/log_'
+logfile = "log/log_"
 
 
 ## Install snscrape from local git repo to make shure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
-'''
+"""
 import subprocess
 os.chdir('snscrape/')
 subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
 os.chdir(wd)
-'''
+"""
 
 # Columns for tweet dataframe
 tweetDFColumns = [
-    'id',
-    'user.id',
-    'user.username',
-    'user.verified',
-    'user.created',
-    'user.favouritesCount',
-    'user.followersCount',
-    'user.friendsCount',
-    'user.url',
-    'rawContent',
-    'renderedContent',
-    'cashtags',
-    'coordinates',
-    'hashtags',
-    'inReplyToTweetId',
-    'inReplyToUser',
-    'media',
-    'mentionedUsers',
-    'links',
-    'place',
-    'quotedTweet',
-    'retweetedTweet',
-    'sourceLabel',
-    'sourceUrl',
-    'url',
-    'date',
-    'replyCount',
-    'retweetCount',
-    'likeCount',
-    'quoteCount',
-    'conversationId',
-    'lang',
-    'source']
-
-##
+    "id",
+    "user.id",
+    "user.username",
+    "user.verified",
+    "user.created",
+    "user.favouritesCount",
+    "user.followersCount",
+    "user.friendsCount",
+    "user.url",
+    "rawContent",
+    "renderedContent",
+    "cashtags",
+    "coordinates",
+    "hashtags",
+    "inReplyToTweetId",
+    "inReplyToUser",
+    "media",
+    "mentionedUsers",
+    "links",
+    "place",
+    "quotedTweet",
+    "retweetedTweet",
+    "sourceLabel",
+    "sourceUrl",
+    "url",
+    "date",
+    "replyCount",
+    "retweetCount",
+    "likeCount",
+    "quoteCount",
+    "conversationId",
+    "lang",
+    "source",
+]
 
 ## Import other files
 from funs.TimeSlice import *
@@ -140,99 +139,110 @@ from funs.ClearDupes import deDupe
 from funs.Scrape import scrapeTweets
 
 # create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
-logfileErrors = logfile + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '_err' + '.txt'
-sys.stderr = open(logfileErrors, 'w')
-sys.stdout = open(logfilen, 'w')
+logfilen = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt"
+logfileErrors = logfile + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_err" + ".txt"
+sys.stderr = open(logfileErrors, "w")
+sys.stdout = open(logfilen, "w")
 
 ## Create List of time-period-slices
 time_slices = get_Tslices(ts_beg, ts_end, no_slices)
 # Print slices
-print('Time-period-slices:')
+print("Time-period-slices:")
 for slice in time_slices:
-    print(slice['suffix'] + ': ' + slice['beg_time'] + ' - ' + slice['end_time'])
-print('---')
+    print(slice["suffix"] + ": " + slice["beg_time"] + " - " + slice["end_time"])
+print("---")
 
 ## Keywords
 keywords = []
 # Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
-deDupe('data/keywords-raw.txt', 'data/keywords.txt')
+deDupe("data/keywords-raw.txt", "data/keywords.txt")
 # Read the keywords from a file
-with open('data/keywords.txt', 'r') as file:
+with open("data/keywords.txt", "r") as file:
     lines = file.readlines()
     for line in lines:
         keyword = line.strip()  # Remove the newline character
         keywords.append(keyword)
-print('---')
+print("---")
 
 ## Senator Accounts
 # Get accounts & alt-accounts from Senators-Datafile
-accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
-alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
-print('Accounts to be scraped:')
-print(accounts)
-print(alt_accounts)
-print('---')
+accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
+alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
+alt_accounts = [x for x in alt_accounts if str(x) != 'nan']  # remove empty alt_accounts fields
+accounts.extend(alt_accounts)
+# Print accounts to be scraped
+print("Accounts to be scraped:")
+for i, acc in enumerate(accounts):  # print 5 accounts per line
+    print(f"{acc:^17}", end="")  # twitter handle max length = 15 chars
+    if i % 5 == 4:
+        print("\n")
+print("\n---")
 
 ## Scraping
 timeStartScrape = datetime.now()
 print("Starting scraping at:")
-print(timeStartScrape.strftime('%Y-%m-%d_%H-%M-%S'))
-print('---')
+print(timeStartScrape.strftime("%Y-%m-%d_%H-%M-%S"))
+print("---")
 
 # Iterate over each Twitter account using multiprocessing
 with concurrent.futures.ThreadPoolExecutor() as executor:
     # List to store the scraping tasks
     tasks = []
 
     for handle in accounts:
         # Iterate over each time slice
         for slice_data in time_slices:
             # ... code to prepare the slice_data ...
 
             # Schedule the scraping task
             task = executor.submit(scrapeTweets, handle, slice_data, keywords, td)
             tasks.append(task)
 
     # Wait for all tasks to complete
     concurrent.futures.wait(tasks)
 
 timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndScrape.strftime('%Y-%m-%d_%H-%M-%S'))
+print(timeEndScrape.strftime("%Y-%m-%d_%H-%M-%S"))
 
-## Merge CSV-Files to file_alltweets
+## Merge CSV-Files to file_alltweets.
 # fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
 os.chdir(path_to_tweetdfs)
-tweetfiles = glob.glob('*.{}'.format('csv'))  # get list of all csv files in folder
+# First, check whether all slices are present.
+tweetfiles = glob.glob("*.csv")  # get list of all csv files in folder - before: "*.{}".format("csv")
+for handle in accounts:
+    for tweetfile in tweetfiles:
+        pass  # placeholder: slice-completeness check not implemented yet
 
 # check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
 if file_alltweets in tweetfiles:
     tweetfiles.remove(file_alltweets)
 
 # Go through all csv files and merge them into file_alltweets
-with open(file_alltweets,"wb") as fout:
+with open(file_alltweets, "wb") as fout:
     # first file (because of the header):
     with open(tweetfiles[0], "rb") as f:
         fout.write(f.read())
     # other files without the header:
     for file in tweetfiles[1:]:
         with open(file, "rb") as f:
             next(f)  # skip the header
             fout.write(f.read())
 os.chdir(wd)
 
 timeEndMerge = datetime.now()
 print("---")
 print("End of scraping at:")
-print(timeEndMerge.strftime('%Y-%m-%d_%H-%M-%S'))
+print(timeEndMerge.strftime("%Y-%m-%d_%H-%M-%S"))
 print("---")
 tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
 tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
 tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
-print(f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds")
+print(
+    f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
+)
 print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
 print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
 
 sys.stdout.close()
 sys.stderr.close()