# -*- coding: utf-8 -*-
"""
Created on Thu Jun 8 01:08:21 2023

@author: Michael

The following files are necessary:
config.py
    Used to configure everything that is needed for this script.
funs/TimeSlice.py
    The function get_Tslices slices the timespan defined in config.py into N
    slices. This is necessary because Twitter may otherwise block requests.
    The script will sleep for 1 second after each slice that was scraped.
funs/ClearDupes.py
    The function deDupe reads each line of inFile and removes duplicate lines.
    A file outFile is saved without the duplicate lines. Generates
    "keywords.txt".
data/keywords-raw.txt
    Contains all keywords that are used to detect whether a tweet contains
    information about Covid19.
data/senators-raw.csv
    Contains the senator dataset converted to csv. It is used to get the
    account names of all senators' Twitter accounts.

Requirements:
- snscrape 0.6.2.20230321+
- pandas 2.0+

The script will first import the needed libraries.
This script uses snscrape version 0.6.2.20230321.dev50+g0d824ab, which is
included in 'snscrape/' as a git repository for better reproducibility. Earlier
versions of snscrape will most likely fail to scrape all tweets because of
certain rate limits or other errors that may occur.
config.py will check whether snscrape is already installed. If not, it will try
to install the included version automatically.

How to use:
- To run the script, first adjust the config.py file.
- config.py checks whether snscrape is installed and, if not, tries to install
  the included version automatically.
- Run the script.
- The whole script is expected to run without error messages, except for the
  following:
  'Stopping after 20 empty pages': indicates that no more tweets were found and
  that the script skips to the next slice/account.
  'return empty in {twitter-handle}-sliceX - from XX to XX': no tweets were
  found in that specific time range for that specific Twitter account.

The script will scrape tweets for all senators in 'data/senators-raw.csv',
sliced into no_slices time periods (to work around Twitter's limitations). It
checks whether a tweet contains any of the keywords in 'data/keywords.txt' and
adds an indicator to the datafile. It then joins all slices and creates
'ALL-SENATORS-TWEETS.csv', which is the final output.
"""

import os
import pandas as pd
import glob
import time
import sys
from datetime import datetime
import concurrent.futures

## Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# Tweet-datafile output directory
td = "data/tweets/"

# Name of file that all tweets will be written to
file_alltweets = "ALL-SENATORS-TWEETS.csv"

path_to_tweetdfs = wd + td

## Define Timespan
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
ts_end = "2023-01-03T00:00:00Z"  # end of scraping
no_slices = 24  # Number of slices / time periods.

# file time format
fTimeFormat = "%Y-%m-%d_%H-%M-%S"

# Maximum number of tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000

# Name of logfile
logfile = wd + "log/log_"

## Install snscrape from the local git repo to make sure that it fits the used version.
# If snscrape is already installed, uncomment the following lines:
"""
import subprocess
os.chdir('snscrape/')
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)
"""

# Columns for tweet dataframe
tweetDFColumns = [
    "id",
    "user.id",
    "user.username",
    "user.verified",
    "user.created",
    "user.favouritesCount",
    "user.followersCount",
    "user.friendsCount",
    "user.url",
    "rawContent",
    "renderedContent",
    "cashtags",
    "coordinates",
    "hashtags",
    "inReplyToTweetId",
    "inReplyToUser",
    "media",
    "mentionedUsers",
    "links",
    "place",
    "quotedTweet",
    "retweetedTweet",
    "sourceLabel",
    "sourceUrl",
    "url",
    "date",
    "replyCount",
    "retweetCount",
    "likeCount",
    "quoteCount",
    "conversationId",
    "lang",
    "source",
]

## Import other files
from funs.TimeSlice import *
from funs.ClearDupes import deDupe
from funs.Scrape import scrapeTweets

# create logfile & log all outputs
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
sys.stderr = open(logfileErrors, "w")
sys.stdout = open(logfilen, "w")
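# From this point on, all regular output goes to the logfile and all errors go to
# the separate error logfile; both file handles are closed at the end of the script.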

## Create List of time-period-slices
time_slices = get_Tslices(ts_beg, ts_end, no_slices)
# Print slices
print("Time-period-slices:")
for tslice in time_slices:
    print(tslice["suffix"] + ": " + tslice["beg_time"] + " - " + tslice["end_time"])
print("---")
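
# For reference: get_Tslices (funs/TimeSlice.py) is expected to return a list of
# dicts with "beg_time", "end_time" and "suffix" keys, as used above. A minimal,
# purely illustrative sketch of such a slicer (assumed suffix scheme "-sliceN";
# not the implementation shipped in funs/):
"""
from datetime import datetime

def get_Tslices_sketch(ts_beg, ts_end, n_slices):
    fmt = "%Y-%m-%dT%H:%M:%SZ"
    beg = datetime.strptime(ts_beg, fmt)
    end = datetime.strptime(ts_end, fmt)
    step = (end - beg) / n_slices  # equal-length timedelta per slice
    return [
        {
            "beg_time": (beg + i * step).strftime(fmt),
            "end_time": (beg + (i + 1) * step).strftime(fmt),
            "suffix": f"-slice{i + 1}",
        }
        for i in range(n_slices)
    ]
"""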

## Keywords
keywords = []
# Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
deDupe("data/keywords-raw.txt", "data/keywords.txt")
# Read the keywords from a file
with open("data/keywords.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip()  # Remove the newline character
        keywords.append(keyword)
print("---")
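
# For reference: deDupe (funs/ClearDupes.py) reads inFile line by line and writes
# only the unique lines to outFile, as described in the docstring above. A minimal,
# purely illustrative sketch of that behaviour (assumed to preserve first-seen
# order; not the implementation shipped in funs/):
"""
def deDupe_sketch(inFile, outFile):
    seen = set()
    with open(inFile, "r") as fin, open(outFile, "w") as fout:
        for line in fin:
            if line not in seen:
                seen.add(line)
                fout.write(line)
"""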

## Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
alt_accounts = [x for x in alt_accounts if str(x) != "nan"]  # remove empty alt_accounts fields
accounts.extend(alt_accounts)

# Print accounts to be scraped
print("Accounts to be scraped:")
for i, acc in enumerate(accounts):  # print 5 accounts per line
    print(f"{acc:^17}", end="")  # twitter handle max length = 15 chars
    if i % 5 == 4:
        print("\n")
print(f"\n{len(accounts)} accounts in total.\n---")

## Scraping
timeStartScrape = datetime.now()
print("Starting scraping at:")
print(timeStartScrape.strftime(fTimeFormat))
print("---")

# Iterate over each Twitter account using multiprocessing
with concurrent.futures.ProcessPoolExecutor() as executor:
    # List to store the scraping tasks
    tasks = []
    for handle in accounts:
        # Iterate over each time slice
        for slice_data in time_slices:
            # Schedule the scraping task
            task = executor.submit(
                scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns
            )
            tasks.append(task)
    # Wait for all tasks to complete
    concurrent.futures.wait(tasks)
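
# Note: each scrapeTweets call is expected to write one CSV per handle/slice into
# the directory given by td, named "Tweets-{handle}{suffix}.csv"; the merge step
# below relies on exactly that naming scheme.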

timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime(fTimeFormat))

## Merge CSV-Files to file_alltweets.
# The fastest way is to save the slices separately and then append every file to the
# output file directly, instead of merging them with pandas or anything else.
os.chdir(path_to_tweetdfs)
# First, check whether all slices are present.
tweetfiles = glob.glob("*.csv")  # get list of all csv files in the folder
AllFilesList = []
for handle in accounts:
    for tslice in time_slices:
        suffix = tslice["suffix"]
        AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
with open(f"{logfile}missing-" + timeStartScrape.strftime(fTimeFormat) + ".txt", "w") as fout:
    for file in AllFilesList:
        if file not in tweetfiles:
            fout.write(f"Missing: {file}.\n")  # log every expected file that was not scraped

# If file_alltweets (previously scraped tweets merged into one file) already exists,
# remove it from the list so it is not included in the following merge.
if file_alltweets in tweetfiles:
    tweetfiles.remove(file_alltweets)

# Go through all csv files and merge them into file_alltweets
if tweetfiles:
    with open(file_alltweets, "wb") as fout:
        # first file (kept in full because of the header):
        with open(tweetfiles[0], "rb") as f:
            fout.write(f.read())
        # remaining files without the header:
        for file in tweetfiles[1:]:
            with open(file, "rb") as f:
                next(f)  # skip the header line
                fout.write(f.read())
os.chdir(wd)

timeEndMerge = datetime.now()
print("---")
print("End of merging at:")
print(timeEndMerge.strftime(fTimeFormat))
print("---")
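
# convertTime comes from funs/TimeSlice.py (imported via *) and is expected to split
# a timedelta into (hours, minutes, seconds), as used below. A minimal, purely
# illustrative sketch of that behaviour (not the implementation shipped in funs/):
"""
def convertTime_sketch(delta):
    total_seconds = int(delta.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return hours, minutes, seconds
"""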

tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
print(
    f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
)
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")

sys.stdout.close()
sys.stderr.close()