# -*- coding: utf-8 -*-
'''
Created on Thu Jun  8 01:08:21 2023

@author: Michael

The following files are necessary:
    config.py
        Used to configure everything that is needed for this script.
    funs/TimeSlice.py
        The function get_Tslices slices the timespan defined in config.py into
        N slices. This is necessary because Twitter may block excessive
        requests; the script sleeps for 1 second after each slice that was
        scraped.
    funs/ClearDupes.py
        The function deDupe reads each line of inFile and removes duplicate
        lines; a file outFile is saved without the duplicates. It generates
        "keywords.txt".
    data/keywords-raw.txt
        Contains all keywords that are used to detect whether a tweet contains
        information about Covid-19.
    data/senators-raw.csv
        Contains the senator dataset converted to csv. It is used to get the
        account names of all senators' Twitter accounts.

Requirements:
    - snscrape 0.6.2.20230321+
    - pandas 2.0+

The script first imports the needed libraries. It uses snscrape version
0.6.2.20230321.dev50+g0d824ab, which is included in 'snscrape/' as a git
repository for better reproducibility. Earlier versions of snscrape will most
likely fail to scrape all tweets because of rate limits or other errors.
config.py checks whether snscrape is already installed; if not, it tries to
install the included version automatically.

How to use:
    - Adjust config.py before running the script.
    - Run the script.
    - The whole script is expected to run without error messages, except for
      the following:
        'Stopping after 20 empty pages': indicates that no more tweets were
        found and that the script skips to the next slice/account.
        'return empty in {twitter-handle}-sliceX - from XX to XX': no tweets
        were found in that specific time range for that specific account.

The script scrapes tweets for all senators in 'data/senators-raw.csv', sliced
into 6 time periods (to bypass Twitter's limitations). It checks whether a
tweet contains any of the keywords in 'data/keywords.txt' and adds an
indicator column to the datafile. It then joins all slices and creates
'ALL-SENATORS.csv', which is the final output.
'''
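## Note (added for orientation): the wildcard import below pulls the run
## configuration from config.py. These are the names this script relies on;
## the values shown are illustrative placeholders, not the project's actual
## settings:
##
##     ts_beg           = '2020-01-01'        # start of the overall timespan
##     ts_end           = '2023-01-03'        # end of the overall timespan
##     no_slices        = 6                   # number of time slices
##     maxTweets        = 5000                # per-slice tweet limit
##     path_to_tweetdfs = 'data/tweets/'      # folder holding the per-slice csv files
##     file_alltweets   = 'ALL-SENATORS.csv'  # name of the merged output file
##     wd               = '.'                 # working directory to return to after merging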
import os
import pandas as pd
import glob
import time

## Import other files
from config import *
import snscrape.modules.twitter as sntwitter
from funs.TimeSlice import get_Tslices
from funs.ClearDupes import deDupe

## Create list of time-period slices
time_slices = get_Tslices(ts_beg, ts_end, no_slices)
# Print slices
print('Time-period-slices:')
for slice in time_slices:
    print(slice['suffix'] + ': ' + slice['beg_time'] + ' - ' + slice['end_time'])

## Keywords
keywords = []
# Remove duplicate keywords and save all non-duplicates to 'data/keywords.txt'
deDupe('data/keywords-raw.txt', 'data/keywords.txt')
# Read the keywords from the deduplicated file
with open('data/keywords.txt', 'r') as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip()  # remove the newline character
        keywords.append(keyword)

## Senator Accounts
# Get accounts & alt-accounts from the senators datafile
accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
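## Added sanity check (illustrative, not part of the original script): the
## scraping loop below assumes every slice returned by get_Tslices provides
## the keys 'beg_time', 'end_time' and 'suffix'. Failing early here makes a
## mismatch easier to spot; the block can be removed without affecting the run.
for slice_data in time_slices:
    missing = {'beg_time', 'end_time', 'suffix'} - set(slice_data)
    if missing:
        raise KeyError(f'get_Tslices returned a slice without expected keys: {missing}')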
## Scraping
# Iterate over each Twitter account
for handle in accounts:
    # Iterate over each time slice
    for slice_data in time_slices:
        # Define slice variables from time_slices
        ts_beg = slice_data['beg_time']
        ts_end = slice_data['end_time']
        suffix = slice_data['suffix']

        # Create an empty tweetlist that will be filled with the tweets of the current senator
        tweetlist = []

        # Status message
        msg = f'trying to fetch tweets for {handle}{suffix}'
        print(msg)

        # snscrape query
        query = f'from:{handle} since:{ts_beg} until:{ts_end}'
        for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
            if i > maxTweets:
                break
            tweetlist.append([
                tweet.id,
                tweet.user.id,
                tweet.user.username,
                tweet.user.verified,
                tweet.user.created,
                tweet.user.favouritesCount,
                tweet.user.followersCount,
                tweet.user.friendsCount,
                tweet.user.url,
                tweet.rawContent,
                tweet.renderedContent,
                tweet.cashtags,
                tweet.coordinates,
                tweet.hashtags,
                tweet.inReplyToTweetId,
                tweet.inReplyToUser,
                tweet.media,
                tweet.mentionedUsers,
                tweet.links,
                tweet.place,
                tweet.quotedTweet,
                tweet.retweetedTweet,
                tweet.sourceLabel,
                tweet.sourceUrl,
                tweet.url,
                tweet.date,
                tweet.replyCount,
                tweet.retweetCount,
                tweet.likeCount,
                tweet.quoteCount,
                tweet.conversationId,
                tweet.lang,
                tweet.source
            ])

        # If no tweets were fetched for the current time slice, skip to the next slice
        if len(tweetlist) == 0:
            msg = f'return empty in {handle}{suffix} - from {ts_beg} to {ts_end}'
            print(msg)
            continue

        # Convert to dataframe
        tweet_df = pd.DataFrame(tweetlist, columns=[
            'id',
            'user.id',
            'user.username',
            'user.verified',
            'user.created',
            'user.favouritesCount',
            'user.followersCount',
            'user.friendsCount',
            'user.url',
            'rawContent',
            'renderedContent',
            'cashtags',
            'coordinates',
            'hashtags',
            'inReplyToTweetId',
            'inReplyToUser',
            'media',
            'mentionedUsers',
            'links',
            'place',
            'quotedTweet',
            'retweetedTweet',
            'sourceLabel',
            'sourceUrl',
            'url',
            'date',
            'replyCount',
            'retweetCount',
            'likeCount',
            'quoteCount',
            'conversationId',
            'lang',
            'source'])

        ## Check whether the tweet text contains any keyword
        tweet_df['contains_keyword'] = ''
        tweet_df['contains_keyword'] = (tweet_df['rawContent'].str.findall('|'.join(keywords))
                                        .str.join(',')
                                        .replace('', 'none'))

        ## Save the dataset for the current account and time slice
        # Define filepath
        csv_path = f'data/tweets/T{handle}{suffix}.csv'
        # Save csv
        tweet_df.to_csv(csv_path)

        # Sleep 1 second to avoid getting blocked because of excessive requests
        time.sleep(1)

## Merge csv files into file_alltweets
# The fastest way is to save the slices separately and then append every file to the
# output instead of using pandas or anything else.
os.chdir(path_to_tweetdfs)
# Get list of all csv files in the folder
tweetfiles = glob.glob('*.csv')
# If file_alltweets (previously merged tweets) already exists, remove it from the list
# so it is not included in the merge
if file_alltweets in tweetfiles:
    tweetfiles.remove(file_alltweets)
# Go through all csv files and merge them into file_alltweets
with open(file_alltweets, 'wb') as fout:
    # First file (keep its header):
    with open(tweetfiles[0], 'rb') as f:
        fout.write(f.read())
    # Remaining files without their header:
    for file in tweetfiles[1:]:
        with open(file, 'rb') as f:
            next(f)  # skip the header line
            fout.write(f.read())
os.chdir(wd)
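## For reference only (added sketch, not part of the original workflow): the same
## merge could be expressed with pandas roughly as below, run from inside
## path_to_tweetdfs. The raw byte-level concatenation above is used instead
## because it is faster for many small csv files.
##
##     merged = pd.concat((pd.read_csv(f) for f in tweetfiles), ignore_index=True)
##     merged.to_csv(file_alltweets, index=False)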