diff --git a/.gitignore b/.gitignore
index e9f93d1..93f6285 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+**/log*
+**/*-slice*.csv
 /ALL-SENATORS-LONG.csv
 /ALL-SENATORS.csv
 /collect2.py
diff --git a/collect.py b/collect.py
index 79f2d6a..9eb711c 100644
--- a/collect.py
+++ b/collect.py
@@ -59,8 +59,79 @@ import time
 import sys
 from datetime import datetime
 
+## Setup directories
+# WD Michael
+wd = '/home/michael/Documents/PS/Data/collectTweets/'
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# Tweet-datafile output directory
+td = 'data/tweets/'
+
+# Name of file that all tweets will be written to
+file_alltweets = 'ALL-SENATORS-TWEETS.csv'
+
+path_to_tweetdfs = wd + td
+
+## Define Timespan
+# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
+ts_beg = '2020-01-01T00:00:00Z' # start of scraping
+ts_end = '2023-01-03T00:00:00Z' # end of scraping
+no_slices = 24 # Number of slices / time periods.
+
+# Maximum tweets to be scraped by snscrape per query. Can be left untouched.
+maxTweets = 5000
+
+# Name of logfile
+logfile = 'log/log_'
+
+
+## Install snscrape from the local git repo to make sure that it matches the version used here.
+# If snscrape is not yet installed, uncomment the following lines:
+'''
+import subprocess
+os.chdir('snscrape/')
+subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
+os.chdir(wd)
+'''
+
+# Columns for tweet dataframe
+tweetDFColumns = [
+    'id',
+    'user.id',
+    'user.username',
+    'user.verified',
+    'user.created',
+    'user.favouritesCount',
+    'user.followersCount',
+    'user.friendsCount',
+    'user.url',
+    'rawContent',
+    'renderedContent',
+    'cashtags',
+    'coordinates',
+    'hashtags',
+    'inReplyToTweetId',
+    'inReplyToUser',
+    'media',
+    'mentionedUsers',
+    'links',
+    'place',
+    'quotedTweet',
+    'retweetedTweet',
+    'sourceLabel',
+    'sourceUrl',
+    'url',
+    'date',
+    'replyCount',
+    'retweetCount',
+    'likeCount',
+    'quoteCount',
+    'conversationId',
+    'lang',
+    'source']
+
 ## Import other files
-from config import *
 import snscrape.modules.twitter as sntwitter
 from funs.TimeSlice import *
 from funs.ClearDupes import deDupe
@@ -115,110 +186,49 @@ for handle in accounts:
         ts_beg = slice_data['beg_time']
         ts_end = slice_data['end_time']
         suffix = slice_data['suffix']
+        tweetFileName = f"Tweets-{handle}{suffix}.csv"
 
         # create empty tweetlist that will be filled with tweets of current sen
-        tweetlist = []
+        TweetList = []
 
         # statusmsg
-        msg = f'trying to fetch tweets for {handle}{suffix}'
-        print(msg)
+        print(f'Fetching: {handle:>15}{suffix:<7} - from {ts_beg} to {ts_end}')
 
         # Snscrape query:
         query = f'from:{handle} since:{ts_beg} until:{ts_end}'
         for i,tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
+            singleTweetList = []
            if i>maxTweets:
                 break
-            tweetlist.append([
-                tweet.id,
-                tweet.user.id,
-                tweet.user.username,
-                tweet.user.verified,
-                tweet.user.created,
-                tweet.user.favouritesCount,
-                tweet.user.followersCount,
-                tweet.user.friendsCount,
-                tweet.user.url,
-                tweet.rawContent,
-                tweet.renderedContent,
-                tweet.cashtags,
-                tweet.coordinates,
-                tweet.hashtags,
-                tweet.inReplyToTweetId,
-                tweet.inReplyToUser,
-                tweet.media,
-                tweet.mentionedUsers,
-                tweet.links,
-                tweet.place,
-                tweet.quotedTweet,
-                tweet.retweetedTweet,
-                tweet.sourceLabel,
-                tweet.sourceUrl,
-                tweet.url,
-                tweet.date,
-                tweet.replyCount,
-                tweet.retweetCount,
-                tweet.likeCount,
-                tweet.quoteCount,
-                tweet.conversationId,
-                tweet.lang,
-                tweet.source
-                ])
+            # get tweet variables from tweetDFColumns and append them to singleTweetList,
+            # which will then be appended to TweetList. TweetList contains all tweets of the current slice.
+            for col in tweetDFColumns:
+                singleTweetList.append(eval(f'tweet.{col}'))
+            TweetList.append(singleTweetList)
         # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration
-        if len(tweetlist) == 0:
+        if len(TweetList) == 0:
             msg = f'return empty in {handle}{suffix} - from {ts_beg} to {ts_end}'
+            open(td + tweetFileName, 'a').close()  # create an empty file for this slice
             print(msg)
             continue
 
+        print(f'{i:<6} tweets scraped for: {handle:>15}{suffix:<7}')
+
         # convert to dataframe
-        tweet_df = pd.DataFrame(tweetlist, columns=[
-            'id',
-            'user.id',
-            'user.username',
-            'user.verified',
-            'user.created',
-            'user.favouritesCount',
-            'user.followersCount',
-            'user.friendsCount',
-            'user.url',
-            'rawContent',
-            'renderedContent',
-            'cashtags',
-            'coordinates',
-            'hashtags',
-            'inReplyToTweetId',
-            'inReplyToUser',
-            'media',
-            'mentionedUsers',
-            'links',
-            'place',
-            'quotedTweet',
-            'retweetedTweet',
-            'sourceLabel',
-            'sourceUrl',
-            'url',
-            'date',
-            'replyCount',
-            'retweetCount',
-            'likeCount',
-            'quoteCount',
-            'conversationId',
-            'lang',
-            'source'])
+        tweet_df = pd.DataFrame(TweetList, columns=tweetDFColumns)
         ## Check if tweet-text contains keyword
         tweet_df['contains_keyword'] = ''
-        tweet_df['contains_keyword'] = (tweet_df['rawContent'].str.findall('|'.join(keywords))
-                                        .str.join(',')
-                                        .replace('', 'none'))
+        tweet_df['contains_keyword'] = (tweet_df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', 'none'))
 
         ## Save two versions of the dataset, one with all fields and one without dict fields
         # define filepaths
-        csv_path = f'data/tweets/T{handle}{suffix}.csv'
+        csv_path = td + tweetFileName
         # save short csv
         tweet_df.to_csv(csv_path)
         # sleep 1 second to not get blocked because of excessive requests
-        time.sleep(1)
+        time.sleep(0.5)
 
-timeEndScrape = datetime.now()
+timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
 print(timeEndScrape.strftime('%Y-%m-%d_%H-%M-%S'))
diff --git a/config.py b/config.py
deleted file mode 100644
index 4adbb90..0000000
--- a/config.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-'''
-Created on Wed Jun 21 13:58:42 2023
-
-@author: michael
-'''
-
-## Setup directories
-# WD Michael
-wd = '/home/michael/Documents/PS/Data/collectTweets/'
-# WD Server
-# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
-
-# Tweet-datafile output directory
-td = 'data/tweets/'
-
-# Name of file that all tweets will be written to
-file_alltweets = 'ALL-SENATORS-TWEETS.csv'
-
-path_to_tweetdfs = wd + td
-
-## Define Timespan
-# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
-ts_beg = '2020-01-01T00:00:00Z' # start of scraping
-ts_end = '2023-01-03T00:00:00Z' # end of straping
-no_slices = 24 # Number of slices / time periods.
-
-# Maximum tweets to be scraped by snscrape. Can be left untouched.
-maxTweets = 5000
-
-# Name of logfile
-logfile = 'log/log_'
-
-
-## Install snscrape from local git repo to make shure that it fits the used version.
-# If snscrape is already installed, uncomment the following lines:
-'''
-import subprocess
-os.chdir('snscrape/')
-subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
-os.chdir(wd)
-'''
-
-
diff --git a/funs/TimeSlice.py b/funs/TimeSlice.py
index ead4dda..14a1ea8 100644
--- a/funs/TimeSlice.py
+++ b/funs/TimeSlice.py
@@ -21,4 +21,12 @@ def get_Tslices(ts_beg, ts_end, no_slices):
             'end_time': (ts_beg + ts_dif * i + ts_dif - timedelta(microseconds=1)).strftime('%Y-%m-%dT%H:%M:%SZ'),
             'suffix': f'-slice{i+1}'
         })
-    return time_slices
\ No newline at end of file
+    return time_slices
+
+# For log time conversions (converts a timedelta into hours, minutes and seconds)
+def convertTime(duration):
+    days, seconds = duration.days, duration.seconds
+    hours = days * 24 + seconds // 3600
+    minutes = (seconds % 3600) // 60
+    seconds = (seconds % 60)
+    return hours, minutes, seconds
\ No newline at end of file
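
Note on the new extraction loop in collect.py: eval(f'tweet.{col}') works because every entry in tweetDFColumns (e.g. 'user.username') is a valid attribute path on the snscrape tweet object, but it evaluates arbitrary strings. A minimal eval-free sketch of the same lookup using operator.attrgetter, which resolves dotted paths natively; this is an illustration only, not part of the commit:

    from operator import attrgetter

    # same result as the eval() loop: one value per entry in tweetDFColumns
    singleTweetList = [attrgetter(col)(tweet) for col in tweetDFColumns]
    TweetList.append(singleTweetList)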
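
On the contains_keyword column: str.findall('|'.join(keywords)) collects every keyword that occurs in the tweet text, str.join(',') flattens the list of matches into a single string, and replace('', 'none') marks tweets without any match. A small self-contained illustration with made-up keywords; the real keywords list is built elsewhere in collect.py and is not part of this diff:

    import pandas as pd

    keywords = ['infrastructure', 'senate']  # hypothetical example keywords
    df = pd.DataFrame({'rawContent': ['Vote on the infrastructure bill in the senate', 'Happy holidays!']})
    df['contains_keyword'] = (df['rawContent'].str.findall('|'.join(keywords))
                              .str.join(',')
                              .replace('', 'none'))
    print(df['contains_keyword'].tolist())  # ['infrastructure,senate', 'none']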
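
The new convertTime helper in funs/TimeSlice.py turns a datetime.timedelta into total hours, minutes and seconds for the log output. A usage sketch; timeStartScrape is assumed to be recorded near the top of collect.py and does not appear in this diff:

    from datetime import datetime
    from funs.TimeSlice import convertTime

    timeStartScrape = datetime.now()  # assumed: taken before scraping starts
    # ... scraping loop runs here ...
    timeEndScrape = datetime.now()
    hours, minutes, seconds = convertTime(timeEndScrape - timeStartScrape)
    print(f'Scraping took {hours} hours, {minutes} minutes and {seconds} seconds.')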