Michael Beck 2023-06-23 15:57:31 +02:00
parent 599202ae4d
commit 88c016a2a6
4 changed files with 101 additions and 126 deletions

2
.gitignore vendored
View File

@@ -1,3 +1,5 @@
**/log*
**/*-slice*.csv
/ALL-SENATORS-LONG.csv
/ALL-SENATORS.csv
/collect2.py

View File

@@ -59,8 +59,79 @@ import time
import sys
from datetime import datetime
## Setup directories
# WD Michael
wd = '/home/michael/Documents/PS/Data/collectTweets/'
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# Tweet-datafile output directory
td = 'data/tweets/'
# Name of file that all tweets will be written to
file_alltweets = 'ALL-SENATORS-TWEETS.csv'
path_to_tweetdfs = wd + td
## Define Timespan
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = '2020-01-01T00:00:00Z' # start of scraping
ts_end = '2023-01-03T00:00:00Z' # end of scraping
no_slices = 24 # Number of slices / time periods.
# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000
# Name of logfile
logfile = 'log/log_'
## Install snscrape from local git repo to make sure that it matches the version used.
# If snscrape is already installed, uncomment the following lines:
'''
import subprocess
os.chdir('snscrape/')
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)
'''
# Columns for tweet dataframe
tweetDFColumns = [
'id',
'user.id',
'user.username',
'user.verified',
'user.created',
'user.favouritesCount',
'user.followersCount',
'user.friendsCount',
'user.url',
'rawContent',
'renderedContent',
'cashtags',
'coordinates',
'hashtags',
'inReplyToTweetId',
'inReplyToUser',
'media',
'mentionedUsers',
'links',
'place',
'quotedTweet',
'retweetedTweet',
'sourceLabel',
'sourceUrl',
'url',
'date',
'replyCount',
'retweetCount',
'likeCount',
'quoteCount',
'conversationId',
'lang',
'source']
## Import other files
from config import *
import snscrape.modules.twitter as sntwitter
from funs.TimeSlice import *
from funs.ClearDupes import deDupe
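The timespan settings above feed the scraping loop in the next hunk through get_Tslices (defined in funs/TimeSlice.py, shown further down). A minimal sketch of that hand-off, assuming get_Tslices accepts the ISO-8601 strings configured here:

time_slices = get_Tslices(ts_beg, ts_end, no_slices)
# returns one dict per slice, e.g.
# {'beg_time': '...', 'end_time': '...', 'suffix': '-slice1'},
# which the per-senator loop below unpacks into ts_beg, ts_end and suffix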
@@ -115,110 +186,49 @@ for handle in accounts:
ts_beg = slice_data['beg_time']
ts_end = slice_data['end_time']
suffix = slice_data['suffix']
tweetFileName = f"Tweets-{handle}{suffix}.csv"
# create empty tweetlist that will be filled with tweets of current sen
TweetList = []
# statusmsg
print(f'Fetching: {handle:>15}{suffix:<7} - from {ts_beg} to {ts_end}')
# Snscrape query:
query = f'from:{handle} since:{ts_beg} until:{ts_end}'
for i,tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
singleTweetList = []
if i>maxTweets:
break
# get tweet vars from tweetDFColumns and append to singleTweetList
# which will then be appended to TweetList. TweetList contains all tweets of the current slice.
for col in tweetDFColumns:
singleTweetList.append(eval(f'tweet.{col}'))
TweetList.append(singleTweetList)
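# Alternative sketch (not from this script): the dotted column names could be resolved
# without eval, e.g.
#   from functools import reduce
#   singleTweetList = [reduce(getattr, col.split('.'), tweet) for col in tweetDFColumns]
# reduce(getattr, ...) walks tweet.user.username etc. one attribute at a time.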
# Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration
if len(TweetList) == 0:
msg = f'return empty in {handle}{suffix} - from {ts_beg} to {ts_end}'
open(td + tweetFileName, 'a').close()
print(msg)
continue
print(f'{i:<6} tweets scraped for: {handle:>15}{suffix:<7}')
# convert to dataframe
tweet_df = pd.DataFrame(TweetList, columns=tweetDFColumns)
## Check if tweet-text contains keyword
tweet_df['contains_keyword'] = ''
tweet_df['contains_keyword'] = (tweet_df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', 'none'))
## Save two versions of the dataset, one with all fields and one without dict fields
# define filepaths
csv_path = td + tweetFileName
# save short csv
tweet_df.to_csv(csv_path)
# sleep 0.5 seconds to avoid being blocked because of excessive requests
time.sleep(0.5)
timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime('%Y-%m-%d_%H-%M-%S'))
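The contains_keyword expression above collapses all regex hits per tweet into a single comma-separated string, with 'none' for tweets that match nothing. A toy pandas illustration (the keyword list here is made up; the real one comes from config.py):

import pandas as pd
keywords = ['infrastructure', 'healthcare']  # placeholder keywords
tweets = pd.Series(['Vote on the infrastructure bill today', 'Happy birthday, Iowa!'])
print(tweets.str.findall('|'.join(keywords)).str.join(',').replace('', 'none'))
# 0    infrastructure
# 1    none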

View File

@ -1,45 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Wed Jun 21 13:58:42 2023
@author: michael
'''
## Setup directories
# WD Michael
wd = '/home/michael/Documents/PS/Data/collectTweets/'
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# Tweet-datafile output directory
td = 'data/tweets/'
# Name of file that all tweets will be written to
file_alltweets = 'ALL-SENATORS-TWEETS.csv'
path_to_tweetdfs = wd + td
## Define Timespan
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = '2020-01-01T00:00:00Z' # start of scraping
ts_end = '2023-01-03T00:00:00Z' # end of scraping
no_slices = 24 # Number of slices / time periods.
# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000
# Name of logfile
logfile = 'log/log_'
## Install snscrape from local git repo to make sure that it matches the version used.
# If snscrape is already installed, uncomment the following lines:
'''
import subprocess
os.chdir('snscrape/')
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)
'''

View File

@@ -21,4 +21,12 @@ def get_Tslices(ts_beg, ts_end, no_slices):
'end_time': (ts_beg + ts_dif * i + ts_dif - timedelta(microseconds=1)).strftime('%Y-%m-%dT%H:%M:%SZ'),
'suffix': f'-slice{i+1}'
})
return time_slices
# For log time conversions (timedelta duration to hours, minutes, seconds)
def convertTime(duration):
days, seconds = duration.days, duration.seconds
hours = days * 24 + seconds // 3600
minutes = (seconds % 3600) // 60
seconds = (seconds % 60)
return hours, minutes, seconds
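A short usage sketch for convertTime, assuming it receives the difference of two datetime objects (for instance timeEndScrape minus a start timestamp taken before the scraping loop):

from datetime import timedelta
duration = timedelta(days=1, hours=2, minutes=3, seconds=4)
hours, minutes, seconds = convertTime(duration)
# hours=26, minutes=3, seconds=4 (the full day is folded into hours), ready for the logfile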