adds
.gitignore (2 changes, vendored)
							| @@ -1,3 +1,5 @@ | ||||
| **/log* | ||||
| **/*-slice*.csv | ||||
| /ALL-SENATORS-LONG.csv | ||||
| /ALL-SENATORS.csv | ||||
| /collect2.py | ||||
|   | ||||
							
								
								
									
collect.py (170 changes)
							| @@ -59,8 +59,79 @@ import time | ||||
| import sys | ||||
| from datetime import datetime | ||||
|  | ||||
| ## Setup directories | ||||
| # WD Michael | ||||
| wd = '/home/michael/Documents/PS/Data/collectTweets/' | ||||
| # WD Server | ||||
| # wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/' | ||||
|  | ||||
| # Tweet-datafile output directory | ||||
| td = 'data/tweets/' | ||||
|  | ||||
| # Name of file that all tweets will be written to | ||||
| file_alltweets = 'ALL-SENATORS-TWEETS.csv' | ||||
|  | ||||
| path_to_tweetdfs = wd + td | ||||
|  | ||||
| ## Define Timespan  | ||||
| # Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes) | ||||
| ts_beg = '2020-01-01T00:00:00Z' # start of scraping | ||||
| ts_end = '2023-01-03T00:00:00Z' # end of scraping | ||||
| no_slices = 24 # Number of slices / time periods. | ||||
|  | ||||
| # Maximum tweets to be scraped by snscrape. Can be left untouched. | ||||
| maxTweets = 5000 | ||||
|  | ||||
| # Name of logfile | ||||
| logfile = 'log/log_' | ||||
|  | ||||
|  | ||||
| ## Install snscrape from local git repo to make sure that it fits the used version. | ||||
| # If snscrape is already installed, uncomment the following lines: | ||||
| '''  | ||||
| import subprocess | ||||
| os.chdir('snscrape/') | ||||
| subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.']) | ||||
| os.chdir(wd)  | ||||
| ''' | ||||
|  | ||||
| # Columns for tweet dataframe | ||||
| tweetDFColumns = [ | ||||
|             'id',  | ||||
|             'user.id',  | ||||
|             'user.username', | ||||
|             'user.verified', | ||||
|             'user.created', | ||||
|             'user.favouritesCount', | ||||
|             'user.followersCount', | ||||
|             'user.friendsCount', | ||||
|             'user.url', | ||||
|             'rawContent',  | ||||
|             'renderedContent',  | ||||
|             'cashtags',  | ||||
|             'coordinates',  | ||||
|             'hashtags',  | ||||
|             'inReplyToTweetId',  | ||||
|             'inReplyToUser',  | ||||
|             'media',  | ||||
|             'mentionedUsers',  | ||||
|             'links',  | ||||
|             'place',  | ||||
|             'quotedTweet',  | ||||
|             'retweetedTweet',  | ||||
|             'sourceLabel',  | ||||
|             'sourceUrl',  | ||||
|             'url',  | ||||
|             'date',  | ||||
|             'replyCount',  | ||||
|             'retweetCount',  | ||||
|             'likeCount',  | ||||
|             'quoteCount',  | ||||
|             'conversationId',  | ||||
|             'lang',  | ||||
|             'source'] | ||||
|  | ||||
| ## Import other files | ||||
| from config import * | ||||
| import snscrape.modules.twitter as sntwitter | ||||
| from funs.TimeSlice import * | ||||
| from funs.ClearDupes import deDupe | ||||
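
The tweetDFColumns list defined above is later walked for every tweet to pull the matching attribute off the tweet object (second hunk below, which does this via eval(f'tweet.{col}')). A minimal sketch, assuming only a generic object with nested attributes, of how such dotted names could be resolved without eval; resolve_column is a hypothetical helper, not part of the commit:

    # Sketch only: resolve a dotted column name like 'user.username'
    # by walking nested attributes instead of calling eval().
    from functools import reduce

    def resolve_column(obj, dotted_name):
        # 'user.username' -> getattr(getattr(obj, 'user'), 'username')
        return reduce(getattr, dotted_name.split('.'), obj)

    # e.g. row = [resolve_column(tweet, col) for col in tweetDFColumns]
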
| @@ -115,110 +186,49 @@ for handle in accounts: | ||||
|         ts_beg = slice_data['beg_time'] | ||||
|         ts_end = slice_data['end_time'] | ||||
|         suffix = slice_data['suffix'] | ||||
|         tweetFileName = f"Tweets-{handle}{suffix}.csv" | ||||
|          | ||||
|         # create empty tweetlist that will be filled with tweets of current sen | ||||
|         tweetlist = [] | ||||
|         TweetList = [] | ||||
|          | ||||
|         # statusmsg | ||||
|         msg = f'trying to fetch tweets for {handle}{suffix}' | ||||
|         print(msg) | ||||
|         print(f'Fetching: {handle:>15}{suffix:<7} - from {ts_beg} to {ts_end}') | ||||
|          | ||||
|         # Snscrape query: | ||||
|         query = f'from:{handle} since:{ts_beg} until:{ts_end}' | ||||
|         for i,tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()): | ||||
|             singleTweetList = [] | ||||
|             if i>maxTweets: | ||||
|                 break | ||||
|             tweetlist.append([ | ||||
|                 tweet.id, | ||||
|                 tweet.user.id, | ||||
|                 tweet.user.username, | ||||
|                 tweet.user.verified, | ||||
|                 tweet.user.created, | ||||
|                 tweet.user.favouritesCount, | ||||
|                 tweet.user.followersCount, | ||||
|                 tweet.user.friendsCount, | ||||
|                 tweet.user.url, | ||||
|                 tweet.rawContent, | ||||
|                 tweet.renderedContent, | ||||
|                 tweet.cashtags, | ||||
|                 tweet.coordinates, | ||||
|                 tweet.hashtags, | ||||
|                 tweet.inReplyToTweetId, | ||||
|                 tweet.inReplyToUser, | ||||
|                 tweet.media, | ||||
|                 tweet.mentionedUsers, | ||||
|                 tweet.links, | ||||
|                 tweet.place, | ||||
|                 tweet.quotedTweet, | ||||
|                 tweet.retweetedTweet, | ||||
|                 tweet.sourceLabel, | ||||
|                 tweet.sourceUrl, | ||||
|                 tweet.url, | ||||
|                 tweet.date, | ||||
|                 tweet.replyCount, | ||||
|                 tweet.retweetCount, | ||||
|                 tweet.likeCount, | ||||
|                 tweet.quoteCount, | ||||
|                 tweet.conversationId, | ||||
|                 tweet.lang, | ||||
|                 tweet.source | ||||
|             ]) | ||||
|             # get tweet vars from tweetDFColumns and append to singleTweetList | ||||
|             # which will then be appended to TweetList. TweetList contains all tweets of the current slice. | ||||
|             for col in tweetDFColumns: | ||||
|                 singleTweetList.append(eval(f'tweet.{col}'))  | ||||
|             TweetList.append(singleTweetList) | ||||
|         # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration | ||||
|         if len(tweetlist) == 0: | ||||
|         if len(TweetList) == 0: | ||||
|             msg = f'return empty in {handle}{suffix} - from {ts_beg} to {ts_end}' | ||||
|             open(file, 'a').close() | ||||
|             print(msg) | ||||
|             continue | ||||
|          | ||||
|         print(f'{i:<6} tweets scraped for: {handle:>15}{suffix:<7}') | ||||
|          | ||||
|         # convert to dataframe | ||||
|         tweet_df = pd.DataFrame(tweetlist, columns=[ | ||||
|             'id',  | ||||
|             'user.id',  | ||||
|             'user.username', | ||||
|             'user.verified', | ||||
|             'user.created', | ||||
|             'user.favouritesCount', | ||||
|             'user.followersCount', | ||||
|             'user.friendsCount', | ||||
|             'user.url', | ||||
|             'rawContent',  | ||||
|             'renderedContent',  | ||||
|             'cashtags',  | ||||
|             'coordinates',  | ||||
|             'hashtags',  | ||||
|             'inReplyToTweetId',  | ||||
|             'inReplyToUser',  | ||||
|             'media',  | ||||
|             'mentionedUsers',  | ||||
|             'links',  | ||||
|             'place',  | ||||
|             'quotedTweet',  | ||||
|             'retweetedTweet',  | ||||
|             'sourceLabel',  | ||||
|             'sourceUrl',  | ||||
|             'url',  | ||||
|             'date',  | ||||
|             'replyCount',  | ||||
|             'retweetCount',  | ||||
|             'likeCount',  | ||||
|             'quoteCount',  | ||||
|             'conversationId',  | ||||
|             'lang',  | ||||
|             'source']) | ||||
|         tweet_df = pd.DataFrame(TweetList, columns=tweetDFColumns) | ||||
|          | ||||
|         ## Check if tweet-text contains keyword | ||||
|         tweet_df['contains_keyword'] = '' | ||||
|         tweet_df['contains_keyword'] = (tweet_df['rawContent'].str.findall('|'.join(keywords)) | ||||
|                                               .str.join(',') | ||||
|                                               .replace('', 'none')) | ||||
|         tweet_df['contains_keyword'] = (tweet_df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', 'none')) | ||||
|         ## Save two versions of the dataset, one with all fields and one without dict fields | ||||
|         # define filepaths | ||||
|         csv_path = f'data/tweets/T{handle}{suffix}.csv' | ||||
|         csv_path = td + tweetFileName | ||||
|         # save short csv | ||||
|         tweet_df.to_csv(csv_path) | ||||
|         # sleep 1 second to not get blocked because of excessive requests | ||||
|         time.sleep(1) | ||||
|         time.sleep(0.5) | ||||
|  | ||||
| timeEndScrape = datetime.now() | ||||
| print("---") | ||||
| print("End of scraping at:") | ||||
| print(timeEndScrape.strftime('%Y-%m-%d_%H-%M-%S')) | ||||
|   | ||||
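
The contains_keyword step in the hunk above chains three pandas string operations: str.findall() with the keywords joined by '|' collects every match per tweet, str.join(',') flattens each match list into one string, and replace('', 'none') marks tweets without any hit. A small sketch with made-up data; the keywords list is defined elsewhere in the script (not shown in this diff) and the values here are purely illustrative:

    import pandas as pd

    keywords = ['climate', 'senate']  # hypothetical values
    df = pd.DataFrame({'rawContent': ['climate bill reaches the senate',
                                      'hello world']})

    df['contains_keyword'] = (df['rawContent']
                              .str.findall('|'.join(keywords))
                              .str.join(',')
                              .replace('', 'none'))

    print(df['contains_keyword'].tolist())  # ['climate,senate', 'none']
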
							
								
								
									
config.py (45 changes)
							| @@ -1,45 +0,0 @@ | ||||
| #!/usr/bin/env python3 | ||||
| # -*- coding: utf-8 -*- | ||||
| ''' | ||||
| Created on Wed Jun 21 13:58:42 2023 | ||||
|  | ||||
| @author: michael | ||||
| ''' | ||||
|  | ||||
| ## Setup directories | ||||
| # WD Michael | ||||
| wd = '/home/michael/Documents/PS/Data/collectTweets/' | ||||
| # WD Server | ||||
| # wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/' | ||||
|  | ||||
| # Tweet-datafile output directory | ||||
| td = 'data/tweets/' | ||||
|  | ||||
| # Name of file that all tweets will be written to | ||||
| file_alltweets = 'ALL-SENATORS-TWEETS.csv' | ||||
|  | ||||
| path_to_tweetdfs = wd + td | ||||
|  | ||||
| ## Define Timespan  | ||||
| # Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes) | ||||
| ts_beg = '2020-01-01T00:00:00Z' # start of scraping | ||||
| ts_end = '2023-01-03T00:00:00Z' # end of scraping | ||||
| no_slices = 24 # Number of slices / time periods. | ||||
|  | ||||
| # Maximum tweets to be scraped by snscrape. Can be left untouched. | ||||
| maxTweets = 5000 | ||||
|  | ||||
| # Name of logfile | ||||
| logfile = 'log/log_' | ||||
|  | ||||
|  | ||||
| ## Install snscrape from local git repo to make sure that it fits the used version. | ||||
| # If snscrape is already installed, uncomment the following lines: | ||||
| '''  | ||||
| import subprocess | ||||
| os.chdir('snscrape/') | ||||
| subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.']) | ||||
| os.chdir(wd)  | ||||
| ''' | ||||
|  | ||||
|  | ||||
funs/TimeSlice.py
| @@ -21,4 +21,12 @@ def get_Tslices(ts_beg, ts_end, no_slices): | ||||
|                 'end_time': (ts_beg + ts_dif * i + ts_dif - timedelta(microseconds=1)).strftime('%Y-%m-%dT%H:%M:%SZ'), | ||||
|                 'suffix': f'-slice{i+1}' | ||||
|             }) | ||||
|     return time_slices | ||||
|     return time_slices | ||||
|  | ||||
| # For log time conversions (seconds to days, hours, minutes) | ||||
| def convertTime(duration): | ||||
|     days, seconds = duration.days, duration.seconds | ||||
|     hours = days * 24 + seconds // 3600 | ||||
|     minutes = (seconds % 3600) // 60 | ||||
|     seconds = (seconds % 60) | ||||
|     return hours, minutes, seconds | ||||
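
convertTime above flattens a timedelta into total hours (with days folded in) plus the remaining minutes and seconds, presumably for the log output. A minimal usage sketch; the import path assumes the function sits in funs/TimeSlice.py next to get_Tslices:

    from datetime import timedelta
    from funs.TimeSlice import convertTime  # assumed location

    duration = timedelta(days=1, hours=2, minutes=30, seconds=15)
    hours, minutes, seconds = convertTime(duration)
    print(f'{hours}h {minutes}m {seconds}s')  # -> 26h 30m 15s
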
Michael Beck