Restructures; adds TimeSlice, ClearDupes and more comments.

This commit is contained in:
Michael Beck
2023-06-21 19:07:07 +02:00
parent 2e70d960a5
commit ea7fcc732e
7 changed files with 539 additions and 325 deletions


@@ -1,70 +1,79 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Jun 6 11:40:07 2023
Created on Thu Jun 8 01:08:21 2023
@author: michael
@author: Michael
The following files are necessary:
config.py
Used to configure everything that's needed for this script.
funs/TimeSlice.py
Function get_Tslices slices the timespan defined in config.py into N
slices. This is necessary due to possible blocking of requests by Twitter.
The script will sleep for 1 second after each slice that was scraped.
funs/ClearDupes.py
Function deDupe reads each line of inFile and removes duplicate lines.
A file outFile is saved without the duplicate lines. Generates
"keywords.txt".
data/keywords-raw.txt
Contains all keywords that are used to detect whether a tweet contains
information about Covid19.
data/senators-raw.csv
Contains the senator dataset converted to csv. It is used to get the
account names of all senators' Twitter accounts.
Requirements:
- snscrape 0.6.2.20230321+
- pandas 2.0+
The script will first import needed libraries.
This script uses snscrape version 0.6.2.20230321.dev50+g0d824ab, which is
included in 'snscrape/' as a git repository for better reproducibility. Earlier
versions of snscrape will most likely fail to scrape all tweets because of
rate limits or other errors that may occur.
config.py will check whether snscrape is already installed. If not, it will try
to install the included version automatically.
How to use:
- To run the script, first adjust the config.py file.
- config.py will check whether snscrape is already installed. If not, it will try
to install the included version automatically.
- Run the script.
- The whole script is expected to run without error messages except the
following:
'Stopping after 20 empty pages': indicates that no more tweets were found and
that the script skips to the next slice/account.
'return empty in {twitter-handle}-sliceX - from XX to XX': no tweets were
found in that specific time range for that specific Twitter account.
The script will scrape tweets for all senators in 'data/senators-raw.csv',
sliced into 6 time periods (to bypass Twitter's limitations). It will check whether
a tweet contains any of the keywords in 'data/keywords.txt' and add an indicator
to the datafile. It will then join all slices and create 'ALL-SENATORS.csv',
which is the final output.
'''
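# Illustrative sketch only (not the actual implementation): the snscrape check that
# config.py is described as performing could look roughly like this, assuming the
# bundled repository sits in './snscrape':
#   import importlib.util, subprocess, sys
#   if importlib.util.find_spec('snscrape') is None:
#       subprocess.check_call([sys.executable, '-m', 'pip', 'install', './snscrape'])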
import os
import tweepy
import pandas as pd
import numpy as np
import glob
import time
## Setup directories
# WD Michael
wd = '/home/michael/Documents/PS/Data/collectTweets/'
## Import other files
from config import *
import snscrape.modules.twitter as sntwitter
from funs.TimeSlice import get_Tslices
from funs.ClearDupes import deDupe
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection'
## Create List of time-period-slices
time_slices = get_Tslices(ts_beg, ts_end, no_slices)
# Print slices
print('Time-period-slices:')
for slice in time_slices:
print(slice['suffix'] + ': ' + slice['beg_time'] + ' - ' + slice['end_time'])
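# Illustrative sketch of funs/TimeSlice.get_Tslices (the actual implementation lives
# in funs/TimeSlice.py); it is assumed to split the timespan into N equal slices and
# return dicts with the keys used below ('beg_time', 'end_time', 'suffix'), e.g.:
#   from datetime import datetime
#   def get_Tslices(ts_beg, ts_end, n_slices):  # ts_beg/ts_end assumed ISO date strings
#       beg, end = datetime.fromisoformat(ts_beg), datetime.fromisoformat(ts_end)
#       step = (end - beg) / n_slices
#       return [{'beg_time': (beg + i * step).strftime('%Y-%m-%d'),
#                'end_time': (beg + (i + 1) * step).strftime('%Y-%m-%d'),
#                'suffix': f'-slice{i + 1}'}
#               for i in range(n_slices)]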
# WD Josie
# wd = '/home/michael/Documents/PS/Data/'
# WD Sam
# wd = '/home/michael/Documents/PS/Data/'
# Tweet-datafile directory
td = 'data/tweets/'
os.chdir(wd)
## Setup Api-connection
bearer_token = 'AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc'
client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)
# Define time period of interest
# Define time periods of interest
time_slices = [
{
'start_time': '2020-01-01T00:00:00Z',
'end_time': '2020-06-01T00:00:00Z',
'suffix': '-slice1'
},
{
'start_time': '2020-06-01T00:00:01Z',
'end_time': '2021-01-01T00:00:00Z',
'suffix': '-slice2'
},
{
'start_time': '2021-01-01T00:00:01Z',
'end_time': '2021-06-01T00:00:00Z',
'suffix': '-slice3'
},
{
'start_time': '2021-06-01T00:00:01Z',
'end_time': '2023-01-03T00:00:00Z',
'suffix': '-slice4'
}
]
# gather keywords @chenTrackingSocialMedia2020
# line80 ff: lamsalCoronavirusCOVID19Tweets2020
# Initialize the keywords list
## Keywords
keywords = []
# Remove duplicate keywords and save all non-duplicates to 'data/keywords.txt'
deDupe('data/keywords-raw.txt', 'data/keywords.txt')
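# Illustrative sketch of funs/ClearDupes.deDupe (the actual implementation lives in
# funs/ClearDupes.py); assumed behaviour: write every line of inFile to outFile,
# keeping only the first occurrence of each line:
#   def deDupe(inFile, outFile):
#       seen = set()
#       with open(inFile, 'r') as fin, open(outFile, 'w') as fout:
#           for line in fin:
#               if line not in seen:
#                   seen.add(line)
#                   fout.write(line)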
# Read the keywords from a file
with open('data/keywords.txt', 'r') as file:
lines = file.readlines()
@@ -72,42 +81,21 @@ with open('data/keywords.txt', 'r') as file:
keyword = line.strip() # Remove the newline character
keywords.append(keyword)
tweet_fields = [
'id',
'text',
'attachments',
'author_id',
'context_annotations',
'conversation_id',
'created_at',
'entities',
'geo',
'lang',
'possibly_sensitive',
'public_metrics',
'referenced_tweets',
'reply_settings',
'source',
'withheld',
]
## Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
print(accounts)
print(alt_accounts)
## Scraping
# Iterate over each Twitter account
for handle in accounts:
# Iterate over each time slice
for slice_data in time_slices:
# define slice data variables from time_slices
start_time = slice_data['start_time']
end_time = slice_data['end_time']
ts_beg = slice_data['beg_time']
ts_end = slice_data['end_time']
suffix = slice_data['suffix']
# define tweepy query with twitter handle of current sen
query = f'from:{handle} -is:retweet'
# create empty tweetlist that will be filled with tweets of current sen
tweetlist = []
@@ -115,121 +103,117 @@ for handle in accounts:
msg = f'trying to fetch tweets for {handle}{suffix}'
print(msg)
# Fetch tweets using tweepy Twitter API v2 pagination with retry mechanism
max_attempts = 3 # maximum number of attempts to fetch tweets for a slice
attempt = 1
while attempt <= max_attempts:
try:
tweets = tweepy.Paginator(client.search_all_tweets,
query=query,
tweet_fields=tweet_fields,
start_time=start_time,
end_time=end_time,
max_results=20).flatten(20)
# for each tweet returned...
for tweet in tweets:
# ... add that tweet to tweetlist
tweetlist.append(tweet)
break # exit the retry loop if tweets are successfully fetched
except tweepy.TweepError as e:
# handle rate limit exceeded error
if e.response.status_code == 429:
# get the rate limit reset time from the response headers
reset_time = int(e.response.headers['x-rate-limit-reset'])
current_time = int(time.time())
# calculate the sleep time until the rate limit resets
sleep_time = reset_time - current_time + 1 # add an extra second
# sleep until the rate limit resets
time.sleep(sleep_time)
attempt += 1 # increment the attempt counter
continue # retry the API call
else:
# handle other types of Tweepy errors
print(f'Error occurred: {e}')
break
# Snscrape query:
query = f'from:{handle} since:{ts_beg} until:{ts_end}'
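# e.g. with handle='SenSanders' and a slice from 2020-01-01 to 2020-06-01 (values
# purely illustrative), the query renders as:
# 'from:SenSanders since:2020-01-01 until:2020-06-01'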
for i,tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
if i>maxTweets:
break
tweetlist.append([
tweet.id,
tweet.user.id,
tweet.user.username,
tweet.user.verified,
tweet.user.created,
tweet.user.favouritesCount,
tweet.user.followersCount,
tweet.user.friendsCount,
tweet.user.url,
tweet.rawContent,
tweet.renderedContent,
tweet.cashtags,
tweet.coordinates,
tweet.hashtags,
tweet.inReplyToTweetId,
tweet.inReplyToUser,
tweet.media,
tweet.mentionedUsers,
tweet.links,
tweet.place,
tweet.quotedTweet,
tweet.retweetedTweet,
tweet.sourceLabel,
tweet.sourceUrl,
tweet.url,
tweet.date,
tweet.replyCount,
tweet.retweetCount,
tweet.likeCount,
tweet.quoteCount,
tweet.conversationId,
tweet.lang,
tweet.source
])
# If no tweets were fetched for the current time slice, skip to the next iteration of the time_slices loop
if len(tweetlist) == 0:
msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'
msg = f'return empty in {handle}{suffix} - from {ts_beg} to {ts_end}'
print(msg)
continue
# convert to dataframe
tweet_df = pd.DataFrame(tweetlist)
# add handle column as api only provides user-ids
tweet_df['handle'] = handle
## Extract referenced_tweet info from column
tweet_df['referenced_tweet_type'] = None
tweet_df['referenced_tweet_id'] = None
# if cond. because in some cases column doesn't exist
if 'referenced_tweets' in tweet_df.columns:
for index, row in tweet_df.iterrows():
referenced_tweets = row['referenced_tweets']
if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
referenced_tweet = referenced_tweets[0]
referenced_tweet_type = referenced_tweet['type']
referenced_tweet_id = referenced_tweet['id']
tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type
tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id
tweet_df = pd.DataFrame(tweetlist, columns=[
'id',
'user.id',
'user.username',
'user.verified',
'user.created',
'user.favouritesCount',
'user.followersCount',
'user.friendsCount',
'user.url',
'rawContent',
'renderedContent',
'cashtags',
'coordinates',
'hashtags',
'inReplyToTweetId',
'inReplyToUser',
'media',
'mentionedUsers',
'links',
'place',
'quotedTweet',
'retweetedTweet',
'sourceLabel',
'sourceUrl',
'url',
'date',
'replyCount',
'retweetCount',
'likeCount',
'quoteCount',
'conversationId',
'lang',
'source'])
## Check if tweet-text contains keyword
# if cond. because in some cases column doesn't exist
if 'text' in tweet_df.columns:
tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))
tweet_df['contains_keyword'] = ''
tweet_df['contains_keyword'] = (tweet_df['rawContent'].str.findall('|'.join(keywords))
.str.join(',')
.replace('', 'none'))
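# e.g. if keywords contains 'covid' and 'vaccine', a tweet mentioning both once gets
# contains_keyword == 'covid,vaccine'; a tweet matching no keyword gets 'none'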
## Save two versions of the dataset, one with all fields and one without dict fields
# define filepaths
csv_path = f'data/tweets/{handle}{suffix}.csv'
csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'
# save LONG csv
tweet_df.to_csv(csv_path2)
# Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files
# if cond. because in some cases column doesn't exist
if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):
tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)
csv_path = f'data/tweets/T{handle}{suffix}.csv'
# save short csv
tweet_df.to_csv(csv_path)
# sleep 1 second to not exceed the API rate limit
# sleep 1 second to not get blocked because of excessive requests
time.sleep(1)
# Merge CSV-Files
# (building a single dataframe with all senators' tweets would also have been possible, but I found this approach more useful)
path_to_tweetdfs = wd + td
## Merge CSV-Files to file_alltweets
# The fastest way is to save the slices separately and then append each file to the output, instead of using pandas or anything else.
os.chdir(path_to_tweetdfs)
tweetfiles = glob.glob('*.{}'.format('csv'))
print(tweetfiles)
# save merged csv as two files
df_all_senators = pd.DataFrame()
df_all_senators_long = pd.DataFrame()
for file in tweetfiles:
if 'LONG' in file:
df = pd.read_csv(file)
df_all_senators_long = pd.concat([df, df_all_senators_long])
else:
df = pd.read_csv(file)
df_all_senators = pd.concat([df, df_all_senators])
csv_path = td + 'ALL-SENATORS.csv'
csv_path2 = td + 'ALL-SENATORS-LONG-LONG.csv'
df_all_senators.to_csv(csv_path)
df_all_senators_long.to_csv(csv_path2)
tweetfiles = glob.glob('*.{}'.format('csv')) # get list of all csv files in folder
# If file_alltweets (previously scraped tweets merged into one file) already exists, remove it from the list so it is not included in the following merge
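# (file_alltweets is assumed to be defined in config.py, imported above via 'from config import *')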
if file_alltweets in tweetfiles:
tweetfiles.remove(file_alltweets)
# Go through all csv files and merge them into file_alltweets
with open(file_alltweets,"wb") as fout:
# first file (because of the header):
with open(tweetfiles[0], "rb") as f:
fout.write(f.read())
# other files without the header:
for file in tweetfiles[1:]:
with open(file, "rb") as f:
next(f) # skip the header
fout.write(f.read())
os.chdir(wd)