comments and reorders
collect.py
@@ -1,10 +1,10 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""
+'''
 Created on Tue Jun  6 11:40:07 2023
 
 @author: michael
-"""
+'''
 
 import os
 import tweepy
@@ -15,48 +15,48 @@ import time
 
 ## Setup directories
 # WD Michael
-wd = "/home/michael/Documents/PS/Data/collectTweets/"
+wd = '/home/michael/Documents/PS/Data/collectTweets/'
 
 # WD Server
-# wd = "/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection"
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection'
 
 # WD Josie
-# wd = "/home/michael/Documents/PS/Data/"
+# wd = '/home/michael/Documents/PS/Data/'
 
 # WD Sam
-# wd = "/home/michael/Documents/PS/Data/"
+# wd = '/home/michael/Documents/PS/Data/'
 
 # Tweet-datafile directory
-td = "data/tweets/"
+td = 'data/tweets/'
 
 os.chdir(wd)
 
 ## Setup Api-connection
-bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc"
+bearer_token = 'AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc'
 client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)
 
-# Define time period of interest
+# Define time periods of interest
 time_slices = [
     {
-        "start_time": "2020-01-01T00:00:00Z",
-        "end_time": "2020-06-01T00:00:00Z",
-        "suffix": "-slice1"
+        'start_time': '2020-01-01T00:00:00Z',
+        'end_time': '2020-06-01T00:00:00Z',
+        'suffix': '-slice1'
     },
     {
-        "start_time": "2020-06-01T00:00:01Z",
-        "end_time": "2021-01-01T00:00:00Z",
-        "suffix": "-slice2"
+        'start_time': '2020-06-01T00:00:01Z',
+        'end_time': '2021-01-01T00:00:00Z',
+        'suffix': '-slice2'
     },
     {
-        "start_time": "2021-01-01T00:00:01Z",
-        "end_time": "2021-06-01T00:00:00Z",
-        "suffix": "-slice3"
+        'start_time': '2021-01-01T00:00:01Z',
+        'end_time': '2021-06-01T00:00:00Z',
+        'suffix': '-slice3'
     },
     {
-        "start_time": "2021-06-01T00:00:01Z",
-        "end_time": "2023-01-03T00:00:00Z",
-        "suffix": "-slice4"
+        'start_time': '2021-06-01T00:00:01Z',
+        'end_time': '2023-01-03T00:00:00Z',
+        'suffix': '-slice4'
     }
 ]
 
@@ -66,95 +66,87 @@ time_slices = [
 keywords = []
 
 # Read the keywords from a file
-with open("data/keywords.txt", "r") as file:
+with open('data/keywords.txt', 'r') as file:
     lines = file.readlines()
     for line in lines:
         keyword = line.strip()  # Remove the newline character
         keywords.append(keyword)
 
 tweet_fields = [
-    "id",
-    "text",
-    "attachments",
-    "author_id",
-    "context_annotations",
-    "conversation_id",
-    "created_at",
-    "entities",
-    "geo",
-    "lang",
-    "possibly_sensitive",
-    "public_metrics",
-    "referenced_tweets",
-    "reply_settings",
-    "source",
-    "withheld",
+    'id',
+    'text',
+    'attachments',
+    'author_id',
+    'context_annotations',
+    'conversation_id',
+    'created_at',
+    'entities',
+    'geo',
+    'lang',
+    'possibly_sensitive',
+    'public_metrics',
+    'referenced_tweets',
+    'reply_settings',
+    'source',
+    'withheld',
     ]
 
 # Get accounts & alt-accounts from Senators-Datafile
-accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
-alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
+accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
+alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
 print(accounts)
 print(alt_accounts)
 
 # Iterate over each Twitter account
 for handle in accounts:
     for slice_data in time_slices:
-        start_time = slice_data["start_time"]
-        end_time = slice_data["end_time"]
-        suffix = slice_data["suffix"]
+        # define slice data variables from time_slices
+        start_time = slice_data['start_time']
+        end_time = slice_data['end_time']
+        suffix = slice_data['suffix']
 
-        query = "from:" + handle + " -is:retweet"
+        # define tweepy query with twitter handle of current sen
+        query = f'from:{handle} -is:retweet'
 
+        # create empty tweetlist that will be filled with tweets of current sen
         tweetlist = []
-        # Fetch tweets using Twitter API pagination
-        try:
-            for tweet in tweepy.Paginator(client.search_all_tweets,
-                                          query=query,
-                                          tweet_fields=tweet_fields,
-                                          start_time=start_time,
-                                          end_time=end_time,
-                                          max_results=100).flatten(50):
-                tweetlist.append(tweet)
-            msg = f"trying to fetch tweets for {handle}{suffix} fetched"
-            print(msg)
-        except tweepy.error.TweepError as ex:
-            timestamp = datetime.now().timestamp()
-            msg = f"{timestamp} - raised exception {handle}{suffix}: " + str(ex) + " - sleeping..."
-            print(msg)
-            time.sleep(1)
-            try:
-                for tweet in tweepy.Paginator(client.search_all_tweets,
-                                          query=query,
-                                          tweet_fields=tweet_fields,
-                                          start_time=start_time,
-                                          end_time=end_time,
-                                          max_results=100).flatten(50):
-                    tweetlist.append(tweet)
-                msg = f"2nd try: tweets for {handle}{suffix} successfully fetched"
-                print(msg)
-            except tweepy.error.TweepError as ex:
-                timestamp = datetime.now().timestamp()
-                msg = f"{timestamp} - raised exception AGAIN {handle}{suffix}: " + str(ex) + " - sleeping..."
-                print(msg)
-                time.sleep(1)
 
-        all_tweets = pd.DataFrame(tweetlist)
+        # statusmsg
+        msg = f'trying to fetch tweets for {handle}{suffix}'
+        print(msg)
 
-        # Check if no tweets fetched for the current time slice
+        # Fetch tweets using tweepy Twitter API v2 pagination
+        tweets = tweepy.Paginator(client.search_all_tweets,
+                                      query=query,
+                                      tweet_fields=tweet_fields,
+                                      start_time=start_time,
+                                      end_time=end_time,
+                                      max_results=20).flatten(20)
+
+        # for each tweet returned...
+        for tweet in tweets:
+            # ... add that tweet to tweetlist
+            tweetlist.append(tweet)
+
+        # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration
         if len(tweetlist) == 0:
-            msg = f"return empty in {handle}{suffix} - from {start_time} to {end_time}"
+            msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'
             print(msg)
             continue
 
-        all_tweets['handle'] = handle
+        # convert to dataframe
+        tweet_df = pd.DataFrame(tweetlist)
 
-        # Extract referenced_tweet info from column
-        all_tweets['referenced_tweet_type'] = None
-        all_tweets['referenced_tweet_id'] = None
+        # add handle column as api only provides user-ids
+        tweet_df['handle'] = handle
 
-        if 'referenced_tweets' in all_tweets.columns:
-            for index, row in all_tweets.iterrows():
+        ## Extract referenced_tweet info from column
+        tweet_df['referenced_tweet_type'] = None
+        tweet_df['referenced_tweet_id'] = None
+
+        # if cond. because in some cases column doesn't exist
+        if 'referenced_tweets' in tweet_df.columns:
+            for index, row in tweet_df.iterrows():
                 referenced_tweets = row['referenced_tweets']
 
                 if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
@@ -162,28 +154,36 @@ for handle in accounts:
                     referenced_tweet_type = referenced_tweet['type']
                     referenced_tweet_id = referenced_tweet['id']
 
-                    all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
-                    all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id
+                    tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type
+                    tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id
 
-        # Check if tweet contains keyword
-        if 'text' in all_tweets.columns:
-            all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
+        ## Check if tweet-text contains keyword
+        # if cond. because in some cases column doesn't exist
+        if 'text' in tweet_df.columns:
+            tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))
                                               .str.join(',')
                                               .replace('', 'none'))
 
-        # Save two versions of the dataset, one with all fields and one without dict fields
-        csv_path = f"data/tweets/{handle}{suffix}.csv"
-        csv_path2 = f"data/tweets/{handle}{suffix}-LONG.csv"
-        all_tweets.to_csv(csv_path2)
-        all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"], axis=1)
-        all_tweets.to_csv(csv_path)
-        time.sleep(1) # sleep 1 second to not get over api limit
+        ## Save two versions of the dataset, one with all fields and one without dict fields
+        # define filepaths
+        csv_path = f'data/tweets/{handle}{suffix}.csv'
+        csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'
+        # save LONG csv
+        tweet_df.to_csv(csv_path2)
+        # Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files
+        # if cond. because in some cases column doesn't exist
+        if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):
+            tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)
+        # save short csv
+        tweet_df.to_csv(csv_path)
+        # sleep 1 second to not get over 1sec api limit
+        time.sleep(1)
 
 # Merge CSV-Files
 # (it would also have been a possibility to build a dataframe with all senators' tweets but i found the other way around more useful)
 path_to_tweetdfs = wd + td
 os.chdir(path_to_tweetdfs)
-tweetfiles = glob.glob('*.{}'.format("csv"))
+tweetfiles = glob.glob('*.{}'.format('csv'))
 
 print(tweetfiles)
 
@@ -191,14 +191,14 @@ print(tweetfiles)
 df_all_senators = pd.DataFrame()
 df_all_senators_long = pd.DataFrame()
 for file in tweetfiles:
-    if "LONG" in file:
+    if 'LONG' in file:
         df = pd.read_csv(file)
         df_all_senators_long = pd.concat([df, df_all_senators_long])
     else:
         df = pd.read_csv(file)
         df_all_senators = pd.concat([df, df_all_senators])
-csv_path = td + "ALL-SENATORS.csv"
-csv_path2 = td + "ALL-SENATORS-LONG-LONG.csv"
+csv_path = td + 'ALL-SENATORS.csv'
+csv_path2 = td + 'ALL-SENATORS-LONG-LONG.csv'
 df_all_senators.to_csv(csv_path)
 df_all_senators_long.to_csv(csv_path2)
 
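The contains_keyword expression visible in the diff tags each tweet with every keyword that appears in its text, comma-separated, and falls back to 'none' when nothing matches. A minimal standalone sketch of that pattern follows; the keyword list and tweet texts are hypothetical stand-ins for data/keywords.txt and the fetched tweets.

import pandas as pd

# hypothetical stand-ins for data/keywords.txt and the tweets returned by the API
keywords = ['climate', 'healthcare']
tweet_df = pd.DataFrame({'text': [
    'Town hall on climate and healthcare tonight',
    'Happy birthday to my colleague!',
]})

# findall collects every keyword match per tweet, str.join flattens each list
# into a comma-separated string, and replace marks tweets with no match
tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))
                                .str.join(',')
                                .replace('', 'none'))

print(tweet_df['contains_keyword'].tolist())
# ['climate,healthcare', 'none']

Because the joined keywords form a plain regex alternation, substring hits (e.g. 'climate' inside 'climates') also count as matches.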