comments and reorders
collect.py
@@ -1,10 +1,10 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""
+'''
 Created on Tue Jun  6 11:40:07 2023
 
 @author: michael
-"""
+'''
 
 import os
 import tweepy
@@ -15,48 +15,48 @@ import time
 
 ## Setup directories
 # WD Michael
-wd = "/home/michael/Documents/PS/Data/collectTweets/"
+wd = '/home/michael/Documents/PS/Data/collectTweets/'
 
 # WD Server
-# wd = "/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection"
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection'
 
 # WD Josie
-# wd = "/home/michael/Documents/PS/Data/"
+# wd = '/home/michael/Documents/PS/Data/'
 
 # WD Sam
-# wd = "/home/michael/Documents/PS/Data/"
+# wd = '/home/michael/Documents/PS/Data/'
 
 # Tweet-datafile directory
-td = "data/tweets/"
+td = 'data/tweets/'
 
 os.chdir(wd)
 
 ## Setup Api-connection
-bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc"
+bearer_token = 'AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc'
 client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)
 
-# Define time period of interest
+# Define time periods of interest
 time_slices = [
     {
-        "start_time": "2020-01-01T00:00:00Z",
-        "end_time": "2020-06-01T00:00:00Z",
-        "suffix": "-slice1"
+        'start_time': '2020-01-01T00:00:00Z',
+        'end_time': '2020-06-01T00:00:00Z',
+        'suffix': '-slice1'
     },
     {
-        "start_time": "2020-06-01T00:00:01Z",
-        "end_time": "2021-01-01T00:00:00Z",
-        "suffix": "-slice2"
+        'start_time': '2020-06-01T00:00:01Z',
+        'end_time': '2021-01-01T00:00:00Z',
+        'suffix': '-slice2'
     },
     {
-        "start_time": "2021-01-01T00:00:01Z",
-        "end_time": "2021-06-01T00:00:00Z",
-        "suffix": "-slice3"
+        'start_time': '2021-01-01T00:00:01Z',
+        'end_time': '2021-06-01T00:00:00Z',
+        'suffix': '-slice3'
     },
     {
-        "start_time": "2021-06-01T00:00:01Z",
-        "end_time": "2023-01-03T00:00:00Z",
-        "suffix": "-slice4"
+        'start_time': '2021-06-01T00:00:01Z',
+        'end_time': '2023-01-03T00:00:00Z',
+        'suffix': '-slice4'
     }
 ]
 
@@ -66,95 +66,87 @@ time_slices = [
 keywords = []
 
 # Read the keywords from a file
-with open("data/keywords.txt", "r") as file:
+with open('data/keywords.txt', 'r') as file:
     lines = file.readlines()
     for line in lines:
         keyword = line.strip()  # Remove the newline character
         keywords.append(keyword)
 
 tweet_fields = [
-    "id",
-    "text",
-    "attachments",
-    "author_id",
-    "context_annotations",
-    "conversation_id",
-    "created_at",
-    "entities",
-    "geo",
-    "lang",
-    "possibly_sensitive",
-    "public_metrics",
-    "referenced_tweets",
-    "reply_settings",
-    "source",
-    "withheld",
+    'id',
+    'text',
+    'attachments',
+    'author_id',
+    'context_annotations',
+    'conversation_id',
+    'created_at',
+    'entities',
+    'geo',
+    'lang',
+    'possibly_sensitive',
+    'public_metrics',
+    'referenced_tweets',
+    'reply_settings',
+    'source',
+    'withheld',
     ]
 
 # Get accounts & alt-accounts from Senators-Datafile
-accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
-alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
+accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
+alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
 print(accounts)
 print(alt_accounts)
 
 # Iterate over each Twitter account
 for handle in accounts:
     for slice_data in time_slices:
-        start_time = slice_data["start_time"]
-        end_time = slice_data["end_time"]
-        suffix = slice_data["suffix"]
+        # define slice data variables from time_slices
+        start_time = slice_data['start_time']
+        end_time = slice_data['end_time']
+        suffix = slice_data['suffix']
 
-        query = "from:" + handle + " -is:retweet"
+        # define tweepy query with twitter handle of current sen
+        query = f'from:{handle} -is:retweet'
 
+        # create empty tweetlist that will be filled with tweets of current sen
         tweetlist = []
-        # Fetch tweets using Twitter API pagination
-        try:
-            for tweet in tweepy.Paginator(client.search_all_tweets,
-                                          query=query,
-                                          tweet_fields=tweet_fields,
-                                          start_time=start_time,
-                                          end_time=end_time,
-                                          max_results=100).flatten(50):
-                tweetlist.append(tweet)
-            msg = f"trying to fetch tweets for {handle}{suffix} fetched"
-            print(msg)
-        except tweepy.error.TweepError as ex:
-            timestamp = datetime.now().timestamp()
-            msg = f"{timestamp} - raised exception {handle}{suffix}: " + str(ex) + " - sleeping..."
-            print(msg)
-            time.sleep(1)
-            try:
-                for tweet in tweepy.Paginator(client.search_all_tweets,
-                                          query=query,
-                                          tweet_fields=tweet_fields,
-                                          start_time=start_time,
-                                          end_time=end_time,
-                                          max_results=100).flatten(50):
-                    tweetlist.append(tweet)
-                msg = f"2nd try: tweets for {handle}{suffix} successfully fetched"
-                print(msg)
-            except tweepy.error.TweepError as ex:
-                timestamp = datetime.now().timestamp()
-                msg = f"{timestamp} - raised exception AGAIN {handle}{suffix}: " + str(ex) + " - sleeping..."
-                print(msg)
-                time.sleep(1)
 
-        all_tweets = pd.DataFrame(tweetlist)
+        # statusmsg
+        msg = f'trying to fetch tweets for {handle}{suffix}'
+        print(msg)
 
-        # Check if no tweets fetched for the current time slice
+        # Fetch tweets using tweepy Twitter API v2 pagination
+        tweets = tweepy.Paginator(client.search_all_tweets,
+                                      query=query,
+                                      tweet_fields=tweet_fields,
+                                      start_time=start_time,
+                                      end_time=end_time,
+                                      max_results=20).flatten(20)
+
+        # for each tweet returned...
+        for tweet in tweets:
+            # ... add that tweet to tweetlist
+            tweetlist.append(tweet)
+
+        # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration
         if len(tweetlist) == 0:
-            msg = f"return empty in {handle}{suffix} - from {start_time} to {end_time}"
+            msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'
             print(msg)
             continue
 
-        all_tweets['handle'] = handle
+        # convert to dataframe
+        tweet_df = pd.DataFrame(tweetlist)
 
-        # Extract referenced_tweet info from column
-        all_tweets['referenced_tweet_type'] = None
-        all_tweets['referenced_tweet_id'] = None
+        # add handle column as api only provides user-ids
+        tweet_df['handle'] = handle
 
-        if 'referenced_tweets' in all_tweets.columns:
-            for index, row in all_tweets.iterrows():
+        ## Extract referenced_tweet info from column
+        tweet_df['referenced_tweet_type'] = None
+        tweet_df['referenced_tweet_id'] = None
+
+        # if cond. because in some cases column doesn't exist
+        if 'referenced_tweets' in tweet_df.columns:
+            for index, row in tweet_df.iterrows():
                 referenced_tweets = row['referenced_tweets']
 
                 if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
@@ -162,28 +154,36 @@ for handle in accounts:
                     referenced_tweet_type = referenced_tweet['type']
                     referenced_tweet_id = referenced_tweet['id']
 
-                    all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
-                    all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id
+                    tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type
+                    tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id
 
-        # Check if tweet contains keyword
-        if 'text' in all_tweets.columns:
-            all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
+        ## Check if tweet-text contains keyword
+        # if cond. because in some cases column doesn't exist
+        if 'text' in tweet_df.columns:
+            tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))
                                               .str.join(',')
                                               .replace('', 'none'))
 
-        # Save two versions of the dataset, one with all fields and one without dict fields
-        csv_path = f"data/tweets/{handle}{suffix}.csv"
-        csv_path2 = f"data/tweets/{handle}{suffix}-LONG.csv"
-        all_tweets.to_csv(csv_path2)
-        all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"], axis=1)
-        all_tweets.to_csv(csv_path)
-        time.sleep(1) # sleep 1 second to not get over api limit
+        ## Save two versions of the dataset, one with all fields and one without dict fields
+        # define filepaths
+        csv_path = f'data/tweets/{handle}{suffix}.csv'
+        csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'
+        # save LONG csv
+        tweet_df.to_csv(csv_path2)
+        # Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files
+        # if cond. because in some cases column doesn't exist
+        if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):
+            tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)
+        # save short csv
+        tweet_df.to_csv(csv_path)
+        # sleep 1 second to not get over 1sec api limit
+        time.sleep(1)
 
 # Merge CSV-Files
 # (it would also have been a possibility to build a dataframe with all senators' tweets but i found the other way around more useful)
 path_to_tweetdfs = wd + td
 os.chdir(path_to_tweetdfs)
-tweetfiles = glob.glob('*.{}'.format("csv"))
+tweetfiles = glob.glob('*.{}'.format('csv'))
 
 print(tweetfiles)
 
@@ -191,14 +191,14 @@ print(tweetfiles)
 df_all_senators = pd.DataFrame()
 df_all_senators_long = pd.DataFrame()
 for file in tweetfiles:
-    if "LONG" in file:
+    if 'LONG' in file:
         df = pd.read_csv(file)
         df_all_senators_long = pd.concat([df, df_all_senators_long])
     else:
         df = pd.read_csv(file)
         df_all_senators = pd.concat([df, df_all_senators])
-csv_path = td + "ALL-SENATORS.csv"
-csv_path2 = td + "ALL-SENATORS-LONG-LONG.csv"
+csv_path = td + 'ALL-SENATORS.csv'
+csv_path2 = td + 'ALL-SENATORS-LONG-LONG.csv'
 df_all_senators.to_csv(csv_path)
 df_all_senators_long.to_csv(csv_path2)
 
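The contains_keyword expression visible in the diff tags each tweet with every keyword that appears in its text, comma-separated, and falls back to 'none' when nothing matches. A minimal standalone sketch of that pattern follows; the keyword list and tweet texts are hypothetical stand-ins for data/keywords.txt and the fetched tweets.

import pandas as pd

# hypothetical stand-ins for data/keywords.txt and the tweets returned by the API
keywords = ['climate', 'healthcare']
tweet_df = pd.DataFrame({'text': [
    'Town hall on climate and healthcare tonight',
    'Happy birthday to my colleague!',
]})

# findall collects every keyword match per tweet, str.join flattens each list
# into a comma-separated string, and replace marks tweets with no match
tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))
                                .str.join(',')
                                .replace('', 'none'))

print(tweet_df['contains_keyword'].tolist())
# ['climate,healthcare', 'none']

Because the joined keywords form a plain regex alternation, substring hits (e.g. 'climate' inside 'climates') also count as matches.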