comments and reorders

collect.py (198 lines changed)
@@ -1,10 +1,10 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""
+'''
 Created on Tue Jun  6 11:40:07 2023
 
 @author: michael
-"""
+'''
 
 import os
 import tweepy
@@ -15,48 +15,48 @@ import time
 
 ## Setup directories
 # WD Michael
-wd = "/home/michael/Documents/PS/Data/collectTweets/"
+wd = '/home/michael/Documents/PS/Data/collectTweets/'
 
 # WD Server
-# wd = "/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection"
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection'
 
 # WD Josie
-# wd = "/home/michael/Documents/PS/Data/"
+# wd = '/home/michael/Documents/PS/Data/'
 
 # WD Sam
-# wd = "/home/michael/Documents/PS/Data/"
+# wd = '/home/michael/Documents/PS/Data/'
 
 # Tweet-datafile directory
-td = "data/tweets/"
+td = 'data/tweets/'
 
 os.chdir(wd)
 
 ## Setup Api-connection
-bearer_token = "AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc"
+bearer_token = 'AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc'
 client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)
 
-# Define time period of interest
+# Define time periods of interest
 # Define time periods of interest
 time_slices = [
     {
-        "start_time": "2020-01-01T00:00:00Z",
-        "end_time": "2020-06-01T00:00:00Z",
-        "suffix": "-slice1"
+        'start_time': '2020-01-01T00:00:00Z',
+        'end_time': '2020-06-01T00:00:00Z',
+        'suffix': '-slice1'
     },
     {
-        "start_time": "2020-06-01T00:00:01Z",
-        "end_time": "2021-01-01T00:00:00Z",
-        "suffix": "-slice2"
+        'start_time': '2020-06-01T00:00:01Z',
+        'end_time': '2021-01-01T00:00:00Z',
+        'suffix': '-slice2'
    },
     {
-        "start_time": "2021-01-01T00:00:01Z",
-        "end_time": "2021-06-01T00:00:00Z",
-        "suffix": "-slice3"
+        'start_time': '2021-01-01T00:00:01Z',
+        'end_time': '2021-06-01T00:00:00Z',
+        'suffix': '-slice3'
     },
     {
-        "start_time": "2021-06-01T00:00:01Z",
-        "end_time": "2023-01-03T00:00:00Z",
-        "suffix": "-slice4"
+        'start_time': '2021-06-01T00:00:01Z',
+        'end_time': '2023-01-03T00:00:00Z',
+        'suffix': '-slice4'
     }
 ]
@@ -66,95 +66,87 @@ time_slices = [
 keywords = []
 
 # Read the keywords from a file
-with open("data/keywords.txt", "r") as file:
+with open('data/keywords.txt', 'r') as file:
     lines = file.readlines()
     for line in lines:
         keyword = line.strip()  # Remove the newline character
         keywords.append(keyword)
 
 tweet_fields = [
-    "id",
-    "text",
-    "attachments",
-    "author_id",
-    "context_annotations",
-    "conversation_id",
-    "created_at",
-    "entities",
-    "geo",
-    "lang",
-    "possibly_sensitive",
-    "public_metrics",
-    "referenced_tweets",
-    "reply_settings",
-    "source",
-    "withheld",
+    'id',
+    'text',
+    'attachments',
+    'author_id',
+    'context_annotations',
+    'conversation_id',
+    'created_at',
+    'entities',
+    'geo',
+    'lang',
+    'possibly_sensitive',
+    'public_metrics',
+    'referenced_tweets',
+    'reply_settings',
+    'source',
+    'withheld',
     ]
 
 # Get accounts & alt-accounts from Senators-Datafile
-accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
-alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
+accounts = pd.read_csv('data/senators-raw.csv')['twitter_handle'].tolist()
+alt_accounts = pd.read_csv('data/senators-raw.csv')['alt_handle'].tolist()
 print(accounts)
 print(alt_accounts)
 
 # Iterate over each Twitter account
 for handle in accounts:
     for slice_data in time_slices:
-        start_time = slice_data["start_time"]
-        end_time = slice_data["end_time"]
-        suffix = slice_data["suffix"]
+        # define slice data variables from time_slices
+        start_time = slice_data['start_time']
+        end_time = slice_data['end_time']
+        suffix = slice_data['suffix']
 
-        query = "from:" + handle + " -is:retweet"
+        # define tweepy query with twitter handle of current sen
+        query = f'from:{handle} -is:retweet'
 
+        # create empty tweetlist that will be filled with tweets of current sen
         tweetlist = []
-        # Fetch tweets using Twitter API pagination
-        try:
-            for tweet in tweepy.Paginator(client.search_all_tweets,
-                                          query=query,
-                                          tweet_fields=tweet_fields,
-                                          start_time=start_time,
-                                          end_time=end_time,
-                                          max_results=100).flatten(50):
-                tweetlist.append(tweet)
-            msg = f"trying to fetch tweets for {handle}{suffix} fetched"
-            print(msg)
-        except tweepy.error.TweepError as ex:
-            timestamp = datetime.now().timestamp()
-            msg = f"{timestamp} - raised exception {handle}{suffix}: " + str(ex) + " - sleeping..."
-            print(msg)
-            time.sleep(1)
-            try:
-                for tweet in tweepy.Paginator(client.search_all_tweets,
-                                          query=query,
-                                          tweet_fields=tweet_fields,
-                                          start_time=start_time,
-                                          end_time=end_time,
-                                          max_results=100).flatten(50):
-                    tweetlist.append(tweet)
-                msg = f"2nd try: tweets for {handle}{suffix} successfully fetched"
-                print(msg)
-            except tweepy.error.TweepError as ex:
-                timestamp = datetime.now().timestamp()
-                msg = f"{timestamp} - raised exception AGAIN {handle}{suffix}: " + str(ex) + " - sleeping..."
-                print(msg)
-                time.sleep(1)
 
-        all_tweets = pd.DataFrame(tweetlist)
+        # statusmsg
+        msg = f'trying to fetch tweets for {handle}{suffix}'
+        print(msg)
 
-        # Check if no tweets fetched for the current time slice
+        # Fetch tweets using tweepy Twitter API v2 pagination
+        tweets = tweepy.Paginator(client.search_all_tweets,
+                                      query=query,
+                                      tweet_fields=tweet_fields,
+                                      start_time=start_time,
+                                      end_time=end_time,
+                                      max_results=20).flatten(20)
+
+        # for each tweet returned...
+        for tweet in tweets:
+            # ... add that tweet to tweetlist
+            tweetlist.append(tweet)
+
+        # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration
         if len(tweetlist) == 0:
-            msg = f"return empty in {handle}{suffix} - from {start_time} to {end_time}"
+            msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'
             print(msg)
             continue
 
-        all_tweets['handle'] = handle
+        # convert to dataframe
+        tweet_df = pd.DataFrame(tweetlist)
 
-        # Extract referenced_tweet info from column
-        all_tweets['referenced_tweet_type'] = None
-        all_tweets['referenced_tweet_id'] = None
+        # add handle column as api only provides user-ids
+        tweet_df['handle'] = handle
 
-        if 'referenced_tweets' in all_tweets.columns:
-            for index, row in all_tweets.iterrows():
+        ## Extract referenced_tweet info from column
+        tweet_df['referenced_tweet_type'] = None
+        tweet_df['referenced_tweet_id'] = None
+
+        # if cond. because in some cases column doesn't exist
+        if 'referenced_tweets' in tweet_df.columns:
+            for index, row in tweet_df.iterrows():
                 referenced_tweets = row['referenced_tweets']
 
                 if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:
@@ -162,28 +154,36 @@ for handle in accounts:
                     referenced_tweet_type = referenced_tweet['type']
                     referenced_tweet_id = referenced_tweet['id']
 
-                    all_tweets.at[index, 'referenced_tweet_type'] = referenced_tweet_type
-                    all_tweets.at[index, 'referenced_tweet_id'] = referenced_tweet_id
+                    tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type
+                    tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id
 
-        # Check if tweet contains keyword
-        if 'text' in all_tweets.columns:
-            all_tweets['contains_keyword'] = (all_tweets['text'].str.findall('|'.join(keywords))
+        ## Check if tweet-text contains keyword
+        # if cond. because in some cases column doesn't exist
+        if 'text' in tweet_df.columns:
+            tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))
                                               .str.join(',')
                                               .replace('', 'none'))
 
-        # Save two versions of the dataset, one with all fields and one without dict fields
-        csv_path = f"data/tweets/{handle}{suffix}.csv"
-        csv_path2 = f"data/tweets/{handle}{suffix}-LONG.csv"
-        all_tweets.to_csv(csv_path2)
-        all_tweets = all_tweets.drop(["context_annotations", "entities", "referenced_tweets"], axis=1)
-        all_tweets.to_csv(csv_path)
-        time.sleep(1) # sleep 1 second to not get over api limit
+        ## Save two versions of the dataset, one with all fields and one without dict fields
+        # define filepaths
+        csv_path = f'data/tweets/{handle}{suffix}.csv'
+        csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'
+        # save LONG csv
+        tweet_df.to_csv(csv_path2)
+        # Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files
+        # if cond. because in some cases column doesn't exist
+        if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):
+            tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)
+        # save short csv
+        tweet_df.to_csv(csv_path)
+        # sleep 1 second to not get over 1sec api limit
+        time.sleep(1)
 
 # Merge CSV-Files
 # (it would also have been a possibility to build a dataframe with all senators' tweets but i found the other way around more useful)
 path_to_tweetdfs = wd + td
 os.chdir(path_to_tweetdfs)
-tweetfiles = glob.glob('*.{}'.format("csv"))
+tweetfiles = glob.glob('*.{}'.format('csv'))
 
 print(tweetfiles)
 
@@ -191,14 +191,14 @@ print(tweetfiles)
 df_all_senators = pd.DataFrame()
 df_all_senators_long = pd.DataFrame()
 for file in tweetfiles:
-    if "LONG" in file:
+    if 'LONG' in file:
         df = pd.read_csv(file)
         df_all_senators_long = pd.concat([df, df_all_senators_long])
     else:
         df = pd.read_csv(file)
         df_all_senators = pd.concat([df, df_all_senators])
-csv_path = td + "ALL-SENATORS.csv"
-csv_path2 = td + "ALL-SENATORS-LONG-LONG.csv"
+csv_path = td + 'ALL-SENATORS.csv'
+csv_path2 = td + 'ALL-SENATORS-LONG-LONG.csv'
 df_all_senators.to_csv(csv_path)
 df_all_senators_long.to_csv(csv_path2)
 
Michael Beck