adds
.gitignore (2 changes, vendored)
							| @@ -1,3 +1,5 @@ | ||||
| **/log* | ||||
| **/*-slice*.csv | ||||
| /ALL-SENATORS-LONG.csv | ||||
| /ALL-SENATORS.csv | ||||
| /collect2.py | ||||
|   | ||||
							
								
								
									
collect.py (170 changes)
							| @@ -59,8 +59,79 @@ import time | ||||
| import sys | ||||
| from datetime import datetime | ||||
|  | ||||
| ## Setup directories | ||||
| # WD Michael | ||||
| wd = '/home/michael/Documents/PS/Data/collectTweets/' | ||||
| # WD Server | ||||
| # wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/' | ||||
|  | ||||
| # Tweet-datafile output directory | ||||
| td = 'data/tweets/' | ||||
|  | ||||
| # Name of file that all tweets will be written to | ||||
| file_alltweets = 'ALL-SENATORS-TWEETS.csv' | ||||
|  | ||||
| path_to_tweetdfs = wd + td | ||||
|  | ||||
| ## Define Timespan  | ||||
| # Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes) | ||||
| ts_beg = '2020-01-01T00:00:00Z' # start of scraping | ||||
| ts_end = '2023-01-03T00:00:00Z' # end of scraping | ||||
| no_slices = 24 # Number of slices / time periods. | ||||
|  | ||||
| # Maximum tweets to be scraped by snscrape. Can be left untouched. | ||||
| maxTweets = 5000 | ||||
|  | ||||
| # Name of logfile | ||||
| logfile = 'log/log_' | ||||
|  | ||||
|  | ||||
| ## Install snscrape from local git repo to make sure that it fits the used version. | ||||
| # If snscrape is already installed, uncomment the following lines: | ||||
| '''  | ||||
| import subprocess | ||||
| os.chdir('snscrape/') | ||||
| subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.']) | ||||
| os.chdir(wd)  | ||||
| ''' | ||||
|  | ||||
| # Columns for tweet dataframe | ||||
| tweetDFColumns = [ | ||||
|             'id',  | ||||
|             'user.id',  | ||||
|             'user.username', | ||||
|             'user.verified', | ||||
|             'user.created', | ||||
|             'user.favouritesCount', | ||||
|             'user.followersCount', | ||||
|             'user.friendsCount', | ||||
|             'user.url', | ||||
|             'rawContent',  | ||||
|             'renderedContent',  | ||||
|             'cashtags',  | ||||
|             'coordinates',  | ||||
|             'hashtags',  | ||||
|             'inReplyToTweetId',  | ||||
|             'inReplyToUser',  | ||||
|             'media',  | ||||
|             'mentionedUsers',  | ||||
|             'links',  | ||||
|             'place',  | ||||
|             'quotedTweet',  | ||||
|             'retweetedTweet',  | ||||
|             'sourceLabel',  | ||||
|             'sourceUrl',  | ||||
|             'url',  | ||||
|             'date',  | ||||
|             'replyCount',  | ||||
|             'retweetCount',  | ||||
|             'likeCount',  | ||||
|             'quoteCount',  | ||||
|             'conversationId',  | ||||
|             'lang',  | ||||
|             'source'] | ||||
|  | ||||
| ## Import other files | ||||
| from config import * | ||||
| import snscrape.modules.twitter as sntwitter | ||||
| from funs.TimeSlice import * | ||||
| from funs.ClearDupes import deDupe | ||||
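
The tweetDFColumns list defined above is later walked for every tweet to pull the matching attribute off the tweet object (second hunk below, which does this via eval(f'tweet.{col}')). A minimal sketch, assuming only a generic object with nested attributes, of how such dotted names could be resolved without eval; resolve_column is a hypothetical helper, not part of the commit:

    # Sketch only: resolve a dotted column name like 'user.username'
    # by walking nested attributes instead of calling eval().
    from functools import reduce

    def resolve_column(obj, dotted_name):
        # 'user.username' -> getattr(getattr(obj, 'user'), 'username')
        return reduce(getattr, dotted_name.split('.'), obj)

    # e.g. row = [resolve_column(tweet, col) for col in tweetDFColumns]
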
| @@ -115,110 +186,49 @@ for handle in accounts: | ||||
|         ts_beg = slice_data['beg_time'] | ||||
|         ts_end = slice_data['end_time'] | ||||
|         suffix = slice_data['suffix'] | ||||
|         tweetFileName = f"Tweets-{handle}{suffix}.csv" | ||||
|          | ||||
|         # create empty tweetlist that will be filled with tweets of current sen | ||||
|         tweetlist = [] | ||||
|         TweetList = [] | ||||
|          | ||||
|         # statusmsg | ||||
|         msg = f'trying to fetch tweets for {handle}{suffix}' | ||||
|         print(msg) | ||||
|         print(f'Fetching: {handle:>15}{suffix:<7} - from {ts_beg} to {ts_end}') | ||||
|          | ||||
|         # Snscrape query: | ||||
|         query = f'from:{handle} since:{ts_beg} until:{ts_end}' | ||||
|         for i,tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()): | ||||
|             singleTweetList = [] | ||||
|             if i>maxTweets: | ||||
|                 break | ||||
|             tweetlist.append([ | ||||
|                 tweet.id, | ||||
|                 tweet.user.id, | ||||
|                 tweet.user.username, | ||||
|                 tweet.user.verified, | ||||
|                 tweet.user.created, | ||||
|                 tweet.user.favouritesCount, | ||||
|                 tweet.user.followersCount, | ||||
|                 tweet.user.friendsCount, | ||||
|                 tweet.user.url, | ||||
|                 tweet.rawContent, | ||||
|                 tweet.renderedContent, | ||||
|                 tweet.cashtags, | ||||
|                 tweet.coordinates, | ||||
|                 tweet.hashtags, | ||||
|                 tweet.inReplyToTweetId, | ||||
|                 tweet.inReplyToUser, | ||||
|                 tweet.media, | ||||
|                 tweet.mentionedUsers, | ||||
|                 tweet.links, | ||||
|                 tweet.place, | ||||
|                 tweet.quotedTweet, | ||||
|                 tweet.retweetedTweet, | ||||
|                 tweet.sourceLabel, | ||||
|                 tweet.sourceUrl, | ||||
|                 tweet.url, | ||||
|                 tweet.date, | ||||
|                 tweet.replyCount, | ||||
|                 tweet.retweetCount, | ||||
|                 tweet.likeCount, | ||||
|                 tweet.quoteCount, | ||||
|                 tweet.conversationId, | ||||
|                 tweet.lang, | ||||
|                 tweet.source | ||||
|             ]) | ||||
|             # get tweet vars from tweetDFColumns and append to singleTweetList | ||||
|             # which will then be appended to TweetList. TweetList contains all tweets of the current slice. | ||||
|             for col in tweetDFColumns: | ||||
|                 singleTweetList.append(eval(f'tweet.{col}'))  | ||||
|             TweetList.append(singleTweetList) | ||||
|         # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration | ||||
|         if len(tweetlist) == 0: | ||||
|         if len(TweetList) == 0: | ||||
|             msg = f'return empty in {handle}{suffix} - from {ts_beg} to {ts_end}' | ||||
|             open(file, 'a').close() | ||||
|             print(msg) | ||||
|             continue | ||||
|          | ||||
|         print(f'{i:<6} tweets scraped for: {handle:>15}{suffix:<7}') | ||||
|          | ||||
|         # convert to dataframe | ||||
|         tweet_df = pd.DataFrame(tweetlist, columns=[ | ||||
|             'id',  | ||||
|             'user.id',  | ||||
|             'user.username', | ||||
|             'user.verified', | ||||
|             'user.created', | ||||
|             'user.favouritesCount', | ||||
|             'user.followersCount', | ||||
|             'user.friendsCount', | ||||
|             'user.url', | ||||
|             'rawContent',  | ||||
|             'renderedContent',  | ||||
|             'cashtags',  | ||||
|             'coordinates',  | ||||
|             'hashtags',  | ||||
|             'inReplyToTweetId',  | ||||
|             'inReplyToUser',  | ||||
|             'media',  | ||||
|             'mentionedUsers',  | ||||
|             'links',  | ||||
|             'place',  | ||||
|             'quotedTweet',  | ||||
|             'retweetedTweet',  | ||||
|             'sourceLabel',  | ||||
|             'sourceUrl',  | ||||
|             'url',  | ||||
|             'date',  | ||||
|             'replyCount',  | ||||
|             'retweetCount',  | ||||
|             'likeCount',  | ||||
|             'quoteCount',  | ||||
|             'conversationId',  | ||||
|             'lang',  | ||||
|             'source']) | ||||
|         tweet_df = pd.DataFrame(TweetList, columns=tweetDFColumns) | ||||
|          | ||||
|         ## Check if tweet-text contains keyword | ||||
|         tweet_df['contains_keyword'] = '' | ||||
|         tweet_df['contains_keyword'] = (tweet_df['rawContent'].str.findall('|'.join(keywords)) | ||||
|                                               .str.join(',') | ||||
|                                               .replace('', 'none')) | ||||
|         tweet_df['contains_keyword'] = (tweet_df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', 'none')) | ||||
|         ## Save two versions of the dataset, one with all fields and one without dict fields | ||||
|         # define filepaths | ||||
|         csv_path = f'data/tweets/T{handle}{suffix}.csv' | ||||
|         csv_path = td + tweetFileName | ||||
|         # save short csv | ||||
|         tweet_df.to_csv(csv_path) | ||||
|         # sleep 1 second to not get blocked because of excessive requests | ||||
|         time.sleep(1) | ||||
|         time.sleep(0.5) | ||||
|  | ||||
| timeEndScrape = datetime.now() | ||||
| print("---") | ||||
| print("End of scraping at:") | ||||
| print(timeEndScrape.strftime('%Y-%m-%d_%H-%M-%S')) | ||||
|   | ||||
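
The contains_keyword step in the hunk above chains three pandas string operations: str.findall() with the keywords joined by '|' collects every match per tweet, str.join(',') flattens each match list into one string, and replace('', 'none') marks tweets without any hit. A small sketch with made-up data; the keywords list is defined elsewhere in the script (not shown in this diff) and the values here are purely illustrative:

    import pandas as pd

    keywords = ['climate', 'senate']  # hypothetical values
    df = pd.DataFrame({'rawContent': ['climate bill reaches the senate',
                                      'hello world']})

    df['contains_keyword'] = (df['rawContent']
                              .str.findall('|'.join(keywords))
                              .str.join(',')
                              .replace('', 'none'))

    print(df['contains_keyword'].tolist())  # ['climate,senate', 'none']
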
							
								
								
									
config.py (45 changes)
							| @@ -1,45 +0,0 @@ | ||||
| #!/usr/bin/env python3 | ||||
| # -*- coding: utf-8 -*- | ||||
| ''' | ||||
| Created on Wed Jun 21 13:58:42 2023 | ||||
|  | ||||
| @author: michael | ||||
| ''' | ||||
|  | ||||
| ## Setup directories | ||||
| # WD Michael | ||||
| wd = '/home/michael/Documents/PS/Data/collectTweets/' | ||||
| # WD Server | ||||
| # wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/' | ||||
|  | ||||
| # Tweet-datafile output directory | ||||
| td = 'data/tweets/' | ||||
|  | ||||
| # Name of file that all tweets will be written to | ||||
| file_alltweets = 'ALL-SENATORS-TWEETS.csv' | ||||
|  | ||||
| path_to_tweetdfs = wd + td | ||||
|  | ||||
| ## Define Timespan  | ||||
| # Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes) | ||||
| ts_beg = '2020-01-01T00:00:00Z' # start of scraping | ||||
| ts_end = '2023-01-03T00:00:00Z' # end of scraping | ||||
| no_slices = 24 # Number of slices / time periods. | ||||
|  | ||||
| # Maximum tweets to be scraped by snscrape. Can be left untouched. | ||||
| maxTweets = 5000 | ||||
|  | ||||
| # Name of logfile | ||||
| logfile = 'log/log_' | ||||
|  | ||||
|  | ||||
| ## Install snscrape from local git repo to make sure that it fits the used version. | ||||
| # If snscrape is already installed, uncomment the following lines: | ||||
| '''  | ||||
| import subprocess | ||||
| os.chdir('snscrape/') | ||||
| subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.']) | ||||
| os.chdir(wd)  | ||||
| ''' | ||||
|  | ||||
|  | ||||
funs/TimeSlice.py
| @@ -21,4 +21,12 @@ def get_Tslices(ts_beg, ts_end, no_slices): | ||||
|                 'end_time': (ts_beg + ts_dif * i + ts_dif - timedelta(microseconds=1)).strftime('%Y-%m-%dT%H:%M:%SZ'), | ||||
|                 'suffix': f'-slice{i+1}' | ||||
|             }) | ||||
|     return time_slices | ||||
|     return time_slices | ||||
|  | ||||
| # For log time conversions (seconds to days, hours, minutes) | ||||
| def convertTime(duration): | ||||
|     days, seconds = duration.days, duration.seconds | ||||
|     hours = days * 24 + seconds // 3600 | ||||
|     minutes = (seconds % 3600) // 60 | ||||
|     seconds = (seconds % 60) | ||||
|     return hours, minutes, seconds | ||||
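
convertTime above flattens a timedelta into total hours (with days folded in) plus the remaining minutes and seconds, presumably for the log output. A minimal usage sketch; the import path assumes the function sits in funs/TimeSlice.py next to get_Tslices:

    from datetime import timedelta
    from funs.TimeSlice import convertTime  # assumed location

    duration = timedelta(days=1, hours=2, minutes=30, seconds=15)
    hours, minutes, seconds = convertTime(duration)
    print(f'{hours}h {minutes}m {seconds}s')  # -> 26h 30m 15s
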
Michael Beck