Add senator data scraper

This commit is contained in:
Michael Beck
2023-06-23 23:53:31 +02:00
parent 90d5501ec8
commit 71e10a62d3
2 changed files with 218 additions and 1 deletions

View File

@ -63,4 +63,55 @@ def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix,
# save short csv
tweet_df.to_csv(csv_path, encoding='utf-8')
# sleep 1 second to not get blocked because of excessive requests
time.sleep(0.5)
time.sleep(0.5)
def getHandles(di):
    """Grab senator account handles from senators-raw.csv.

    Args:
        di (str): path prefix of the directory containing
            senators-raw.csv (must include the trailing separator,
            e.g. "data/").

    Returns:
        list: str account handles — main accounts first, followed by
        all non-empty alternate accounts.
    """
    # Read the file once and reuse the DataFrame for both columns
    # (the original parsed the same CSV twice).
    df = pd.read_csv(f"{di}senators-raw.csv")
    accounts = df["twitter_handle"].tolist()
    # dropna() removes senators without an alternate account; unlike the
    # old str(x) != 'nan' filter it cannot drop a literal "nan" handle.
    alt_accounts = df["alt_handle"].dropna().tolist()
    accounts.extend(alt_accounts)
    return accounts
def printHandles(accounts):
    """Format the account list as readable text.

    Args:
        accounts (list): list of str twitter handles.

    Returns:
        str: text block (5 handles per line) suitable for writing to a
        txt file, ending with the total number of accounts.
    """
    txt = ["Accounts to be scraped:\n"]
    for i, acc in enumerate(accounts):
        # Twitter handles are at most 15 chars; center in 17 for alignment.
        txt.append(f"{acc:^17}")
        # Start a new line after every 5th handle.
        if i % 5 == 4:
            txt.append(" \n")
    # Use len(accounts), not the last loop index: the original printed
    # one less than the true total and raised NameError on an empty list.
    txt.append(f"\n{len(accounts)} accounts in total.")
    return ''.join(txt)
def scrapeUsers(handle, userDFColumns, maxTweets=1):
    """Fetch user-profile fields for one account via snscrape.

    Iterates over the account's recent tweets (up to index maxTweets)
    and extracts the requested attributes from the last tweet's user
    object.

    Args:
        handle (str): twitter handle without the leading '@'.
        userDFColumns (list): attribute names to read from tweet.user;
            dotted paths (e.g. 'profile.url') are supported.
        maxTweets (int): tweet index after which iteration stops.

    Returns:
        list: attribute values in userDFColumns order, or an empty list
        if the account yielded no tweets.
    """
    currentTime = datetime.now()
    userList = []
    print(f'{currentTime:<30} Fetching: {handle:>15}')
    query = f'from:{handle}'
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        if i > maxTweets:
            break
        # Rebuild the list each iteration so only the most recently
        # seen tweet's user data survives the loop.
        userList = []
        for col in userDFColumns:
            # Walk the dotted attribute path with getattr instead of
            # the original eval(f'tweet.user.{col}'), which executed
            # arbitrary code taken from the column specification.
            value = tweet.user
            for attr in col.split('.'):
                value = getattr(value, attr)
            userList.append(value)
    return userList