Compare commits

...

43 Commits

Author SHA1 Message Date
Michael Beck
89b4755c65 adds link to full package to readme 2023-08-31 01:23:38 +02:00
Michael Beck
01e58b1b99 adds html files to gitignore 2023-08-31 01:21:31 +02:00
Michael Beck
d0fcefedf4 deletes data/OUT/profiles/CovTweets.html 2023-08-31 01:20:39 +02:00
Michael Beck
71cf907249 deletes data/OUT/profiles/AllTweets.html 2023-08-31 01:20:31 +02:00
Michael Beck
a9018fedee REALLY corrects the filetree 2023-08-30 21:54:13 +02:00
Michael Beck
d94a93295f corrects filetree 2023-08-30 21:53:05 +02:00
Michael Beck
80b63b39df adds readme 2023-08-30 21:45:38 +02:00
Michael Beck
d8136909c8 corrects import of own functions that didn't work anymore because of a newer python version. 2023-08-30 21:45:27 +02:00
Michael Beck
1c6d9d5415 cleans and renames files 2023-08-30 21:18:55 +02:00
Michael Beck
4e08cde317 finishes classification scripts 2023-08-16 10:06:16 +02:00
Michael Beck
2535683cdc finishes classification scripts 2023-08-15 14:51:28 +02:00
Michael Beck
8f744a08be adds final counter keywords 2023-08-15 14:30:40 +02:00
Michael Beck
df5fd51a5f repairs stupid 2023-08-15 14:30:13 +02:00
Michael Beck
3d4f559d2d adds model training stats 2023-08-15 14:29:42 +02:00
Michael Beck
2e067b6a64 adds both classification scripts. Corrects inclusion of CleanTweets functions. 2023-08-15 14:23:56 +02:00
Michael Beck
7a16526a97 adds dataset profiles 2023-08-15 14:20:13 +02:00
Michael Beck
b89b5969ec adds type-error controls 2023-08-15 14:19:33 +02:00
Michael Beck
7c6b618272 adds both training scripts and evaluation files of topic classification 2023-08-15 14:19:08 +02:00
Michael Beck
90aa58239c adds generation of model-training dataset 2023-08-14 15:37:30 +02:00
Michael Beck
1beff96ae9 adds model training code 2023-08-14 15:37:05 +02:00
Michael Beck
881d3d6d6d adds tweet-text-cleaning functions 2023-08-14 15:36:46 +02:00
Michael Beck
5a63c478e9 adds dataset profiler 2023-08-08 15:32:12 +02:00
Michael Beck
ed61d52182 adds files to gitignore 2023-08-08 00:07:42 +02:00
Michael Beck
a26d150060 renames pretest classification file 2023-08-08 00:06:18 +02:00
Michael Beck
d791e4a293 adds classification file. adds removal of empty tweets after transformation for classification preparation 2023-08-08 00:04:14 +02:00
Michael Beck
d57b7a31b7 adds more counter keywords 2023-08-08 00:03:30 +02:00
Michael Beck
13d80124d3 adds lines with counterKeywords to remove non-covid tweets 2023-08-07 23:45:11 +02:00
Michael Beck
3de6d8f3ec adds tweetLen column, converts keywords to lowercase and removes certain keywords 2023-08-07 23:07:29 +02:00
Michael Beck
899a99ba72 adds CleanTweets functions, creates Graphs 2023-07-07 18:18:51 +02:00
Michael Beck
817ec48478 corrects a lot of mistakes.
adds keywords
adds analyze.py
adds pretest
adds pretest ids
2023-07-07 00:16:44 +02:00
Michael Beck
c64904a64d adds cleanTweets.py 2023-06-26 23:51:32 +02:00
Michael Beck
82830f13e2 updates "README.md" 2023-06-26 13:12:16 +02:00
Michael Beck
8c8a191952 adds "README.md" 2023-06-26 13:12:04 +02:00
Michael Beck
71e10a62d3 adds senator data scraper 2023-06-23 23:53:31 +02:00
Michael Beck
90d5501ec8 adds comment 2023-06-23 23:53:01 +02:00
Michael Beck
340cca017c corrects comments 2023-06-23 20:59:14 +02:00
Michael Beck
791cebc297 adds log folder 2023-06-23 20:49:35 +02:00
Michael Beck
6241484e83 adds gitkeep 2023-06-23 20:47:32 +02:00
Michael Beck
d73da8db98 Merge remote-tracking branch 'origin/master' 2023-06-23 20:42:58 +02:00
Michael Beck
6220c1841d deletes "collect.ipynb" 2023-06-23 20:41:56 +02:00
Michael Beck
27746cd886 changes folder structure of in- and output files 2023-06-23 20:39:40 +02:00
Michael Beck
02c3d055bd adds comments. changes logfile format to .log 2023-06-23 20:34:46 +02:00
Michael Beck
dc2e17cc2f adds docstrings to functions. adds several comments. 2023-06-23 20:26:16 +02:00
36 changed files with 3046 additions and 1203 deletions

4
.gitignore vendored
View File

@@ -1,7 +1,9 @@
**/log*
**/*.log
**/*lock*
**/*-slice*.csv
**/*.zip
**/*.html
**/*.htm
/ALL-SENATORS-LONG.csv
/ALL-SENATORS.csv
/collect2.py

1
.vscode/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
/settings.json

123
ClassificationFake.py Normal file
View File

@@ -0,0 +1,123 @@
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
#%%
# prepare & define paths
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "Tweets-Classified-Topic-Results.csv"
# Name of Classify datafile
senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"
# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets
#%%
# get dataframe
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
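# the topic labels written by ClassificationTopic.py are inverted (see the note at the end of that script); swap them back before filtering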
def encode_labels(label):
    if label == 'True':
        return 'False'
    elif label == 'False':
        return 'True'
    return 0
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
dfClassify.to_csv("/home/michael/Documents/PS/Data/collectTweets/data/OUT/Tweets-Classified-Topic-Results.csv", encoding='utf-8')
dfClassify = dfClassify[dfClassify['output_label_topicCov']=='True']
# dataframe from csv
dfClassify['fake'] = False
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
#%%
# remove empty rows
dfClassify.cleanContent.replace('',np.nan,inplace=True)
dfClassify.dropna(subset=['cleanContent'], inplace=True)
#%%
timeStart = datetime.now() # start counting execution time
max_length = 128
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
# %%
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
# %%
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
# print(out)
#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    output_labels.append(out['label'])
    output_score.append(out['score'])
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
# Exactly the same output as before, but the content are passed
# as batches to the model
# %%
dfClassify['output_label_fake'] = output_labels
dfClassify['output_score_fake'] = output_score
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
timePerTweet = timeTotal / len(dfClassify)  # average time per classified tweet
print(f"Total classification execution time: {timeTotal}")
print(f"Time per tweet classification: {timePerTweet}")
# %%
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
# %%

123
ClassificationTopic.py Normal file
View File

@@ -0,0 +1,123 @@
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
#%%
# prepare & define paths
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv"
# Name of Classify datafile
senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv"
# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets
#%%
# get dataframe
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
# dataframe from csv
dfClassify['fake'] = False
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
#%%
# remove empty rows
dfClassify.cleanContent.replace('',np.nan,inplace=True)
dfClassify.dropna(subset=['cleanContent'], inplace=True)
#%%
timeStart = datetime.now() # start counting execution time
max_length = 128
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
# %%
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
# %%
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
# print(out)
#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    output_labels.append(out['label'])
    output_score.append(out['score'])
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
# Exactly the same output as before, but the content are passed
# as batches to the model
# %%
dfClassify['output_label_topicCov'] = output_labels
dfClassify['output_score_topicCov'] = output_score
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
timePerTweet = timeTotal / len(dfClassify)  # average time per classified tweet
print(f"Total classification execution time: {timeTotal}")
print(f"Time per tweet classification: {timePerTweet}")
# %%
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
# %%
## corrections
def encode_labels(label):
    if label == 'real':
        return 'True'
    elif label == 'fake':
        return 'False'
    return 0
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
# labels are still inverted at this point; corrected in ClassificationFake.py

131
README.md Normal file
View File

@@ -0,0 +1,131 @@
# Requirements
- python 3.10+
- snscrape 0.6.2.20230321+ (see git repo in this folder)
- transformers 4.31.0
- numpy 1.23.5
- pandas 2.0.3
- scikit-learn 1.3.0
- torch 2.0.1
# About
This collection of scripts scrapes tweets posted by US senators between 2020-01-01T00:00:00Z and 2023-01-03T00:00:00Z, scrapes the senators' account data, prepares the tweets for NLP-model training, and trains two models that (1) classify a tweet's topic as covid or non-covid and (2) classify tweets as either "fake news" or "non-fake news".
Training only works with a prepared dataset in which the tweets are pre-classified.
More information can be found in the comments of the individual scripts.
Due to time constraints, most of the code is procedural and ugly, but effective.
# How to
Tested on Ubuntu 22.04.
If needed, the virtual environment can be exported and sent to you.
All files in the folder data/IN have to exist in order to execute the scripts.
Execute in the following order (a minimal driver sketch follows the list):
01 collect.py (see the comments in the script for further info on scraping)
02 collectSenData.py
03 cleanTweets.py
04 preTestClassification.py
05 trainTopic.py
06 trainFake.py
07 ClassificationTopic.py
08 ClassificationFake.py
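The same order as a minimal driver sketch (an illustration only, not part of the repository: it assumes all scripts sit in the repository root and the virtual environment is active):
```
import subprocess
import sys

# Run the pipeline steps in order; stop on the first failing script.
STEPS = [
    "collect.py",
    "collectSenData.py",
    "cleanTweets.py",
    "preTestClassification.py",
    "trainTopic.py",
    "trainFake.py",
    "ClassificationTopic.py",
    "ClassificationFake.py",
]

for script in STEPS:
    print(f"running {script}")
    subprocess.run([sys.executable, script], check=True)
```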
# Files & Folders
Datafiles are not included in the repository but can be found in the full package that can be downloaded from [here](https://ncloud.mischbeck.de/s/T4QcMDSfYSkadYC) (password protected).
```
├── data
│   ├── IN
│   │   ├── counterKeywordsFinal.txt
│   │   ├── counterKeywords.txt
│   │   ├── keywords-raw.txt
│   │   ├── keywords.txt
│   │   ├── own_keywords.txt
│   │   ├── pretest-tweets_fake.txt contains tweet ids for pretest
│   │   ├── pretest-tweets_not_fake.txt contains tweet ids for pretest
│   │   └── senators-raw.csv senator datafile
│   ├── OUT
│   │   ├── ALL-SENATORS-TWEETS.csv
│   │   ├── graphs
│   │   │   ├── Timeline.png
│   │   │   ├── Wordcloud-All.png
│   │   │   └── Wordcloud-Cov.png
│   │   ├── Pretest-Prep.csv
│   │   ├── Pretest-Results.csv
│   │   ├── Pretest-SENATORS-TWEETS.csv
│   │   ├── profiles dataset profiles
│   │   │   ├── AllTweets.html
│   │   │   └── CovTweets.html
│   │   ├── SenatorsTweets-Final.csv
│   │   ├── SenatorsTweets-OnlyCov.csv
│   │   ├── SenatorsTweets-train-CovClassification.csv
│   │   ├── SenatorsTweets-train-CovClassificationTRAIN.csv
│   │   ├── SenatorsTweets-train-CovClassification.tsv
│   │   ├── SenatorsTweets-train-FakeClassification.csv
│   │   ├── SenatorsTweets-train-FakeClassificationTRAIN.csv
│   │   ├── SenatorsTweets-train-FakeClassification.tsv
│   │   ├── SenatorsTweets-Training.csv
│   │   ├── SenatorsTweets-Training_WORKING-COPY.csv
│   │   ├── topClass-PRETEST-Prep.csv
│   │   ├── topClass-PRETEST-Results.csv
│   │   ├── Tweets-All-slices.zip
│   │   ├── Tweets-Classified-Fake-Prep.csv
│   │   ├── Tweets-Classified-Fake-Results.csv
│   │   ├── Tweets-Classified-Prep.csv
│   │   ├── Tweets-Classified-Topic-Prep.csv
│   │   ├── Tweets-Classified-Topic-Results.csv
│   │   └── Tweets-Stub.csv
├── funs
│   ├── CleanTweets.py multiple functions to clean tweet contents for NLP processing
│   ├── ClearDupes.py function for deletion of duplicate keywords
│   ├── __init__.py
│   ├── Scrape.py scraper functions to be used for multiprocessing
│   └── TimeSlice.py time slice script to slice the time span in 24 slices, speeds up scraping through multiprocessing
├── log logs of the scraping process
│   ├── log_2023-06-23_21-06-10_err.log
│   ├── log_2023-06-23_21-06-10.log
│   └── log_2023-06-23_21-06-10_missing.log
├── models
│   ├── CovClass Covid tweet classification model
│   │   └── 2023-08-15_05-56-50
│   │       ├── 2023-08-15_05-56-50.csv training output
│   │       ├── config.json
│   │       ├── pytorch_model.bin
│   │       ├── special_tokens_map.json
│   │       ├── tokenizer_config.json
│   │       ├── tokenizer.json
│   │       └── vocab.txt
│   └── FakeClass Fake tweet classification model
│       └── 2023-08-15_14-35-43
│           ├── 2023-08-15_14-35-43.csv training output
│           ├── config.json
│           ├── pytorch_model.bin
│           ├── special_tokens_map.json
│           ├── tokenizer_config.json
│           ├── tokenizer.json
│           └── vocab.txt
├── snscrape contains snscrape 0.6.2.20230321+ git repo
├── ClassificationFake.py classifies tweets as fake or non-fake, saves:
│ Tweets-Classified-Fake-Prep.csv - prepared classification input
│ Tweets-Classified-Fake-Results.csv - Tweets-Classified-Topic-Results.csv with fake classification results
├── ClassificationTopic.py classifies tweet topic, saves:
│ Tweets-Classified-Topic-Prep.csv - prepared classification input
│ Tweets-Classified-Topic-Results.csv - SenatorsTweets-OnlyCov.csv with topic classification results
├── cleanTweets.py Curates keywordlists
│ Merges senator and tweet datasets
│ Creates multiple datasets:
│ SenatorsTweets-Final.csv - all tweets with keyword columns
│ SenatorsTweets-OnlyCov.csv - only covid tweets, filtered by keywordlist
│ SenatorsTweets-Training.csv - training dataset, containing ~1800 randomly selected tweets from SenatorsTweets-OnlyCov.csv
├── collect.py scrapes tweets, saves to ALL-SENATORS-TWEETS.csv
├── collectSenData.py scrapes senator account data, saves to ALL-SENATORS.csv
├── createGraphs.py creates wordcloud & timeline graphs
├── preTestClassification.py pretest script that uses bvrau/covid-twitter-bert-v2-struth to analyze 100 preclassified tweets
├── profiler.py creates dataset profiles
├── README.md readme
├── trainFake.py training script for the fake tweet classification model
└── trainTopic.py training script for the tweet topic classification model
```

233
cleanTweets.py Normal file
View File

@@ -0,0 +1,233 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 26 20:36:43 2023
@author: michael
"""
import pandas as pd
# import pyreadstat
import numpy as np
import sys
# Seed for training dataset generation
seed = 86431891
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"
# Name of the senator datafile (input)
senDataset = "senators-raw.csv"
# Name of new datafile generated
senCSVc = "SenatorsTweets-Final"
senCSVcCov = "SenatorsTweets-OnlyCov"
senCSVcTrain = "SenatorsTweets-Training"
# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc + ".csv"
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
senCSVcTrainPath = wd + ud + senCSVcTrain + ".csv"
senSAVcPath = wd + ud + senCSV + ".sav"
senDTAcPath = wd + ud + senCSV + ".dta"
senDatasetPath = wd + di + senDataset
df = pd.read_csv(senCSVPath, dtype=(object))
## Import own functions
funs = wd+"funs"
sys.path.insert(1, funs)
from ClearDupes import deDupe
mixed_columns = df.columns[df.nunique() != len(df)]
print(mixed_columns)
df = df.drop(columns=['user.url', 'cashtags', 'coordinates', 'hashtags', 'Unnamed: 0', 'user.verified', 'lang', 'renderedContent', 'retweetedTweet', 'sourceLabel', 'sourceUrl', 'source'], index=1)
del df[df.columns[0]] # remove first col
df['user.created'] = pd.to_datetime(df['user.created'])
df['date'] = pd.to_datetime(df['date'])
#%%
# sort and generate id
df = df.sort_values(by='date').reset_index() # sort df by date before generating id
df["tid"] = df.index + 1 # create id column
#%%
# move id column to front
cols = list(df.columns.values) # Make a list of all of the columns in the df
cols.pop(cols.index('tid')) # Remove id from list
#cols.pop(cols.index('user')) # Remove id from list
df = df[['tid']+cols] # Create new dataframe with ordered colums
#%%
###################
# Keywords
# read additional keywords from a file and write to list.
keywords = []
# Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
deDupe(f"{di}keywords-raw.txt", f"{di}keywords.txt")
# Read the keywords from a file
with open(f"{di}own_keywords.txt", "r") as file:
lines = file.readlines()
for line in lines:
keyword = line.strip() # Remove the newline character
keywords.append(keyword)
# read the raw keyword list and append its entries as well
with open(f"{di}keywords-raw.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip()  # Remove the newline character
        keywords.append(keyword)
# delete keywords ppe and china that lead to too many false positives
removeWords = {'ppe', 'china'}
keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
keywords = [item for item in keywords if item not in removeWords ] # removes words
with open(f"{di}keywords.txt", "w") as file:
print("read keyword files")
for line in keywords:
file.write(f'{line}\n')
# counter keywords
# Read the keywords from a file
counterKeywords = []
with open(f"{di}counterKeywords.txt", "r") as file:
lines = file.readlines()
for line in lines:
counterKeyword = line.strip() # Remove the newline character
counterKeywords.append(counterKeyword)
counterKeywords = set([x.lower() for x in counterKeywords]) # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
with open(f"{di}counterKeywordsFinal.txt", "w") as file:
print("read keyword files")
for line in counterKeywords:
file.write(f'{line}\n')
#%%
# overwrite keyword column
df['keywords'] = np.nan
df['keywords'] = (
df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
)
df['counterKeywords'] = np.nan
df['counterKeywords'] = (
df['rawContent'].str.lower().str.findall('|'.join(counterKeywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
)
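# note: the joined pattern is a plain regex alternation without word boundaries, so e.g. 'corona' also matches inside 'coronavirus'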
#%%
# create boolean contains_keyword column
df['contains_keyword'] = True
df['contains_counterKeyword'] = True
mask = (df['keywords'].isna())  # select rows without any keyword match
df.loc[mask,'contains_keyword'] = False  # mark those rows as not containing a keyword
mask = (df['counterKeywords'].isna())  # select rows without any counter-keyword match
df.loc[mask,'contains_counterKeyword'] = False  # mark those rows as not containing a counter-keyword
#%%
pd.Series(df["user.id"]).is_unique
#%%
# Merge Datasets
# get senator data
cols = [
"name",
"id",
"state_short",
"party",
"class",
"ideology",
"start_serving",
"end_serving",
"time_in_office",
"not_in_office",
"last_congress",
"vote_share",
"next_closest_share",
"election_year",
"twitter_handle",
"alt_handle",
"date_of_birth",
"female",
"ethnicity",
"edu_level",
"edu_information",
"occup_level"]
dfSenA = pd.read_csv(senDatasetPath, index_col=False, sep = ",", usecols=cols).reset_index()
dfSenB = pd.read_csv(senDatasetPath, index_col=False, sep = ",", usecols=cols).reset_index()
dfSenA['alt'] = False
dfSenB['alt'] = True
dfSenA = dfSenA.rename(columns={'twitter_handle': 'user.username'})
dfSenB = dfSenB.rename(columns={'alt_handle': 'user.username'})
dfSenB = dfSenB.dropna(axis=0, subset=['user.username'])
dfSenA['user.username'] = dfSenA['user.username'].apply(str.lower)
dfSenB['user.username'] = dfSenB['user.username'].apply(str.lower)
df['user.username'] = df['user.username'].apply(str.lower)
dfSenAll = pd.concat([dfSenA, dfSenB]).reset_index()
# %%
# see if all senators are present in file
dfAll = df.merge(dfSenAll, how='left',on='user.username')
#check merge
unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique()
print(unique_usernames)
# senatorisakson was dropped, is ok
#%%
# create covidtweets csv
dfCov = dfAll[dfAll['contains_counterKeyword']==False]
dfCov = dfCov[dfCov['contains_keyword']==True]
dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
#%%
# create column with tweet length
dfCov['tweetLen'] = dfCov['rawContent'].str.len().copy()
# reset df index and write to id column
dfCov.reset_index(drop=True, inplace=True)
#%%
# Export to csv, sav and dta
dfAll.to_csv(senCSVcPath, encoding='utf-8')
dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id')
# pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb
# =============================================================================
# dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
# dfAllStata = dfAll.rename(columns={'class':'class_'})
# dfAllStata.to_stata(senDTAcPath, version=119, convert_strl=['alt'], convert_dates={'date': 'td', 'user_created': 'td'})
# print(dfAllStata.columns)
# =============================================================================
# df.id.str.len().value_counts()
# =============================================================================
# %%
# Create training dataset: draw 1800 random covid tweets
np.random.seed(seed)
dfTrain = dfCov.loc[np.random.choice(dfCov.index, 1800, replace=False)]
dfTrain = dfTrain[['tid', 'date', 'rawContent']]
dfTrain['topicCovid'] = True
dfTrain['fake'] = False
dfTrain.to_csv(senCSVcTrainPath, encoding='utf-8')

960
collect.ipynb
View File

@@ -1,960 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "15573d92-f6a7-49d4-9c01-fff33d23be8e",
"metadata": {},
"source": [
"# Tweet Collecting\n",
"## Requirements\n",
"- tweepy-4.14.0\n",
"- pandas-2.0\n",
"- numpy-1.24.3\n",
"\n",
"## Preparations & Config\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "3290c840-961c-4e2c-a107-4ccd541d151b",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"import tweepy\n",
"import pandas as pd\n",
"import numpy as np\n",
"import glob\n",
"import time\n",
"\n",
"# Define time period of interest\n",
"time_slices = [\n",
" {\n",
" \"start_time\": \"2020-01-01T00:00:00Z\",\n",
" \"end_time\": \"2020-06-01T00:00:00Z\",\n",
" \"suffix\": \"-slice1\"\n",
" },\n",
" {\n",
" \"start_time\": \"2020-06-01T00:00:01Z\",\n",
" \"end_time\": \"2021-01-01T00:00:00Z\",\n",
" \"suffix\": \"-slice2\"\n",
" },\n",
" {\n",
" \"start_time\": \"2021-01-01T00:00:01Z\",\n",
" \"end_time\": \"2021-06-01T00:00:00Z\",\n",
" \"suffix\": \"-slice3\"\n",
" },\n",
" {\n",
" \"start_time\": \"2021-06-01T00:00:01Z\",\n",
" \"end_time\": \"2023-01-03T00:00:00Z\",\n",
" \"suffix\": \"-slice4\"\n",
" }\n",
"]\n",
"\n",
"tweet_fields = [\n",
"\t\"id\",\n",
"\t\"text\",\n",
"\t\"attachments\",\n",
"\t\"author_id\",\n",
"\t\"context_annotations\",\n",
"\t\"conversation_id\",\n",
"\t\"created_at\",\n",
"\t\"entities\",\n",
"\t\"geo\",\n",
"\t\"lang\",\n",
"\t\"possibly_sensitive\",\n",
"\t\"public_metrics\",\n",
"\t\"referenced_tweets\",\n",
"\t\"reply_settings\",\n",
"\t\"source\",\n",
"\t\"withheld\",\n",
"\t]\n",
"\n",
"## Setup directories\n",
"# WD Michael\n",
"# wd = \"/home/michael/Documents/PS/Data/collectTweets/\"\n",
"\n",
"# WD Server\n",
"wd = \"/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection\"\n",
"\n",
"# WD Josie\n",
"# wd = \"/home/michael/Documents/PS/Data/\"\n",
"\n",
"# WD Sam\n",
"# wd = \"/home/michael/Documents/PS/Data/\"\n",
"\n",
"# Tweet-datafile directory\n",
"td = \"data/tweets/\""
]
},
{
"cell_type": "markdown",
"id": "6782290c-7e14-4393-8caa-c78a2b326d85",
"metadata": {},
"source": [
"# Authenticate to Twitter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7ac9b603-e638-4ebb-95df-e0f8678f298e",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"## Setup Api-connection\n",
"bearer_token = \"AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc\"\n",
"client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)"
]
},
{
"cell_type": "markdown",
"id": "e81c4d49-242c-4b51-8e2a-e2bbfdae6877",
"metadata": {},
"source": [
"## Import Keywords\n",
"Keywords from:\n",
"* Chen, E., Lerman, K., & Ferrara, E. (2020). Tracking Social Media Discourse About the COVID-19 Pandemic: Development of a Public Coronavirus Twitter Data Set. JMIR Public Health and Surveillance, 6(2), e19273. https://doi.org/10.2196/19273\n",
"Line 80 and following:\n",
"* Lamsal, R. (2020). Coronavirus (COVID-19) Tweets Dataset [Data set]. IEEE. https://ieee-dataport.org/open-access/coronavirus-covid-19-tweets-dataset"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1d4af102-30ae-4c73-ae9c-333efb34e3f1",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"['Coronavirus',\n",
" 'Koronavirus',\n",
" 'Corona',\n",
" 'CDC',\n",
" 'Wuhancoronavirus',\n",
" 'Wuhanlockdown',\n",
" 'Ncov',\n",
" 'Wuhan',\n",
" 'N95',\n",
" 'Kungflu',\n",
" 'Epidemic',\n",
" 'outbreak',\n",
" 'Sinophobia',\n",
" 'China',\n",
" 'covid-19',\n",
" 'corona virus',\n",
" 'covid',\n",
" 'covid19',\n",
" 'sars-cov-2',\n",
" 'COVIDー19',\n",
" 'COVD',\n",
" 'pandemic',\n",
" 'coronapocalypse',\n",
" 'canceleverything',\n",
" 'Coronials',\n",
" 'SocialDistancingNow',\n",
" 'Social Distancing',\n",
" 'SocialDistancing',\n",
" 'panicbuy',\n",
" 'panic buy',\n",
" 'panicbuying',\n",
" 'panic buying',\n",
" '14DayQuarantine',\n",
" 'DuringMy14DayQuarantine',\n",
" 'panic shop',\n",
" 'panic shopping',\n",
" 'panicshop',\n",
" 'InMyQuarantineSurvivalKit',\n",
" 'panic-buy',\n",
" 'panic-shop',\n",
" 'coronakindness',\n",
" 'quarantinelife',\n",
" 'chinese virus',\n",
" 'chinesevirus',\n",
" 'stayhomechallenge',\n",
" 'stay home challenge',\n",
" 'sflockdown',\n",
" 'DontBeASpreader',\n",
" 'lockdown',\n",
" 'lock down',\n",
" 'shelteringinplace',\n",
" 'sheltering in place',\n",
" 'staysafestayhome',\n",
" 'stay safe stay home',\n",
" 'trumppandemic',\n",
" 'trump pandemic',\n",
" 'flattenthecurve',\n",
" 'flatten the curve',\n",
" 'china virus',\n",
" 'chinavirus',\n",
" 'quarentinelife',\n",
" 'PPEshortage',\n",
" 'saferathome',\n",
" 'stayathome',\n",
" 'stay at home',\n",
" 'stay home',\n",
" 'stayhome',\n",
" 'GetMePPE',\n",
" 'covidiot',\n",
" 'epitwitter',\n",
" 'pandemie',\n",
" 'wear a mask',\n",
" 'wearamask',\n",
" 'kung flu',\n",
" 'covididiot',\n",
" 'COVID__19',\n",
" 'omicron',\n",
" 'variant',\n",
" 'vaccine',\n",
" 'travel ban',\n",
" 'corona',\n",
" 'corona',\n",
" 'coronavirus',\n",
" 'coronavirus',\n",
" 'covid',\n",
" 'covid',\n",
" 'covid19',\n",
" 'covid19',\n",
" 'covid-19',\n",
" 'covid-19',\n",
" 'sarscov2',\n",
" 'sarscov2',\n",
" 'sars cov2',\n",
" 'sars cov 2',\n",
" 'covid_19',\n",
" 'covid_19',\n",
" 'ncov',\n",
" 'ncov',\n",
" 'ncov2019',\n",
" 'ncov2019',\n",
" '2019-ncov',\n",
" '2019-ncov',\n",
" 'pandemic',\n",
" 'pandemic 2019ncov',\n",
" '2019ncov',\n",
" 'quarantine',\n",
" 'quarantine',\n",
" 'flatten the curve',\n",
" 'flattening the curve',\n",
" 'flatteningthecurve',\n",
" 'flattenthecurve',\n",
" 'hand sanitizer',\n",
" 'handsanitizer',\n",
" 'lockdown',\n",
" 'lockdown',\n",
" 'social distancing',\n",
" 'socialdistancing',\n",
" 'work from home',\n",
" 'workfromhome',\n",
" 'working from home',\n",
" 'workingfromhome',\n",
" 'ppe',\n",
" 'n95',\n",
" 'ppe',\n",
" 'n95',\n",
" 'covidiots',\n",
" 'covidiots',\n",
" 'herd immunity',\n",
" 'herdimmunity',\n",
" 'pneumonia',\n",
" 'pneumonia',\n",
" 'chinese virus',\n",
" 'chinesevirus',\n",
" 'wuhan virus',\n",
" 'wuhanvirus',\n",
" 'kung flu',\n",
" 'kungflu',\n",
" 'wearamask',\n",
" 'wearamask',\n",
" 'wear a mask',\n",
" 'vaccine',\n",
" 'vaccines',\n",
" 'vaccine',\n",
" 'vaccines',\n",
" 'corona vaccine',\n",
" 'corona vaccines',\n",
" 'coronavaccine',\n",
" 'coronavaccines',\n",
" 'face shield',\n",
" 'faceshield',\n",
" 'face shields',\n",
" 'faceshields',\n",
" 'health worker',\n",
" 'healthworker',\n",
" 'health workers',\n",
" 'healthworkers',\n",
" 'stayhomestaysafe',\n",
" 'coronaupdate',\n",
" 'frontlineheroes',\n",
" 'coronawarriors',\n",
" 'homeschool',\n",
" 'homeschooling',\n",
" 'hometasking',\n",
" 'masks4all',\n",
" 'wfh',\n",
" 'wash ur hands',\n",
" 'wash your hands',\n",
" 'washurhands',\n",
" 'washyourhands',\n",
" 'stayathome',\n",
" 'stayhome',\n",
" 'selfisolating',\n",
" 'self isolating']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"keywords = []\n",
"\n",
"# Read the keywords from a file\n",
"with open(\"data/keywords.txt\", \"r\") as file:\n",
" lines = file.readlines()\n",
" for line in lines:\n",
" keyword = line.strip() # Remove the newline character\n",
" keywords.append(keyword)\n",
"\n",
"keywords"
]
},
{
"cell_type": "markdown",
"id": "9f190608-c0a2-4e7e-9560-a03a57aa4132",
"metadata": {},
"source": [
"## Import Accounts"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a5bde33c-cc69-43ad-9b0c-4b04ce7f8a3c",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['SenAlexander', 'SenatorEnzi', 'CoryGardner', 'VP', 'SenatorIsakson', 'DougJones', 'KLoeffler', 'MarthaMcSallyAZ', 'DavidPerdueGA', 'SenPatRoberts', 'SenatorTomUdall', 'SenatorBaldwin', 'SenJohnBarrasso', 'SenatorBennet', 'MarshaBlackburn', 'SenBlumenthal', 'RoyBlunt', 'senbooker', 'JohnBoozman', 'SenatorBraun', 'SenSherrodBrown', 'SenatorBurr', 'SenatorCantwell', 'SenCapito', 'SenatorCardin', 'SenatorCarper', 'SenBobCasey', 'SenBillCassidy', 'SenatorCollins', 'ChrisCoons', 'JohnCornyn', 'SenCortezMasto', 'SenTomCotton', 'SenKevinCramer', 'MikeCrapo', 'SenTedCruz', 'SteveDaines', 'SenDuckworth', 'SenatorDurbin', 'SenJoniErnst', 'SenFettermanPA', 'SenFeinstein', 'SenatorFischer', 'SenGillibrand', 'LindseyGrahamSC', 'ChuckGrassley', 'SenatorHagerty', 'SenatorHassan', 'HawleyMO', 'MartinHeinrich', 'SenatorHick', 'maziehirono', 'SenJohnHoeven', 'SenHydeSmith', 'JimInhofe', 'SenRonJohnson', 'timkaine', 'SenMarkKelly', 'SenJohnKennedy', 'SenAngusKing', 'SenAmyKlobuchar', 'SenatorLankford', 'SenatorLeahy', 'SenMikeLee', 'SenatorLujan', 'SenLummis', 'Sen_JoeManchin', 'SenMarkey', 'SenatorMarshall', 'LeaderMcConnell', 'SenatorMenendez', 'SenJeffMerkley', 'JerryMoran', 'lisamurkowski', 'ChrisMurphyCT', 'PattyMurray', 'SenOssoff', 'SenAlexPadilla', 'senrandpaul', 'SenGaryPeters', 'senrobportman', 'SenJackReed', 'SenatorRisch', 'SenatorRomney', 'SenJackyRosen', 'SenatorRounds', 'senmarcorubio', 'SenSanders', 'sensasse', 'brianschatz', 'SenSchumer', 'SenRickScott', 'SenatorTimScott', 'SenatorShaheen', 'SenShelby', 'SenatorSinema', 'SenTinaSmith', 'SenStabenow', 'SenDanSullivan', 'SenatorTester', 'SenJohnThune', 'SenThomTillis', 'SenToomey', 'SenTuberville', 'ChrisVanHollen', 'MarkWarner', 'SenatorWarnock', 'ewarren', 'SenWhitehouse', 'SenatorWicker', 'RonWyden', 'SenToddYoung']\n",
"['LamarAlexander ', nan, 'corygardner', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]\n"
]
}
],
"source": [
"# Get accounts & alt-accounts from Senators-Datafile\n",
"accounts = pd.read_csv(\"data/senators-raw.csv\")[\"twitter_handle\"].tolist()\n",
"alt_accounts = pd.read_csv(\"data/senators-raw.csv\")[\"alt_handle\"].tolist()\n",
"print(accounts)\n",
"print(alt_accounts)"
]
},
{
"cell_type": "markdown",
"id": "befc0fad-c803-4145-a041-570d6f894178",
"metadata": {},
"source": [
"## Collect Tweets\n",
"Loops over accounts:\n",
"* Collects Tweets of account. \n",
"* Then extracts columns public_metrics (likes aso) and referenced_tweets (indicates, whether tweet is a reply).\n",
"* Checks if tweet-text contains any of the keywords, if so, inserts the keyword(s) in a new column.\n",
"* Saves tweets of the account in a csv file \"HANDLE.csv\" and \"HANDLE-LONG.csv\" (LONG contains all given information such as annotations, that we might or might not need)\n",
"\n",
"### Problem:\n",
"_I limited the results to 20 tweets per senator._\n",
"Twitter has the following API Limit for the [search_all_tweets](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all) method I used: \n",
"* App rate limit (Application-only): 300 requests per 15-minute window shared among all users of your app\n",
"* App rate limit (Application-only): 1 per second shared among all users of your app\n",
"\n",
"With a limit of 300, I request 20 posts per slice, just to get a better understanding of what's happening. After trying different things out, I think that the time-slices won't be needed if we get around the problem I'm having right now:\n",
"as soon, as the rate limit is reached, tweepy stops and waits for the time to run out and start again. BUT it doesn't retry the request but starts with the next request. \n",
"I haven't found anything and my only idea to solve the problem was to generate a list of failed attempts (via try and except) and after getting all tweets letting tweepy work over that list again. \n",
"One more thing I don't understand is that, when fetching the tweets I already sent to you, I didn't have as many problems as now and the limit exceeded after 3-4 senators, even though I used a higher `max_result` and a higher `flatten value`.\n",
"\n",
"I hope that the following output speaks for itself:\n",
"```\n",
"trying to fetch tweets for SenAlexander-slice1\n",
"trying to fetch tweets for SenAlexander-slice2\n",
"trying to fetch tweets for SenAlexander-slice3\n",
"trying to fetch tweets for SenAlexander-slice4\n",
"trying to fetch tweets for SenatorEnzi-slice1\n",
"trying to fetch tweets for SenatorEnzi-slice2\n",
"trying to fetch tweets for SenatorEnzi-slice3\n",
"return empty in SenatorEnzi-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z\n",
"trying to fetch tweets for SenatorEnzi-slice4\n",
"\n",
"Rate limit exceeded. Sleeping for 893 seconds.\n",
"```\n",
"\n",
"Tweepy returned no tweets because of the exceeded tweet limit, then the script tried to fetch more tweets and the error message came up.\n",
"Before changing the code below, see the other version i wrote just below the next cell (and ignore the error message below the cell as i just interrupted the execution which lead to the error message)."
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "0f842b8a-846a-4f38-8231-c1e9ccfbddf5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"trying to fetch tweets for SenAlexander-slice1\n",
"trying to fetch tweets for SenAlexander-slice2\n",
"trying to fetch tweets for SenAlexander-slice3\n",
"trying to fetch tweets for SenAlexander-slice4\n",
"trying to fetch tweets for SenatorEnzi-slice1\n",
"trying to fetch tweets for SenatorEnzi-slice2\n",
"trying to fetch tweets for SenatorEnzi-slice3\n",
"return empty in SenatorEnzi-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z\n",
"<generator object Paginator.flatten at 0x7f20ebf137b0>\n",
"trying to fetch tweets for SenatorEnzi-slice4\n",
"trying to fetch tweets for CoryGardner-slice1\n",
"trying to fetch tweets for CoryGardner-slice2\n",
"trying to fetch tweets for CoryGardner-slice3\n",
"return empty in CoryGardner-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z\n",
"<generator object Paginator.flatten at 0x7f20ebf13740>\n",
"trying to fetch tweets for CoryGardner-slice4\n",
"trying to fetch tweets for VP-slice1\n",
"trying to fetch tweets for VP-slice2\n",
"trying to fetch tweets for VP-slice3\n",
"trying to fetch tweets for VP-slice4\n",
"trying to fetch tweets for SenatorIsakson-slice1\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[28], line 30\u001b[0m\n\u001b[1;32m 22\u001b[0m tweets \u001b[38;5;241m=\u001b[39m tweepy\u001b[38;5;241m.\u001b[39mPaginator(client\u001b[38;5;241m.\u001b[39msearch_all_tweets,\n\u001b[1;32m 23\u001b[0m query\u001b[38;5;241m=\u001b[39mquery,\n\u001b[1;32m 24\u001b[0m tweet_fields\u001b[38;5;241m=\u001b[39mtweet_fields,\n\u001b[1;32m 25\u001b[0m start_time\u001b[38;5;241m=\u001b[39mstart_time,\n\u001b[1;32m 26\u001b[0m end_time\u001b[38;5;241m=\u001b[39mend_time,\n\u001b[1;32m 27\u001b[0m max_results\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m20\u001b[39m)\u001b[38;5;241m.\u001b[39mflatten(\u001b[38;5;241m20\u001b[39m)\n\u001b[1;32m 29\u001b[0m \u001b[38;5;66;03m# for each tweet returned...\u001b[39;00m\n\u001b[0;32m---> 30\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m tweet \u001b[38;5;129;01min\u001b[39;00m tweets:\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m# ... add that tweet to tweetlist\u001b[39;00m\n\u001b[1;32m 32\u001b[0m tweetlist\u001b[38;5;241m.\u001b[39mappend(tweet)\n\u001b[1;32m 34\u001b[0m \u001b[38;5;66;03m# Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration\u001b[39;00m\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/pagination.py:67\u001b[0m, in \u001b[0;36mPaginator.flatten\u001b[0;34m(self, limit)\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 66\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m---> 67\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m response \u001b[38;5;129;01min\u001b[39;00m PaginationIterator(\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmethod, \u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkwargs\n\u001b[1;32m 69\u001b[0m ):\n\u001b[1;32m 70\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(response, Response):\n\u001b[1;32m 71\u001b[0m response_data \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mdata \u001b[38;5;129;01mor\u001b[39;00m []\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/pagination.py:126\u001b[0m, in \u001b[0;36mPaginationIterator.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpagination_token\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m pagination_token\n\u001b[0;32m--> 126\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 128\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(response, Response):\n\u001b[1;32m 129\u001b[0m meta \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mmeta\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/client.py:1163\u001b[0m, in \u001b[0;36mClient.search_all_tweets\u001b[0;34m(self, query, **params)\u001b[0m\n\u001b[1;32m 1071\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"search_all_tweets( \\\u001b[39;00m\n\u001b[1;32m 1072\u001b[0m \u001b[38;5;124;03m query, *, end_time=None, expansions=None, max_results=None, \\\u001b[39;00m\n\u001b[1;32m 1073\u001b[0m \u001b[38;5;124;03m media_fields=None, next_token=None, place_fields=None, \\\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1160\u001b[0m \u001b[38;5;124;03m.. _pagination: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/paginate\u001b[39;00m\n\u001b[1;32m 1161\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1162\u001b[0m params[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mquery\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m query\n\u001b[0;32m-> 1163\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1164\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mGET\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/2/tweets/search/all\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1165\u001b[0m \u001b[43m \u001b[49m\u001b[43mendpoint_parameters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1166\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mend_time\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexpansions\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmax_results\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmedia.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1167\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnext_token\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mplace.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpoll.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mquery\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1168\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msince_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msort_order\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstart_time\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtweet.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1169\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43muntil_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43muser.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 1170\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mTweet\u001b[49m\n\u001b[1;32m 1171\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/client.py:129\u001b[0m, in \u001b[0;36mBaseClient._make_request\u001b[0;34m(self, method, route, params, endpoint_parameters, json, data_type, user_auth)\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_make_request\u001b[39m(\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28mself\u001b[39m, method, route, params\u001b[38;5;241m=\u001b[39m{}, endpoint_parameters\u001b[38;5;241m=\u001b[39m(), json\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 125\u001b[0m data_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, user_auth\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 126\u001b[0m ):\n\u001b[1;32m 127\u001b[0m request_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_process_params(params, endpoint_parameters)\n\u001b[0;32m--> 129\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mroute\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 130\u001b[0m \u001b[43m \u001b[49m\u001b[43mjson\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjson\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muser_auth\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_auth\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 132\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_type \u001b[38;5;129;01mis\u001b[39;00m requests\u001b[38;5;241m.\u001b[39mResponse:\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m response\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/client.py:84\u001b[0m, in \u001b[0;36mBaseClient.request\u001b[0;34m(self, method, route, params, json, user_auth)\u001b[0m\n\u001b[1;32m 75\u001b[0m headers[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAuthorization\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBearer \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbearer_token\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 77\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\n\u001b[1;32m 78\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMaking API request: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmethod\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhost\u001b[38;5;250m \u001b[39m\u001b[38;5;241m+\u001b[39m\u001b[38;5;250m \u001b[39mroute\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 79\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mParameters: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mparams\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHeaders: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mheaders\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 81\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBody: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mjson\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 82\u001b[0m )\n\u001b[0;32m---> 84\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 85\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhost\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mroute\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjson\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjson\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 86\u001b[0m \u001b[43m \u001b[49m\u001b[43mauth\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mauth\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m response:\n\u001b[1;32m 88\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\n\u001b[1;32m 89\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mReceived API response: \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mreason\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 91\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHeaders: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mheaders\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mContent: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mcontent\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 93\u001b[0m )\n\u001b[1;32m 95\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m400\u001b[39m:\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/requests/sessions.py:589\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 584\u001b[0m send_kwargs \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 585\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimeout\u001b[39m\u001b[38;5;124m\"\u001b[39m: timeout,\n\u001b[1;32m 586\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallow_redirects\u001b[39m\u001b[38;5;124m\"\u001b[39m: allow_redirects,\n\u001b[1;32m 587\u001b[0m }\n\u001b[1;32m 588\u001b[0m send_kwargs\u001b[38;5;241m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 589\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprep\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43msend_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/requests/sessions.py:703\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 700\u001b[0m start \u001b[38;5;241m=\u001b[39m preferred_clock()\n\u001b[1;32m 702\u001b[0m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[0;32m--> 703\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43madapter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 705\u001b[0m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[1;32m 706\u001b[0m elapsed \u001b[38;5;241m=\u001b[39m preferred_clock() \u001b[38;5;241m-\u001b[39m start\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/requests/adapters.py:486\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 483\u001b[0m timeout \u001b[38;5;241m=\u001b[39m TimeoutSauce(connect\u001b[38;5;241m=\u001b[39mtimeout, read\u001b[38;5;241m=\u001b[39mtimeout)\n\u001b[1;32m 485\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 486\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 487\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 488\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 489\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 491\u001b[0m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 492\u001b[0m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 493\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 494\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 495\u001b[0m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_retries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 496\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 497\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 498\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (ProtocolError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m 501\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m(err, request\u001b[38;5;241m=\u001b[39mrequest)\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/urllib3/connectionpool.py:790\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)\u001b[0m\n\u001b[1;32m 787\u001b[0m response_conn \u001b[38;5;241m=\u001b[39m conn \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m release_conn \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 789\u001b[0m \u001b[38;5;66;03m# Make the request on the HTTPConnection object\u001b[39;00m\n\u001b[0;32m--> 790\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 791\u001b[0m \u001b[43m \u001b[49m\u001b[43mconn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 792\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 793\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 794\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 795\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 796\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 797\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 798\u001b[0m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mretries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 799\u001b[0m \u001b[43m \u001b[49m\u001b[43mresponse_conn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresponse_conn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 800\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpreload_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 801\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 802\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mresponse_kw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 803\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 805\u001b[0m \u001b[38;5;66;03m# Everything went great!\u001b[39;00m\n\u001b[1;32m 806\u001b[0m clean_exit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/urllib3/connectionpool.py:536\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[0;34m(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length)\u001b[0m\n\u001b[1;32m 534\u001b[0m \u001b[38;5;66;03m# Receive the response from the server\u001b[39;00m\n\u001b[1;32m 535\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 536\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 537\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (BaseSSLError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 538\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_timeout(err\u001b[38;5;241m=\u001b[39me, url\u001b[38;5;241m=\u001b[39murl, timeout_value\u001b[38;5;241m=\u001b[39mread_timeout)\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/urllib3/connection.py:454\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mresponse\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HTTPResponse\n\u001b[1;32m 453\u001b[0m \u001b[38;5;66;03m# Get the response from http.client.HTTPConnection\u001b[39;00m\n\u001b[0;32m--> 454\u001b[0m httplib_response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 456\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 457\u001b[0m assert_header_parsing(httplib_response\u001b[38;5;241m.\u001b[39mmsg)\n",
"File \u001b[0;32m/usr/lib/python3.9/http/client.py:1347\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1345\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1346\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1347\u001b[0m \u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbegin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1348\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m:\n\u001b[1;32m 1349\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n",
"File \u001b[0;32m/usr/lib/python3.9/http/client.py:307\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[38;5;66;03m# read until we get a non-100 response\u001b[39;00m\n\u001b[1;32m 306\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 307\u001b[0m version, status, reason \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_read_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 308\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status \u001b[38;5;241m!=\u001b[39m CONTINUE:\n\u001b[1;32m 309\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n",
"File \u001b[0;32m/usr/lib/python3.9/http/client.py:268\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 267\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_read_status\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 268\u001b[0m line \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_MAXLINE\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miso-8859-1\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 269\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(line) \u001b[38;5;241m>\u001b[39m _MAXLINE:\n\u001b[1;32m 270\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m LineTooLong(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstatus line\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"File \u001b[0;32m/usr/lib/python3.9/socket.py:704\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 703\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 704\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 705\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[1;32m 706\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_timeout_occurred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
"File \u001b[0;32m/usr/lib/python3.9/ssl.py:1241\u001b[0m, in \u001b[0;36mSSLSocket.recv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m 1237\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m flags \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 1238\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1239\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m\n\u001b[1;32m 1240\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m)\n\u001b[0;32m-> 1241\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnbytes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1242\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1243\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mrecv_into(buffer, nbytes, flags)\n",
"File \u001b[0;32m/usr/lib/python3.9/ssl.py:1099\u001b[0m, in \u001b[0;36mSSLSocket.read\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m 1097\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1098\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m buffer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1099\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sslobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1100\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sslobj\u001b[38;5;241m.\u001b[39mread(\u001b[38;5;28mlen\u001b[39m)\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"# Iterate over each Twitter account\n",
"for handle in accounts:\n",
" for slice_data in time_slices:\n",
" # sleep 1 second to not get over 1sec api limit\n",
" time.sleep(1) \n",
" # define slice data variables from time_slices\n",
" start_time = slice_data['start_time']\n",
" end_time = slice_data['end_time']\n",
" suffix = slice_data['suffix']\n",
" \n",
" # define tweepy query with twitter handle of current sen\n",
" query = f'from:{handle} -is:retweet'\n",
" \n",
" # create empty tweetlist that will be filled with tweets of current sen\n",
" tweetlist = []\n",
" \n",
" # statusmsg\n",
" msg = f'trying to fetch tweets for {handle}{suffix}'\n",
" print(msg)\n",
" \n",
" # Fetch tweets using tweepy Twitter API v2 pagination\n",
" tweets = tweepy.Paginator(client.search_all_tweets,\n",
" query=query,\n",
" tweet_fields=tweet_fields,\n",
" start_time=start_time,\n",
" end_time=end_time,\n",
" max_results=20).flatten(20)\n",
" \n",
" # for each tweet returned...\n",
" for tweet in tweets:\n",
" # ... add that tweet to tweetlist\n",
" tweetlist.append(tweet)\n",
" \n",
" # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration\n",
" if len(tweetlist) == 0:\n",
" msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'\n",
" print(msg)\n",
" print(tweets)\n",
" continue\n",
" \n",
" # convert to dataframe\n",
" tweet_df = pd.DataFrame(tweetlist)\n",
" \n",
" # add handle column as api only provides user-ids\n",
" tweet_df['handle'] = handle\n",
" \n",
" ## Extract referenced_tweet info from column\n",
" tweet_df['referenced_tweet_type'] = None\n",
" tweet_df['referenced_tweet_id'] = None\n",
" \n",
" # if cond. because in some cases column doesn't exist\n",
" if 'referenced_tweets' in tweet_df.columns:\n",
" for index, row in tweet_df.iterrows():\n",
" referenced_tweets = row['referenced_tweets']\n",
" \n",
" if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:\n",
" referenced_tweet = referenced_tweets[0]\n",
" referenced_tweet_type = referenced_tweet['type']\n",
" referenced_tweet_id = referenced_tweet['id']\n",
" \n",
" tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type\n",
" tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id\n",
" \n",
" ## Check if tweet-text contains keyword\n",
" # if cond. because in some cases column doesn't exist\n",
" if 'text' in tweet_df.columns:\n",
" tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))\n",
" .str.join(',')\n",
" .replace('', 'none'))\n",
" \n",
" ## Save two versions of the dataset, one with all fields and one without dict fields\n",
" # define filepaths\n",
" csv_path = f'data/tweets/{handle}{suffix}.csv'\n",
" csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'\n",
" # save LONG csv\n",
" tweet_df.to_csv(csv_path2)\n",
" # Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files\n",
" # if cond. because in some cases column doesn't exist\n",
" if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):\n",
" tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)\n",
" # save short csv\n",
" tweet_df.to_csv(csv_path)"
]
},
{
"cell_type": "markdown",
"id": "cb779d9a-cecb-475c-9e76-22c9b8c1928d",
"metadata": {},
"source": [
"## Alternative way to fetch tweets via tweepy with retry mechanism"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "c3b4a2ba-46e2-478b-9558-7d6999fdcd69",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"trying to fetch tweets for SenAlexander-slice1\n",
"trying to fetch tweets for SenAlexander-slice2\n",
"trying to fetch tweets for SenAlexander-slice3\n",
"trying to fetch tweets for SenAlexander-slice4\n",
"trying to fetch tweets for SenatorEnzi-slice1\n",
"trying to fetch tweets for SenatorEnzi-slice2\n",
"trying to fetch tweets for SenatorEnzi-slice3\n",
"return empty in SenatorEnzi-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z\n",
"trying to fetch tweets for SenatorEnzi-slice4\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Rate limit exceeded. Sleeping for 437 seconds.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"trying to fetch tweets for CoryGardner-slice1\n",
"trying to fetch tweets for CoryGardner-slice2\n",
"trying to fetch tweets for CoryGardner-slice3\n",
"return empty in CoryGardner-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z\n",
"trying to fetch tweets for CoryGardner-slice4\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Rate limit exceeded. Sleeping for 897 seconds.\n"
]
},
{
"ename": "AttributeError",
"evalue": "module 'tweepy' has no attribute 'TweepError'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[21], line 33\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;66;03m# for each tweet returned...\u001b[39;00m\n\u001b[0;32m---> 33\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m tweet \u001b[38;5;129;01min\u001b[39;00m tweets:\n\u001b[1;32m 34\u001b[0m \u001b[38;5;66;03m# ... add that tweet to tweetlist\u001b[39;00m\n\u001b[1;32m 35\u001b[0m tweetlist\u001b[38;5;241m.\u001b[39mappend(tweet)\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/pagination.py:67\u001b[0m, in \u001b[0;36mPaginator.flatten\u001b[0;34m(self, limit)\u001b[0m\n\u001b[1;32m 66\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m---> 67\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m response \u001b[38;5;129;01min\u001b[39;00m PaginationIterator(\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmethod, \u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkwargs\n\u001b[1;32m 69\u001b[0m ):\n\u001b[1;32m 70\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(response, Response):\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/pagination.py:126\u001b[0m, in \u001b[0;36mPaginationIterator.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpagination_token\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m pagination_token\n\u001b[0;32m--> 126\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 128\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(response, Response):\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/client.py:1163\u001b[0m, in \u001b[0;36mClient.search_all_tweets\u001b[0;34m(self, query, **params)\u001b[0m\n\u001b[1;32m 1162\u001b[0m params[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mquery\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m query\n\u001b[0;32m-> 1163\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1164\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mGET\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/2/tweets/search/all\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1165\u001b[0m \u001b[43m \u001b[49m\u001b[43mendpoint_parameters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1166\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mend_time\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexpansions\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmax_results\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmedia.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1167\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnext_token\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mplace.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpoll.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mquery\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1168\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msince_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msort_order\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstart_time\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtweet.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1169\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43muntil_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43muser.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 1170\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mdata_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mTweet\u001b[49m\n\u001b[1;32m 1171\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/client.py:129\u001b[0m, in \u001b[0;36mBaseClient._make_request\u001b[0;34m(self, method, route, params, endpoint_parameters, json, data_type, user_auth)\u001b[0m\n\u001b[1;32m 127\u001b[0m request_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_process_params(params, endpoint_parameters)\n\u001b[0;32m--> 129\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mroute\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 130\u001b[0m \u001b[43m \u001b[49m\u001b[43mjson\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjson\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muser_auth\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_auth\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 132\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_type \u001b[38;5;129;01mis\u001b[39;00m requests\u001b[38;5;241m.\u001b[39mResponse:\n",
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/client.py:112\u001b[0m, in \u001b[0;36mBaseClient.request\u001b[0;34m(self, method, route, params, json, user_auth)\u001b[0m\n\u001b[1;32m 108\u001b[0m log\u001b[38;5;241m.\u001b[39mwarning(\n\u001b[1;32m 109\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRate limit exceeded. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSleeping for \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msleep_time\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m seconds.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m )\n\u001b[0;32m--> 112\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[43msleep_time\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest(method, route, params, json, user_auth)\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: ",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[21], line 39\u001b[0m\n\u001b[1;32m 35\u001b[0m tweetlist\u001b[38;5;241m.\u001b[39mappend(tweet)\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m \u001b[38;5;66;03m# exit the retry loop if tweets are successfully fetched\u001b[39;00m\n\u001b[0;32m---> 39\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[43mtweepy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mTweepError\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 40\u001b[0m \u001b[38;5;66;03m# handle rate limit exceeded error\u001b[39;00m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m e\u001b[38;5;241m.\u001b[39mresponse\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m429\u001b[39m:\n\u001b[1;32m 42\u001b[0m \u001b[38;5;66;03m# get the rate limit reset time from the response headers\u001b[39;00m\n\u001b[1;32m 43\u001b[0m reset_time \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(e\u001b[38;5;241m.\u001b[39mresponse\u001b[38;5;241m.\u001b[39mheaders[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx-rate-limit-reset\u001b[39m\u001b[38;5;124m'\u001b[39m])\n",
"\u001b[0;31mAttributeError\u001b[0m: module 'tweepy' has no attribute 'TweepError'"
]
}
],
"source": [
"# Iterate over each Twitter account\n",
"for handle in accounts:\n",
" for slice_data in time_slices:\n",
" # define slice data variables from time_slices\n",
" start_time = slice_data['start_time']\n",
" end_time = slice_data['end_time']\n",
" suffix = slice_data['suffix']\n",
" \n",
" # define tweepy query with twitter handle of current sen\n",
" query = f'from:{handle} -is:retweet'\n",
" \n",
" # create empty tweetlist that will be filled with tweets of current sen\n",
" tweetlist = []\n",
" \n",
" # statusmsg\n",
" msg = f'trying to fetch tweets for {handle}{suffix}'\n",
" print(msg)\n",
" \n",
" # Fetch tweets using tweepy Twitter API v2 pagination with retry mechanism\n",
" max_attempts = 3 # maximum number of attempts to fetch tweets for a slice\n",
" attempt = 1\n",
" \n",
" while attempt <= max_attempts:\n",
" try:\n",
" tweets = tweepy.Paginator(client.search_all_tweets,\n",
" query=query,\n",
" tweet_fields=tweet_fields,\n",
" start_time=start_time,\n",
" end_time=end_time,\n",
" max_results=20).flatten(20)\n",
" \n",
" # for each tweet returned...\n",
" for tweet in tweets:\n",
" # ... add that tweet to tweetlist\n",
" tweetlist.append(tweet)\n",
" \n",
" break # exit the retry loop if tweets are successfully fetched\n",
" \n",
" except tweepy.TweepError as e:\n",
" # handle rate limit exceeded error\n",
" if e.response.status_code == 429:\n",
" # get the rate limit reset time from the response headers\n",
" reset_time = int(e.response.headers['x-rate-limit-reset'])\n",
" current_time = int(time.time())\n",
" \n",
" # calculate the sleep time until the rate limit resets\n",
" sleep_time = reset_time - current_time + 1 # add an extra second\n",
" \n",
" # sleep until the rate limit resets\n",
" time.sleep(sleep_time)\n",
" \n",
" attempt += 1 # increment the attempt counter\n",
" continue # retry the API call\n",
" \n",
" else:\n",
" # handle other types of Tweepy errors\n",
" print(f'Error occurred: {e}')\n",
" break\n",
" \n",
" # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration\n",
" if len(tweetlist) == 0:\n",
" msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'\n",
" print(msg)\n",
" continue\n",
" \n",
" # convert to dataframe\n",
" tweet_df = pd.DataFrame(tweetlist)\n",
" \n",
" # add handle column as api only provides user-ids\n",
" tweet_df['handle'] = handle\n",
" \n",
" ## Extract referenced_tweet info from column\n",
" tweet_df['referenced_tweet_type'] = None\n",
" tweet_df['referenced_tweet_id'] = None\n",
" \n",
" # if cond. because in some cases column doesn't exist\n",
" if 'referenced_tweets' in tweet_df.columns:\n",
" for index, row in tweet_df.iterrows():\n",
" referenced_tweets = row['referenced_tweets']\n",
" \n",
" if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:\n",
" referenced_tweet = referenced_tweets[0]\n",
" referenced_tweet_type = referenced_tweet['type']\n",
" referenced_tweet_id = referenced_tweet['id']\n",
" \n",
" tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type\n",
" tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id\n",
" \n",
" ## Check if tweet-text contains keyword\n",
" # if cond. because in some cases column doesn't exist\n",
" if 'text' in tweet_df.columns:\n",
" tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))\n",
" .str.join(',')\n",
" .replace('', 'none'))\n",
" \n",
" ## Save two versions of the dataset, one with all fields and one without dict fields\n",
" # define filepaths\n",
" csv_path = f'data/tweets/{handle}{suffix}.csv'\n",
" csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'\n",
" # save LONG csv\n",
" tweet_df.to_csv(csv_path2)\n",
" # Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files\n",
" # if cond. because in some cases column doesn't exist\n",
" if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):\n",
" tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)\n",
" # save short csv\n",
" tweet_df.to_csv(csv_path)\n",
" \n",
" # sleep 1 second to not exceed the API rate limit\n",
" time.sleep(1)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5dd5498-1ba4-4f0a-9bb9-ffce4655212d",
"metadata": {},
"outputs": [],
"source": [
"path_to_tweetdfs = wd + td\n",
"os.chdir(path_to_tweetdfs)\n",
"tweetfiles = glob.glob('*.{}'.format(\"csv\"))\n",
"\n",
"print(tweetfiles)\n",
"\n",
"# save merged csv as two files \n",
"df_all_senators = pd.DataFrame()\n",
"df_all_senators_long = pd.DataFrame()\n",
"for file in tweetfiles:\n",
"\tif \"LONG\" in file:\n",
"\t\tdf = pd.read_csv(file)\n",
"\t\tdf_all_senators_long = pd.concat([df, df_all_senators_long])\n",
"\telse:\n",
"\t\tdf = pd.read_csv(file)\n",
"\t\tdf_all_senators = pd.concat([df, df_all_senators])\n",
"csv_path = td + \"ALL-SENATORS.csv\"\n",
"csv_path2 = td + \"ALL-SENATORS-LONG-LONG.csv\"\n",
"df_all_senators.to_csv(csv_path) \n",
"df_all_senators_long.to_csv(csv_path2)\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "320ebbf4-8eaf-4189-836b-5d5aa8a0a263",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"trying to fetch tweets for SenAlexander-slice1\n",
"trying to fetch tweets for SenAlexander-slice2\n",
"trying to fetch tweets for SenAlexander-slice3\n",
"trying to fetch tweets for SenAlexander-slice4\n",
"trying to fetch tweets for SenatorEnzi-slice1\n",
"trying to fetch tweets for SenatorEnzi-slice2\n",
"trying to fetch tweets for SenatorEnzi-slice3\n",
"return empty in SenatorEnzi-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z\n",
"trying to fetch tweets for SenatorEnzi-slice4\n",
"trying to fetch tweets for CoryGardner-slice1\n",
"trying to fetch tweets for CoryGardner-slice2\n",
"trying to fetch tweets for CoryGardner-slice3\n",
"return empty in CoryGardner-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z\n",
"trying to fetch tweets for CoryGardner-slice4\n",
"trying to fetch tweets for VP-slice1\n",
"trying to fetch tweets for VP-slice2\n",
"trying to fetch tweets for VP-slice3\n",
"trying to fetch tweets for VP-slice4\n",
"trying to fetch tweets for SenatorIsakson-slice1\n",
"trying to fetch tweets for SenatorIsakson-slice2\n",
"trying to fetch tweets for SenatorIsakson-slice3\n",
"trying to fetch tweets for SenatorIsakson-slice4\n",
"trying to fetch tweets for DougJones-slice1\n",
"trying to fetch tweets for DougJones-slice2\n",
"trying to fetch tweets for DougJones-slice3\n",
"trying to fetch tweets for DougJones-slice4\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[24], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m handle \u001b[38;5;129;01min\u001b[39;00m accounts:\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m slice_data \u001b[38;5;129;01min\u001b[39;00m time_slices:\n\u001b[0;32m----> 4\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m1.01\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# define slice data variables from time_slices\u001b[39;00m\n\u001b[1;32m 6\u001b[0m start_time \u001b[38;5;241m=\u001b[39m slice_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstart_time\u001b[39m\u001b[38;5;124m'\u001b[39m]\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"# Iterate over each Twitter account\n",
"for handle in accounts:\n",
" for slice_data in time_slices:\n",
" time.sleep(1.01)\n",
" # define slice data variables from time_slices\n",
" start_time = slice_data['start_time']\n",
" end_time = slice_data['end_time']\n",
" suffix = slice_data['suffix']\n",
" \n",
" # define tweepy query with twitter handle of current sen\n",
" query = f'from:{handle} -is:retweet'\n",
" \n",
" # create empty tweetlist that will be filled with tweets of current sen\n",
" tweetlist = []\n",
" \n",
" # statusmsg\n",
" msg = f'trying to fetch tweets for {handle}{suffix}'\n",
" print(msg)\n",
" \n",
" # Fetch tweets using tweepy Twitter API v2 pagination with retry mechanism\n",
" max_attempts = 3 # maximum number of attempts to fetch tweets for a slice\n",
" attempt = 1\n",
" \n",
" while attempt <= max_attempts:\n",
" try:\n",
" tweets = tweepy.Paginator(client.search_all_tweets,\n",
" query=query,\n",
" tweet_fields=tweet_fields,\n",
" start_time=start_time,\n",
" end_time=end_time,\n",
" max_results=20).flatten(20)\n",
" \n",
" # for each tweet returned...\n",
" for tweet in tweets:\n",
" # ... add that tweet to tweetlist\n",
" tweetlist.append(tweet)\n",
" \n",
" # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration\n",
" if len(tweetlist) == 0:\n",
" msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'\n",
" print(msg)\n",
" break\n",
" \n",
" # convert to dataframe\n",
" tweet_df = pd.DataFrame(tweetlist)\n",
" \n",
" # add handle column as API only provides user-ids\n",
" tweet_df['handle'] = handle\n",
" \n",
" ## Extract referenced_tweet info from column\n",
" tweet_df['referenced_tweet_type'] = None\n",
" tweet_df['referenced_tweet_id'] = None\n",
" \n",
" # if cond. because in some cases column doesn't exist\n",
" if 'referenced_tweets' in tweet_df.columns:\n",
" for index, row in tweet_df.iterrows():\n",
" referenced_tweets = row['referenced_tweets']\n",
" \n",
" if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:\n",
" referenced_tweet = referenced_tweets[0]\n",
" referenced_tweet_type = referenced_tweet['type']\n",
" referenced_tweet_id = referenced_tweet['id']\n",
" \n",
" tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type\n",
" tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id\n",
" \n",
" ## Check if tweet-text contains keyword\n",
" # if cond. because in some cases column doesn't exist\n",
" if 'text' in tweet_df.columns:\n",
" tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))\n",
" .str.join(',')\n",
" .replace('', 'none'))\n",
" \n",
" ## Save two versions of the dataset, one with all fields and one without dict fields\n",
" # define filepaths\n",
" csv_path = f'data/tweets/{handle}{suffix}.csv'\n",
" csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'\n",
" # save LONG csv\n",
" tweet_df.to_csv(csv_path2)\n",
" # Remove 'context_annotations', 'entities', and 'referenced_tweets' columns for short csv files\n",
" # if cond. because in some cases column doesn't exist\n",
" if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):\n",
" tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)\n",
" # save short csv\n",
" tweet_df.to_csv(csv_path)\n",
" \n",
" # break out of the retry loop since fetching tweets was successful\n",
" break\n",
" \n",
" except tweepy.TweepError as e:\n",
" if e.response.status_code == 429: # rate limit exceeded\n",
" reset_time = int(e.response.headers['x-rate-limit-reset'])\n",
" wait_time = reset_time - time.time() + 5 # add additional 5 seconds as buffer\n",
" \n",
" print(f\"Rate limit exceeded. Sleeping for {wait_time} seconds.\")\n",
" time.sleep(wait_time)\n",
" \n",
" attempt += 1 # increment the attempt counter\n",
" else:\n",
" print(f\"Error occurred: {e}\")\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "48688858-104d-4f2f-87b8-ed103f34b4e8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Politics & Society",
"language": "python",
"name": "polsoc"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
},
"toc-autonumbering": true,
"toc-showmarkdowntxt": false
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -4,9 +4,12 @@ Created on Thu Jun 8 01:08:21 2023
@author: Michael
Following files are necessary:
config.py
Used to configure everything that's needed for this script.
collect.py scrapes tweets from senators of the us that were in office between
2020 and the beginning of 2023.
# https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
# Following files are necessary:
funs/TimeSlice.py
Function get_Tslices slices the defined timespan in config.py into N
slices. Is necessary due to possible blocking of requests by twitter.
@ -15,26 +18,34 @@ Following files are necessary:
Function deDupe reads each line of inFile and removes duplicate lines.
A file outFile is saved without the duplicate lines. Generates
"keywords.txt".
data/keywords-raw.txt
funs/Scrape.py
scrapes using snscrape.modules.twitter. See docstring.
data/IN/keywords-raw.txt
Contains all keywords that are used to detect whether a tweet contains
information about Covid19.
data/senators-raw.csv
data/IN/senators-raw.csv
Contains the senator dataset converted to csv. Is used to get the
account-names of all senators twitter accounts.
Requirements:
# Requirements:
- snscrape 0.6.2.20230321+
- pandas 2.0+
The script will first import needed libraries.
# IMPORTANT:
This script uses snscrape Version 0.6.2.20230321.dev50+g0d824ab which is
included in 'snscrape/' as a git repository for better reproducibility. Earlier
versions of snscrape will most likely fail to scrape all tweets because of
certain rate limits or other errors that may occur.
config.py will check whether snscrape is already installed. If not, it will try
to install the included version automatically.
Install snscrape from the local git repo to make sure that it fits the used version.
If snscrape shall be installed from the local repo, uncomment the following lines:
How to use:
- To run the script, first adjust the config.py file.
import subprocess
os.chdir('snscrape/')
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)
# How to use:
- To run the script, first adjust the options found in the following lines.
- config.py will check whether snscrape is already installed. If not, it will try
to install the included version automatically.
- run the script
@ -55,26 +66,34 @@ which is the final output.
import os
import pandas as pd
import glob
import time
import sys
from datetime import datetime
import concurrent.futures
## Setup directories
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
td = "data/tweets/"
td = "data/OUT/"
# Name of file that all tweets will be written to
file_alltweets = "ALL-SENATORS-TWEETS.csv"
# don't change this one
path_to_tweetdfs = wd + td
## Define Timespan
# Name of logfile
logfile = f"{wd}log/log_"
###################
# Define Timespan & time-format
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = "2020-01-01T00:00:00Z" # start of scraping
ts_end = "2023-01-03T00:00:00Z" # end of straping
@ -86,19 +105,9 @@ fTimeFormat = "%Y-%m-%d_%H-%M-%S"
# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000
# Name of logfile
logfile = wd+"log/log_"
## Install snscrape from local git repo to make shure that it fits the used version.
# If snscrape is already installed, uncomment the following lines:
"""
import subprocess
os.chdir('snscrape/')
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)
"""
# Columns for tweet dataframe
# Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
# sub-parameters can be accessed with dot notation, e.g. the user id is available as user.id
tweetDFColumns = [
"id",
"user.id",
@ -135,18 +144,28 @@ tweetDFColumns = [
"source",
]
## Import other files
from funs.TimeSlice import *
from funs.ClearDupes import deDupe
from funs.Scrape import scrapeTweets
#############################################################################
################## do NOT change anything below this line ###################
#############################################################################
# create logfile & log all outputs
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
## Import own functions
funs = wd+"funs"
sys.path.insert(1, funs)
from TimeSlice import get_Tslices
from ClearDupes import deDupe
from Scrape import scrapeTweets
###################
# Create logfile & log all outputs
# there are three logfile types to be found in /log.
# should be self explanatory.
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".log"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".log"
sys.stderr = open(logfileErrors, "w")
sys.stdout = open(logfilen, "w")
## Create List of time-period-slices
###################
# Create List of time-period-slices
time_slices = get_Tslices(ts_beg, ts_end, no_slices)
# Print slices
print("Time-period-slices:")
@ -154,22 +173,25 @@ for slice in time_slices:
print(slice["suffix"] + ": " + slice["beg_time"] + " - " + slice["end_time"])
print("---")
## Keywords
###################
# Keywords
# read keywords from a file and write to list.
keywords = []
# Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
deDupe("data/keywords-raw.txt", "data/keywords.txt")
deDupe(f"{di}keywords-raw.txt", f"{di}keywords.txt")
# Read the keywords from a file
with open("data/keywords.txt", "r") as file:
with open(f"{di}keywords.txt", "r") as file:
lines = file.readlines()
for line in lines:
keyword = line.strip() # Remove the newline character
keywords.append(keyword)
print("---")
## Senator Accounts
###################
# Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
accounts = pd.read_csv(f"{di}senators-raw.csv")["twitter_handle"].tolist()
alt_accounts = pd.read_csv(f"{di}senators-raw.csv")["alt_handle"].tolist()
alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields
accounts.extend(alt_accounts)
@ -181,52 +203,61 @@ for i, acc in enumerate(accounts): # print 5 accounts per line
print("\n")
print(f"\n{i} accounts in total.\n---")
## Scraping
###################
# Scraping
# report time:
timeStartScrape = datetime.now()
print("Starting scraping at:")
print(timeStartScrape.strftime(fTimeFormat))
print("---")
# Iterate over each Twitter account using multiprocessing
# with concurrent.futures.ProcessPoolExecutor() as executor:
# # List to store the scraping tasks
# tasks = []
# for handle in accounts:
# # Iterate over each time slice
# for slice_data in time_slices:
# # ... Code to prepare the slice_data ...
# # Schedule the scraping task
# task = executor.submit(
# scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns
# )
# # Store the handle and slice_data as attributes of the task
# # Wait for all tasks to complete
# concurrent.futures.wait(tasks)
with concurrent.futures.ProcessPoolExecutor() as executor:
# List to store the scraping tasks
tasks = []
for handle in accounts:
# Iterate over each time slice
for slice_data in time_slices:
# ... Code to prepare the slice_data ...
# Schedule the scraping task
task = executor.submit(
scrapeTweets, handle, keywords, td, tweetDFColumns, slice_data['beg_time'], slice_data['end_time'], slice_data['suffix']
)
# Store the handle and slice_data as attributes of the task
# Wait for all tasks to complete
concurrent.futures.wait(tasks)
# report time:
timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime(fTimeFormat))
## Merge CSV-Files to file_alltweets.
# fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
os.chdir(path_to_tweetdfs)
# At first check, whether all slices are present.
tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv")
###################
# Merge CSV-Files to file_alltweets.
# fastest way is to save the slices separately and then add every file to the
# output instead of using pandas or anything else.
os.chdir(path_to_tweetdfs) # change dir to use glob to get list of csv-files in dir
## At first check, whether all slices are present.
tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv") ???
# Create list of all files that should be in the folder:
AllFilesList = []
for handle in accounts:
for tslice in time_slices:
suffix = tslice['suffix']
AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.txt", "w") as fout:
AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
# report missing files to "log_*_missing.txt"
with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.log", "w") as fout:
for file in AllFilesList:
if file not in tweetfiles:
fout.write(f'Missing: {file}.\n') # if file is not in tweetfiles, print error message.
else:
fout.write('all slices scraped.')
fout.write(f'{file:<30}:all slices scraped.\n')
# check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
## Merge .csv files.
# check if file_alltweets (previously scraped tweets that have been merged
# into one file) exists in tweetfiles list, if it exists, remove from list
# to not include it in the following merge
if file_alltweets in tweetfiles:
tweetfiles.remove(file_alltweets)
# Go through all csv files and merge them into file_alltweets
@ -240,21 +271,26 @@ if tweetfiles:
with open(file, "rb") as f:
next(f) # skip the header
fout.write(f.read())
os.chdir(wd)
os.chdir(wd) # go back to wd
###################
# finish logging
# Report timing info.
timeEndMerge = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndMerge.strftime(fTimeFormat))
print("---")
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
# calculate times:
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # total execution time
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) # scraping time
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) # merge time
print(
f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
)
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
# close connection to logfiles.
sys.stdout.close()
sys.stderr.close()

166
collectSenData.py Normal file
View File

@ -0,0 +1,166 @@
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 23 21:49:11 2023
@author: Michael
collectSenData.py scrapes the accounts of senators for the following data: the
number of followers, the number of users the twitter account is following,
and how long the twitter account has existed.
# Requirements:
- snscrape 0.6.2.20230321+
- pandas 2.0+
# IMPORTANT:
This script uses snscrape Version 0.6.2.20230321.dev50+g0d824ab which is
included in 'snscrape/' as a git repository for better reproducibility. Earlier
versions of snscrape will most likely fail to scrape all tweets because of
certain rate limits or other errors that may occur.
Install snscrape from the local git repo to make sure that it fits the used version.
If snscrape shall be installed from the local repo, uncomment the following lines:
import subprocess
os.chdir('snscrape/')
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)
# How to use:
- To run the script, first adjust the directories and options in the lines below.
- Run the script; console output and errors are written to the logfiles in 'log/'.
"""
import os
import pandas as pd
import glob
import time
import sys
from datetime import datetime
import concurrent.futures
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS.csv"
# don't change this one
senCSVPath = wd + ud + senCSV
# Name of logfile
logfile = wd+"log/UserLog_"
###################
# Define Timespan & time-format
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = "2020-01-01T00:00:00Z" # start of scraping
ts_end = "2023-01-03T00:00:00Z" # end of straping
no_slices = 24 # Number of slices / time periods.
# file time format
fTimeFormat = "%Y-%m-%d_%H-%M-%S"
# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000
# Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
# sub-parameters can be accessed with dot notation, e.g. the user id is available as user.id
userDFColumns = [
"id",
"username",
"followersCount",
"friendsCount",
"verified",
"created"
]
#############################################################################
################## do NOT change anything below this line ###################
#############################################################################
from funs.Scrape import scrapeUsers, getHandles, printHandles
from funs.TimeSlice import convertTime
###################
# Create logfile & log all outputs
# there are three logfile types to be found in /log.
# should be self explanatory.
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".log"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".log"
sys.stderr = open(logfileErrors, "w")
sys.stdout = open(logfilen, "w")
###################
# Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = getHandles(di)
# Print accounts to be scraped
print(printHandles(accounts))
###################
# Scraping
# report time:
timeStartScrape = datetime.now()
print("Starting scraping at:")
print(timeStartScrape.strftime(fTimeFormat))
print("---")
listUsers = []
# Iterate over each Twitter account using multiprocessing
with concurrent.futures.ProcessPoolExecutor() as executor:
# List to store the scraping tasks
tasks = []
for handle in accounts:
# Schedule the scraping task
task = executor.submit(
scrapeUsers, handle, userDFColumns
)
tasks.append(task)
# Wait for all tasks to complete and retrieve results
for task in concurrent.futures.as_completed(tasks):
result = task.result()
listUsers.append(result)
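# note: results are collected in completion order, which may differ from the order of 'accounts'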
dfUsers = pd.DataFrame(listUsers, columns=userDFColumns)
dfUsers.to_csv(senCSVPath, encoding='utf-8')
# report time:
timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime(fTimeFormat))
# Report timing info.
timeEndMerge = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndMerge.strftime(fTimeFormat))
print("---")
# calculate times:
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # total execution time
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) # scraping time
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) # merge time
print(
f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
)
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
print(listUsers)
# close connection to logfiles.
sys.stdout.close()
sys.stderr.close()

144
createGraphs.py Normal file
View File

@ -0,0 +1,144 @@
#%%
#!/usr/bin/env python3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
import string
#%%
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 26 20:36:43 2023
@author: michael
"""
import pandas as pd
# import pyreadstat
# import numpy as np
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv" # SenatorsTweets-Final.csv SenatorsTweets-OnlyCov.csv
# Name of file that all senator data will be written to
senDataset = "senators-raw.csv"
# Name of new datafile generated
senCSVc = "SenatorsTweets-Final.csv"
senCSVcCov = "SenatorsTweets-OnlyCov.csv"
# Outfiles
wcAllTweetsF = "graphs/Wordcloud-All.png"
wcCovTweetsF = "graphs/Wordcloud-Cov.png"
TwCovTimeline = "graphs/Timeline.png"
# don't change this one
senCSVcPath = wd + ud + senCSVc
senCSVcCovPath = wd + ud + senCSVcCov
wcAllTweetsFPath = wd + ud + wcAllTweetsF
wcCovTweetsFPath = wd + ud + wcCovTweetsF
TwCovTimelinePath = wd + ud + TwCovTimeline
#%%
df = pd.read_csv(senCSVcPath, dtype=(object))
dfCov = pd.read_csv(senCSVcCovPath, dtype=(object))
#%%
df['cleanContent'] = df['rawContent'].apply(remove_URL)
df['cleanContent'] = df['cleanContent'].apply(remove_emoji)
df['cleanContent'] = df['cleanContent'].apply(remove_html)
df['cleanContent'] = df['cleanContent'].apply(remove_punct)
# create string with all cleaned tweets as text
str_alltweets = df['cleanContent'].astype(str).str.cat(sep=' ').casefold()
#%%
dfCov['cleanContent'] = dfCov['rawContent'].apply(remove_URL)
dfCov['cleanContent'] = dfCov['cleanContent'].apply(remove_emoji)
dfCov['cleanContent'] = dfCov['cleanContent'].apply(remove_html)
dfCov['cleanContent'] = dfCov['cleanContent'].apply(remove_punct)
# create string with all cleaned tweets as text
str_covtweets = dfCov['cleanContent'].astype(str).str.cat(sep=' ').casefold()
#%%
# replace single U and S characters
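# (single 'u' and 's' tokens are presumably remnants of abbreviations like 'U.S.' after punctuation removal)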
str_covtweets = str_covtweets.replace(' u ', ' ')
str_covtweets = str_covtweets.replace(' s ', ' ')
str_alltweets = str_alltweets.replace(' u ', ' ')
str_alltweets = str_alltweets.replace(' s ', ' ')
# %%
# create wordcloud alltweets
wcA = WordCloud(background_color="white", width=1000, height=1000, repeat=True)
wcA.generate(str_alltweets)
#%%
# draw
plt.figure( figsize=(20,20))
plt.axis("off")
plt.imshow(wcA, interpolation="bilinear")
fig1 = plt.gcf()
plt.show()
fig1.savefig(wcAllTweetsFPath)
# %%
# create wordcloud covtweets
wcC = WordCloud(background_color="white", width=1000, height=1000, repeat=True)
wcC.generate(str_covtweets)
#%%
# draw
plt.figure(figsize=(20, 20))
plt.axis("off")
plt.imshow(wcC, interpolation="bilinear")
fig2 = plt.gcf()
plt.show()
fig2.savefig(wcCovTweetsFPath)
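# The two wordcloud cells above repeat the same generate/draw/save steps; a small
# helper (a sketch, not part of the committed script) could factor them out:
def makeWordcloudSketch(text, outpath, size=1000, figsize=20):
    """Generate a wordcloud from text, display it and save it to outpath."""
    wc = WordCloud(background_color="white", width=size, height=size, repeat=True)
    wc.generate(text)
    plt.figure(figsize=(figsize, figsize))
    plt.axis("off")
    plt.imshow(wc, interpolation="bilinear")
    fig = plt.gcf()
    plt.show()
    fig.savefig(outpath)
    return wc

# Usage: makeWordcloudSketch(str_alltweets, wcAllTweetsFPath)
#        makeWordcloudSketch(str_covtweets, wcCovTweetsFPath)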
# %%
# with open('test.txt', 'w') as f:
# f.write(str_covtweets)
# %%
dfT = pd.DataFrame()
dfT['date'] = df['date'].copy()
dfT['count'] = 1
dfCovT = pd.DataFrame()
dfCovT['date'] = dfCov['date'].copy()
dfCovT['count'] = 1
#%%
dfT['date'] = pd.to_datetime(dfT['date']).dt.strftime('%Y-%m-%d')
dfCovT['date'] = pd.to_datetime(dfCovT['date']).dt.strftime('%Y-%m-%d')
#%%
dfT = dfT.groupby('date').count().reset_index()
dfCovT = dfCovT.groupby('date').count().reset_index()
#%%
import matplotlib.dates as mdates
# n of tweets overall
my_dpi = 300
plt.style.use('seaborn-darkgrid')  # 'seaborn-v0_8-darkgrid' on matplotlib >= 3.6
# one figure, sized in pixels via dpi; this is the figure that is drawn and saved below
fig, ax = plt.subplots(figsize=(1000/my_dpi, 1500/my_dpi), dpi=my_dpi)
ax.plot(dfCovT['date'], dfCovT['count'], marker='', color='tab:blue', linewidth=1, alpha=0.4)
ax.plot(dfT['date'], dfT['count'], marker='', color='tab:blue', linewidth=1, alpha=1)
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.xaxis.set_minor_locator(mdates.MonthLocator())
fig.autofmt_xdate()
fig.savefig(TwCovTimelinePath)
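# Note: strftime() above converts the dates back into strings, so matplotlib treats
# the x-axis as categorical and the mdates locators may not take effect. A sketch
# (assumption, not the committed code) that keeps real datetimes on the axis:
dailyAll = pd.to_datetime(df['date']).dt.floor('D').value_counts().sort_index()
dailyCov = pd.to_datetime(dfCov['date']).dt.floor('D').value_counts().sort_index()
# dailyAll.index is a DatetimeIndex, so MonthLocator spaces the ticks by real months:
# ax.plot(dailyCov.index, dailyCov.values, ...); ax.plot(dailyAll.index, dailyAll.values, ...)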
# %%

0
data/IN/.gitkeep Normal file
View File

View File

@ -0,0 +1,23 @@
opioid
gun violence
gun-violence
CHD
Coronary heart disease
addiction
tobacco
vaping
e-cigarette
shooting
indigenous women
overdose
meth
cocaine
separated children
separating children
separating families
Muslim travel ban
flu-season
flu season
Soleimani
Muslim Ban
USMCA trade deal

View File

@ -0,0 +1,23 @@
meth
gun violence
flu season
vaping
chd
addiction
indigenous women
separating children
tobacco
e-cigarette
muslim ban
soleimani
cocaine
separating families
muslim travel ban
usmca trade deal
shooting
overdose
separated children
coronary heart disease
gun-violence
opioid
flu-season

View File

@ -1,41 +1,60 @@
Coronavirus
Koronavirus
Corona
CDC
Wuhancoronavirus
Wuhanlockdown
Ncov
Wuhan
N95
Kungflu
Epidemic
plandemic
scamdemic
wuhan flu
wuhanflu
corona
coronavirusoutbreak
pandemic
epidemic
vax
antivax
antivaxxers
wearamask
masksoff
cdc
ncov
sars-cov-2
socialdistancing
wear a mask
lockdown
covd
coronavirus
koronavirus
corona
cdc
wuhancoronavirus
wuhanlockdown
ncov
wuhan
n95
kungflu
epidemic
outbreak
Sinophobia
China
sinophobia
covid-19
corona virus
covid
covid19
sars-cov-2
COVIDー19
COVD
covidー19
covd
pandemic
coronapocalypse
canceleverything
Coronials
SocialDistancingNow
Social Distancing
SocialDistancing
coronials
socialdistancingnow
social distancing
socialdistancing
panicbuy
panic buy
panicbuying
panic buying
14DayQuarantine
DuringMy14DayQuarantine
14dayquarantine
duringmy14dayquarantine
panic shop
panic shopping
panicshop
InMyQuarantineSurvivalKit
inmyquarantinesurvivalkit
panic-buy
panic-shop
coronakindness
@ -45,27 +64,27 @@ chinesevirus
stayhomechallenge
stay home challenge
sflockdown
DontBeASpreader
dontbeaspreader
lockdown
lock down
shelteringinplace
sheltering in place
staysafestayhome
staysafestayhome
stay safe stay home
trumppandemic
trump pandemic
flattenthecurve
flattenthecurve
flatten the curve
china virus
chinavirus
quarentinelife
PPEshortage
ppeshortage
saferathome
stayathome
stay at home
stay home
stayhome
GetMePPE
getmeppe
covidiot
epitwitter
pandemie
@ -73,43 +92,72 @@ wear a mask
wearamask
kung flu
covididiot
COVID__19
covid__19
omicron
variant
variant
vaccine
travel ban
corona
corona
coronavirus
coronavirus
covid
covid
covid19
covid19
covid-19
covid-19
sarscov2
sarscov2
sars cov2
sars cov 2
covid_19
covid_19
ncov
ncov
ncov2019
ncov2019
2019-ncov
2019-ncov
pandemic
pandemic 2019ncov
2019ncov
quarantine
quarantine
flatten the curve
flattening the curve
flatteningthecurve
flattenthecurve
hand sanitizer
handsanitizer
lockdown
lockdown
social distancing
socialdistancing
work from home
workfromhome
working from home
workingfromhome
ppe
n95
n95
covidiots
covidiots
herd immunity
herdimmunity
pneumonia
pneumonia
chinese virus
chinesevirus
wuhan virus
wuhanvirus
kung flu
kungflu
wearamask
wearamask
wear a mask
vaccine
vaccines
vaccine
vaccines
corona vaccine
corona vaccines
@ -136,5 +184,7 @@ wash ur hands
wash your hands
washurhands
washyourhands
stayathome
stayhome
selfisolating
self isolating
self isolating

20
data/IN/own_keywords.txt Normal file
View File

@ -0,0 +1,20 @@
plandemic
scamdemic
wuhan flu
wuhanflu
corona
coronavirusoutbreak
pandemic
epidemic
vax
antivax
antivaxxers
wearamask
masksoff
cdc
ncov
sars-cov-2
socialdistancing
wear a mask
lockdown
covd

View File

@ -0,0 +1,50 @@
1486474031419297799
1504880316506263552
1264663210197745665
1479500294887256069
1320058585590734852
1539003407096336388
1481704942574395392
1572014646374154240
1524764580806811649
1592940763515858944
1554529221594292224
1479488991347023876
1481715928492609541
1476722414100914179
1478478958740086790
1459285859358982148
1475620600228028432
1479459200229117955
1448386057339297797
1468993886316077063
1448369102318362625
1444354461799956482
1431340411193331715
1583474056011010048
1450479481278406658
1396992539010469894
1396992534623174658
1417920232333656076
1439553348122861568
1598398871990079489
1502768541979881479
1337604370981134336
1417797808707473410
1601693432292192256
1598145048989704192
1599906362380591110
1325851780496961538
1468908159330885632
1468332389923311616
1339703372505624577
1468633243654451200
1488290848907444240
1491146722625880064
1481766558313730053
1503078235373985795
1485398845718773762
1371501907483754497
1494398809245376513
1436328255959801865
1482862501461209089

View File

@ -0,0 +1,50 @@
1258402212327436288
1489758168750174209
1303698927766646785
1257681474670809090
1340109389672411136
1303698924444803072
1303698926902665218
1337595387796983809
1344441446515019777
1385680800218324992
1590129838261956608
1303698928609697796
1348715183502454793
1340418291274289153
1421228572732280835
1456349962942533637
1603457599877308416
1278354646885687296
1340418294579421188
1365866032792039425
1472722005657112578
1381021635772350464
1337598897217220609
1354797645261398016
1266806429282963456
1429847265242460161
1234272677633953792
1301581247932772352
1424832183148204043
1339255967809212416
1284831896988454912
1463528081214394377
1453679912938885122
1583474059148337152
1519791965113622528
1470775155110682628
1464615554103357450
1337595385565638657
1436055743418019840
1572208051830104069
1433765113891328002
1482774656075534336
1310288545886736384
1353845938566156289
1396992537202659329
1455712525362810883
1340384267327647747
1338588364459618305
1376696928692412419
1340386565399429123

View File

@ -1,112 +1,111 @@
name,id,state,state_short,party,class,ideology,start_serving,end_serving,time_in_office,not_in_office,last_congress,vote_share,next_closest_share,election_year,twitter_url,twitter_handle,alt_account,alt_handle,date_of_birth,female, ethnicity,edu_level,edu_information,occup_level,website_url,bioguide_link,Comments_1,Comments_2
"Alexander, Andrew L., Jr.",1,Tennessee,TN,0,2,0.681815808318192,01/07/2003,01/03/2021,18.0027397260274,1,116,61.9,31.8,2014,https://twitter.com/SenAlexander,SenAlexander,https://twitter.com/LamarAlexander ,LamarAlexander ,07/03/1940,0,White,8,J.D.; New York Univeristy; 1965,2,N/A,https://bioguide.congress.gov/search/bio/A000360,,
"Enzi, Mike",2,Wyoming,WY,0,2,0.719285383539398,01/03/1997,01/03/2021,24,1,116,72.3,17.6,2014,https://twitter.com/senatorenzi?lang=zh-Hant ,SenatorEnzi,N/A,N/A,02/01/1944,0,White,7,M.B.A.; Retail Marketing; Denver University; 1968,4,N/A,https://bioguide.congress.gov/search/bio/E000285,,
name,id,state,state_short,party,class,ideology,start_serving,end_serving,time_in_office,not_in_office,last_congress,vote_share,next_closest_share,election_year,twitter_url,twitter_handle,alt_account,alt_handle,date_of_birth,female,ethnicity,edu_level,edu_information,occup_level,website_url,bioguide_link,Comments_1,Comments_2
"Alexander, Andrew L., Jr.",1,Tennessee,TN,0,2,0.681815808318192,01/07/2003,01/03/2021,18.0027397260274,1,116,61.9,31.8,2014,https://twitter.com/SenAlexander,SenAlexander,https://twitter.com/LamarAlexander,LamarAlexander,07/03/1940,0,White,8,J.D.; New York Univeristy; 1965,2,N/A,https://bioguide.congress.gov/search/bio/A000360,,
"Enzi, Mike",2,Wyoming,WY,0,2,0.719285383539398,01/03/1997,01/03/2021,24,1,116,72.3,17.6,2014,https://twitter.com/senatorenzi,senatorenzi,N/A,N/A,02/01/1944,0,White,7,M.B.A.; Retail Marketing; Denver University; 1968,4,N/A,https://bioguide.congress.gov/search/bio/E000285,,
"Gardner, Cory",3,Colorado,CO,0,2,0.719285383539398,01/06/2015,01/03/2021,5.9972602739726,1,116,48.5,46,2014,https://twitter.com/CoryGardner,CoryGardner,https://twitter.com/corygardner,corygardner,08/22/1974,0,White,8,"J.D.; University of Colorado, Boulder; 2001",2,N/A,https://bioguide.congress.gov/search/bio/G000562,,
"Harris, Kamala",4,California ,CA,1,3,0.0213759569468058,01/03/2017,01/18/2021,4.04383561643836,1,116,62.4,37.6,2016,https://twitter.com/VP,VP,https://twitter.com/KamalaHarris,KamalaHarris,10/20/1964,1,African-American; Asian-American,8,J.D.; University of California; 1989,2,N/A,https://bioguide.congress.gov/search/bio/H001075,(became VP on jan 20 2021),
"Isakson, John",5,Georgia,GA,0,3,*,01/03/2005,12/31/2019,14,1,116,55,40.8,2016,https://twitter.com/SenatorIsakson ,SenatorIsakson,N/A,N/A,12/28/1944,0,White,6,"University of Georgia, Athens; 1966",1,N/A,https://bioguide.congress.gov/search/bio/I000055,(died in 2019),
"Jones, Gordon Douglas",6,Alabama,AL,1,2,0.632885678298333,01/03/2018,01/03/2021,3.0027397260274,1,116,49.9,48.4,2017,https://twitter.com/DougJones,DougJones,N/A,N/A,05/04/1954,0,White,8,"J.D.; Samford University, Cumberland School of Law; 1979",2,N/A,https://bioguide.congress.gov/search/bio/J000300/,special election to replace Jeff Sessions,
"Loeffler, Kelly",7,Georgia,GA,0,2,0.904293903291947,01/06/2020,01/20/2021,1.04109589041096,1,116,N/A,N/A,*,https://twitter.com/KLoeffler,KLoeffler,https://twitter.com/senatorloeffler ,senatorloeffler ,11/27/1970,1,White,7,M.B.A.; Internationla Finance and Marketing; DePaul University Chicago; 1999,1,N/A,https://bioguide.congress.gov/search/bio/L000594,Appointed in 2019 after the resignation of Johnny Isakson but lost the 2020 election,
"McSally, Martha",8,Arizona,AZ,0,2,*,01/03/2015,01/03/2019,1,1,116,N/A,N/A,*,https://twitter.com/MarthaMcSallyAZ,MarthaMcSallyAZ,https://twitter.com/marthamcsally,marthamcsally,03/22/1966,1,White,7,M.P.P.; John F. Kennedy School of Government,3,N/A,https://bioguide.congress.gov/search/bio/M001197,(left office Dec 2 2020),appointed in 2018 after death of John McCain but lot 2020 election
"Perdue, David",9,Georgia,GA,0,2,0.914979462126755,01/06/2015,01/03/2021,5.9972602739726,1,116,53,45.1,2014,https://twitter.com/DavidPerdueGA,DavidPerdueGA,https://twitter.com/sendavidperdue,sendavidperdue,12/10/1949,0,White,7,M.S.; Georgia Institute of Technology; 1976,1,N/A,https://bioguide.congress.gov/search/bio/P000612,,
"Roberts, Charles Patrick",10,Kansas,KS,0,2,0.822995787870405,01/07/1997,01/03/2021,24.0054794520548,1,116,53.3,42.5,2014,https://twitter.com/SenPatRoberts,SenPatRoberts,https://twitter.com/PatRoberts,PatRoberts,04/20/1936,0,White,6,"B.A.; Kansas State university, Manhattan; 1958",7,N/A,https://bioguide.congress.gov/search/bio/R000307,,
"Udall, Tom",11,New Mexico,NM,1,2,0.259828450248573,01/06/2009,01/03/2021,12,1,116,55.4,44.6,2014,https://twitter.com/SenatorTomUdall,SenatorTomUdall,https://twitter.com/tomudall,tomudall,05/18/1948,0,White,8,"J.D.; University of New Mexico School of Law, Albuquerque, N.M.; 1977",2,N/A,https://bioguide.congress.gov/search/bio/U000039,,
"Baldwin, Tammy",12,Wisconsin,WI,1,1,0.176999238019796,01/03/2013,12/31/2022,9.9972602739726,0,117,55.4,44.6,2018,https://twitter.com/SenatorBaldwin,SenatorBaldwin,https://twitter.com/tammybaldwin,tammybaldwin,02/11/1962,1,White,8,"J.D.; University of Wisconsin, Madison; 1989",2,https://www.baldwin.senate.gov/,https://bioguide.congress.gov/search/bio/B001230,,
"Barrasso, John",13,Wyoming,WY,0,1,0.817902617377421,06/22/2007,12/31/2022,15.5369863013699,0,117,67.1,30.1,2018,https://twitter.com/SenJohnBarrasso,SenJohnBarrasso,https://twitter.com/barrassoforwyo,barrassoforwyo,07/21/1952,0,White,7,M.D.; Georgetown University School of Medicine; 1978,6,https://www.barrasso.senate.gov/,https://bioguide.congress.gov/search/bio/B001261,,
"Bennet, Michael F.",14,Colorado,CO,1,3,0.248044568735702,01/21/2009,12/31/2022,13.9506849315069,0,117,49.1,45.4,2016,https://twitter.com/SenatorBennet,SenatorBennet,https://twitter.com/michaelbennet,michaelbennet,11/28/1964,0,White,8,J.D.; Yale Law School; 1993,2,https://www.bennet.senate.gov/,https://bioguide.congress.gov/search/bio/B001267,,
"Blackburn, Marsha",15,Tennessee,TN,0,1,0.93228239890635,01/03/2019,12/31/2022,3.99452054794521,0,117,54.7,43.9,2018,https://twitter.com/MarshaBlackburn,MarshaBlackburn,N/A,N/A,06/06/1952,1,White,6,"B.S.; Home Economics; Mississippi State University, Starkville; 1973",1,https://www.blackburn.senate.gov/,https://bioguide.congress.gov/search/bio/B001243,,
"Blumenthal, Richard",16,Connecticut,CT,1,3,0.0310655954121906,01/03/2010,12/31/2022,13,0,117,62.9,34.9,2016,https://twitter.com/SenBlumenthal,SenBlumenthal,N/A,N/A,02/13/1946,0,White,8,J.D.; Yale University; 1973,2,https://www.blumenthal.senate.gov/,https://bioguide.congress.gov/search/bio/B001277,,
"Blunt, Roy",17,Missouri,MO,0,3,0.584409139223541,01/03/2011,12/31/2022,12,1,117,49.4,46.2,2016,https://twitter.com/RoyBlunt,RoyBlunt,N/A,N/A,01/10/1950,0,White,7,"M.A.; Missouri State University ,Springfield; 1972",5,N/A,https://bioguide.congress.gov/search/bio/B000575,,
"Booker, Cory A.",18,New Jersey,NJ,1,2,0.0455802980872292,10/31/2013,12/31/2022,12,0,117,57.2,40.9,2020,https://twitter.com/senbooker,senbooker,https://twitter.com/CoryBooker,CoryBooker,04/27/1969,0,African-American; Asian-American,8,J.D.; Yale Law School; 1997,2,https://www.booker.senate.gov/,https://bioguide.congress.gov/search/bio/B001288,,
"Boozman, John",19,Arkansas,AR,0,3,0.768699282926499,01/05/2011,12/31/2022,11.9945205479452,0,117,59.8,36.2,2016,https://twitter.com/JohnBoozman,JohnBoozman,N/A,N/A,12/10/1950,0,White,6,Southern College of Optometry; 1977,6,https://www.boozman.senate.gov/,https://bioguide.congress.gov/search/bio/B001236,,
"Braun, Michael",20,Indiana,IN,0,1,0.98106874319906,01/03/2019,12/31/2022,3.99452054794521,0,117,50.9,45,2018,https://twitter.com/SenatorBraun,SenatorBraun,N/A,N/A,03/24/1954,0,White,7,M.B.A.; Harvard Business School; 1978,1,https://www.braun.senate.gov/,https://bioguide.congress.gov/search/bio/B001310,,
"Brown, Sherrod",21,Ohio,OH,1,1,0.0923940264109351,01/04/2007,12/31/2022,16,0,117,53.4,46.6,2018,https://twitter.com/SenSherrodBrown,SenSherrodBrown,https://twitter.com/SherrodBrown,SherrodBrown,11/09/1952,0,White,7,M.a.; Education; Ohio State University; 1981,5,https://www.brown.senate.gov/,https://bioguide.congress.gov/search/bio/B000944,,
"Burr, Richard",22,North Carolina,NC,0,3,0.605472891780936,01/03/2001,12/31/2022,22.0054794520548,1,117,51.1,45.3,2016,https://twitter.com/SenatorBurr,SenatorBurr,N/A,N/A,11/30/1955,0,White,6,B.A.; Communications; Wake Forest University; 1978,1,N/A,https://bioguide.congress.gov/search/bio/B001135,,
"Cantwell, Maria",23,Washington,WA,1,1,0.216591445478212,01/03/2001,12/31/2022,22.0054794520548,0,117,58.4,41.6,2018,https://twitter.com/SenatorCantwell,SenatorCantwell,N/A,N/A,10/13/1958,1,White,6,B.A.; Public Administration; Miami University of Ohio; 1980,1,https://www.cantwell.senate.gov/,https://bioguide.congress.gov/search/bio/C000127,,
"Capito, Shelley Moore",24,West Virginia,WV,0,2,0.61478303011512,01/06/2015,12/31/2022,7.98904109589041,0,117,70.3,27,2020,https://twitter.com/SenCapito,SenCapito,N/A,N/A,11/26/1953,1,White,7,M. Ed.; University of Virginia; 1976,5,https://www.capito.senate.gov/,https://bioguide.congress.gov/search/bio/C001047,,
"Cardin, Benjamin L.",25,Maryland,MD,1,1,0.1994990268606,01/04/2007,12/31/2022,16,0,117,64.9,30.3,2018,https://twitter.com/SenatorCardin,SenatorCardin,N/A,N/A,10/05/1943,0,White,8,J.D.; University of Maryland; 1967,2,https://www.cardin.senate.gov/,https://bioguide.congress.gov/search/bio/C000141,,
"Carper, Thomas R.",26,Delaware,DE,1,1,0.309479384969288,01/03/2001,12/31/2022,22.0054794520548,0,117,60,37.8,2018,https://twitter.com/SenatorCarper,SenatorCarper,N/A,N/A,01/23/1947,0,White,7,M.B.A.; University of Delaware; 1975,3,https://www.carper.senate.gov/,https://bioguide.congress.gov/search/bio/C000174,,
"Casey, Robert P., Jr.",27,Pennsylvania,PA,1,1,0.171897216341815,01/04/2007,12/31/2022,16,0,117,55.7,42.6,2018,https://twitter.com/SenBobCasey,SenBobCasey,https://twitter.com/Bob_Casey,Bob_Casey,04/13/1960,0,White,8,J.D.; Catholic University of America; 1988,2,https://www.casey.senate.gov/,https://bioguide.congress.gov/search/bio/C001070,,
"Cassidy, Bill",28,Louisiana,LA,0,2,0.682348710788942,01/06/2015,12/31/2022,7.98904109589041,0,117,59.3,19,2020,https://twitter.com/SenBillCassidy,SenBillCassidy,https://twitter.com/BillCassidy,BillCassidy,09/28/1957,0,White,7,M.D.; Louisiana State University; 1979,6,https://www.cassidy.senate.gov/,https://bioguide.congress.gov/search/bio/C001075,,
"Collins, Susan M.",29,Maine,ME,0,2,0.448622425849401,01/07/1997,12/31/2022,25.9972602739726,0,117,51,42.4,2020,https://twitter.com/SenatorCollins,SenatorCollins,N/A,N/A,12/07/1952,1,White,6,Bachelor in Government; St. Lawrence University; 1975,0,https://www.collins.senate.gov/,https://bioguide.congress.gov/search/bio/C001035,,
"Coons, Christopher A.",30,Delaware,DE,1,2,0.338422715351401,11/15/2010,12/31/2022,12.1342465753425,0,117,59.4,37.9,2020,https://twitter.com/ChrisCoons,ChrisCoons,N/A,N/A,09/09/1963,0,White,8,J.D.; Yale Law School; 1992,2,https://www.coons.senate.gov/,https://bioguide.congress.gov/search/bio/C001088,,
"Cornyn, John",31,Texas,TX,0,2,0.772226738391321,11/30/2002,12/31/2022,20.0986301369863,0,117,53.5,43.9,2020,https://twitter.com/JohnCornyn,JohnCornyn,N/A,N/A,02/02/1952,0,White,8,J.D.; St. Mary<72>s School of Law; 1977,2,https://www.cornyn.senate.gov/,https://bioguide.congress.gov/search/bio/C001056,,
"Cortez Masto, Catherine",32,Nevada,NV,1,3,0.236574567369409,01/03/2017,12/31/2022,5.99452054794521,0,117,47.1,44.7,2016,https://twitter.com/SenCortezMasto,SenCortezMasto,https://twitter.com/CortezMasto,CortezMasto,03/29/1964,1,Hispanic; White,8,J.D.; Gonzaga University School of Law; 1990,2,https://www.cortezmasto.senate.gov/,https://bioguide.congress.gov/search/bio/C001113,,
"Cotton, Tom",33,Arkansas,AR,0,2,0.876390364042756,01/06/2015,12/31/2022,7.98904109589041,0,117,66.5,33.5,2020,https://twitter.com/SenTomCotton,SenTomCotton,https://twitter.com/TomCottonAR,TomCottonAR,05/13/1977,0,White,8,J.D.; Harvard University; 2002,2,https://www.cotton.senate.gov/,https://bioguide.congress.gov/search/bio/C001095,,
"Cramer, Kevin",34,North Dakota,ND,0,1,0.910896298032277,01/03/2019,12/31/2022,3.99452054794521,0,117,55.5,44.5,2018,https://twitter.com/SenKevinCramer,SenKevinCramer,https://twitter.com/kevincramer,kevincramer,01/21/1961,0,White,7,M.A.; Management; University o fMary; 2003,0,https://www.cramer.senate.gov/,https://bioguide.congress.gov/search/bio/C001096,,
"Crapo, Michael",35,Idaho,ID,0,3,0.823331951918519,01/06/1999,12/31/2022,24,0,117,66.1,27.8,2016,https://twitter.com/MikeCrapo,MikeCrapo,N/A,N/A,05/20/1951,0,White,8,J.D.; Harvard University; 1977,2,https://www.crapo.senate.gov/,https://bioguide.congress.gov/search/bio/C000880,,
"Cruz, Ted",36,Texas,TX,0,1,0.944056385174951,01/03/2013,12/31/2022,9.9972602739726,0,117,50.9,48.3,2018,https://twitter.com/SenTedCruz,SenTedCruz,https://twitter.com/tedcruz,tedcruz,12/22/1970,0,Hispanic; White,8,J.D.; Harvard University; 1995,2,https://www.cruz.senate.gov/,https://bioguide.congress.gov/search/bio/C001098,,
"Daines, Steve",37,Montana,MT,0,2,0.859322244752884,01/06/2015,12/31/2022,7.98904109589041,0,117,55,45,2020,https://twitter.com/SteveDaines,SteveDaines,N/A,N/A,08/20/1962,0,White,6,B.S.; Chemical Engineering; Montana State University; 1984,1,https://www.daines.senate.gov/,https://bioguide.congress.gov/search/bio/D000618,,
"Duckworth, Tammy",38,Illinois,IL,1,3,0.0944404184553066,01/03/2017,12/31/2022,5.99452054794521,0,117,54.4,40.2,2016,https://twitter.com/SenDuckworth,SenDuckworth,https://twitter.com/tammyduckworth,tammyduckworth,03/12/1968,1,Asian; White,8,PhD in human services; Capella University School of Public Service Leadership; 2015,3,https://www.duckworth.senate.gov/,https://bioguide.congress.gov/search/bio/D000622,,
"Durbin, Richard J.",39,Illinois,IL,1,2,0.0855733771029607,01/07/1997,12/31/2022,25.9972602739726,0,117,54.9,38.9,2020,https://twitter.com/SenatorDurbin,SenatorDurbin,https://twitter.com/DickDurbin,DickDurbin,11/21/1944,0,White,8,J.D.; Georgetown University; 1969,2,https://www.durbin.senate.gov/,https://bioguide.congress.gov/search/bio/D000563,,
"Ernst, Joni",40,Iowa,IA,0,2,0.826265400967212,01/06/2015,12/31/2022,7.98904109589041,0,117,51.8,45.2,2020,https://twitter.com/SenJoniErnst,SenJoniErnst,https://twitter.com/joniernst,joniernst,07/01/1970,1,White,7,M.P.A.; Columbus State University; 1995,3,https://www.ernst.senate.gov/,https://bioguide.congress.gov/search/bio/E000295,,
"Feinstein, Dianne",41,California,CA,1,1,0.150865658191444,11/10/1992,12/31/2022,30.158904109589,0,117,54.2,45.8,2018,https://twitter.com/SenFeinstein,SenFeinstein,https://twitter.com/DianneFeinstein,DianneFeinstein,06/22/1933,1,White,6,B.A.; History; Stanford University; 1955,0,https://www.feinstein.senate.gov/public/,https://bioguide.congress.gov/search/bio/F000062,,
"Fischer, Debra",42,Nebraska,NE,0,1,0.688576408222131,01/03/2013,12/31/2022,9.9972602739726,0,117,57.7,38.6,2018,https://twitter.com/SenatorFischer,SenatorFischer,N/A,N/A,03/01/1951,1,White,6,B.S.; Education; University of Nebraska; 1988,0,https://www.fischer.senate.gov/,https://bioguide.congress.gov/search/bio/F000463,,
"Gillibrand, Kirsten E.",43,New York,NY,1,1,0.12072202063417,01/27/2009,12/31/2022,13.9342465753425,0,117,67,33,2018,https://twitter.com/SenGillibrand,SenGillibrand,https://twitter.com/gillibrandny,gillibrandny,12/09/1966,1,White,8,J.D.; University of California; 1991,2,https://www.gillibrand.senate.gov/,https://bioguide.congress.gov/search/bio/G000555,,
"Graham, Lindsey",44,South Carolina,SC,0,2,0.619070797359753,01/07/2003,12/31/2022,19.9945205479452,0,117,54.5,44.2,2020,https://twitter.com/LindseyGrahamSC,LindseyGrahamSC,https://twitter.com/grahamblog,grahamblog,07/09/1955,0,White,8,J.D.; University of South Carolina; 1981,2,https://www.lgraham.senate.gov/,https://bioguide.congress.gov/search/bio/G000359 ,,
"Grassley, Chuck",45,Iowa,IA,0,3,0.670073592619545,01/05/1981,12/31/2022,42.013698630137,0,117,60.2,35.7,2016,https://twitter.com/ChuckGrassley,ChuckGrassley,N/A,N/A,09/17/1933,0,White,7,M.A.; Political Science; University of Northern Iowa; 1956,0,https://www.grassley.senate.gov/,https://bioguide.congress.gov/search/bio/G000386,,
"Hagerty, Bill",46,Tennessee,TN,0,2,0.857410027434407,01/03/2021,12/31/2022,1.99178082191781,0,117,62.2,35.2,2020,https://twitter.com/SenatorHagerty,SenatorHagerty,https://twitter.com/billhagertytn,billhagertytn,08/14/1959,0,White,8,J.D.; Vanderbilt Law School; 1984,0,https://www.hagerty.senate.gov/,https://bioguide.congress.gov/search/bio/H000601,,
"Hassan, Margaret Wood",47,New Hampshire,NH,1,3,0.43611907238278,01/03/2017,12/31/2022,5.99452054794521,0,117,48,47.9,2016,https://twitter.com/SenatorHassan,SenatorHassan,https://twitter.com/Maggie_Hassan,Maggie_Hassan,02/27/1958,1,White,8,J.D.; Northeastern University School of law; 1985,11,https://www.hassan.senate.gov/,https://bioguide.congress.gov/search/bio/H001076,,
"Hawley, Josh",48,Missouri,MO,0,1,0.864366195602263,01/03/2019,12/31/2022,3.99452054794521,0,117,51.4,45.6,2018,https://twitter.com/HawleyMO,HawleyMO,N/A,N/A,12/31/1979,0,White,8,J.D.; Yale Law School; 2006,2,https://www.hawley.senate.gov/,https://bioguide.congress.gov/search/bio/H001089,,
"Heinrich, Martin",49,New Mexico,NM,1,1,0.2007037353465,01/03/2013,12/31/2022,9.9972602739726,0,117,54.1,30.5,2018,https://twitter.com/MartinHeinrich,MartinHeinrich,N/A,N/A,10/17/1971,0,White,6,B.S.; Mechanical Engineering; University of Missouri; 1995,12,https://www.heinrich.senate.gov/,https://bioguide.congress.gov/search/bio/H001046,,
"Hickenlooper, John W.",50,Colorado,CO,1,2,0.335030323955882,01/03/2021,12/31/2022,1.99178082191781,0,117,53.5,44.2,2020,https://twitter.com/SenatorHick,SenatorHick,https://twitter.com/hickenlooper,hickenlooper,02/07/1952,0,White,7,M.A.; Geology; Wesleyan University; 1980,0,https://www.hickenlooper.senate.gov/,https://bioguide.congress.gov/search/bio/H000273,,
"Hirono, Mazie K.",51,Hawaii,HI,1,1,0.0715447123166643,01/03/2013,12/31/2022,9.9972602739726,0,117,71.2,28.8,2018,https://twitter.com/maziehirono,maziehirono,https://twitter.com/mazieforhawaii,mazieforhawaii,11/03/1947,1,Asian,8,J.D.; Georgetown University; 1978,0,https://www.hirono.senate.gov/,https://bioguide.congress.gov/search/bio/H001042,,
"Hoeven, John",52,North Dakota,ND,0,3,0.815683863264003,01/05/2011,12/31/2022,11.9945205479452,0,117,78.6,17,2016,https://twitter.com/SenJohnHoeven,SenJohnHoeven,N/A,N/A,03/13/1957,0,White,7,M.B.A.; Northwestern University; 1981,12,https://www.hoeven.senate.gov/,https://bioguide.congress.gov/search/bio/H001061,,
"Hyde-Smith, Cindy",53,Mississippi,MS,0,2,0.868059764299163,04/09/2018,12/31/2022,4.73150684931507,0,117,54.1,44.1,2020,https://twitter.com/SenHydeSmith,SenHydeSmith,https://twitter.com/cindyhydesmith,cindyhydesmith,05/10/1959,1,White,6,"B.A.; Criminal justice, political science; University of Southern Mississippi; 1981",0,https://www.hydesmith.senate.gov/,https://bioguide.congress.gov/search/bio/H001079 ,,
"Inhofe, James",54,Oklahoma,OK,0,2,0.880238318204784,11/17/1994,12/31/2022,28.1397260273973,1,117,62.9,32.8,2020,https://twitter.com/JimInhofe,JimInhofe,N/A,N/A,11/17/1934,0,White,6,B.A.; Economics; University of Tulsa; 1973,0,N/A,https://bioguide.congress.gov/search/bio/I000024 ,,
"Johnson, Ron",55,Wisconsin,WI,0,3,0.743401705863958,01/05/2011,12/31/2022,11.9945205479452,0,117,50.2,46.8,2016,https://twitter.com/SenRonJohnson,SenRonJohnson,https://twitter.com/ronjohnsonwi,ronjohnsonwi,04/08/1955,0,White,6,B.S.; Business and Accounting; University of Minnesota; 1977,4,https://www.ronjohnson.senate.gov/,https://bioguide.congress.gov/search/bio/J000293,,
"Kaine, Tim",56,Virginia,VA,1,1,0.203600708089391,01/03/2013,12/31/2022,9.9972602739726,0,117,57.1,41.1,2018,https://twitter.com/timkaine,timkaine,N/A,N/A,02/26/1958,0,White,8,J.D.; Harvard University; 1983,11,https://www.kaine.senate.gov/,https://bioguide.congress.gov/search/bio/K000384,,
"Kelly, Mark",57,Arizona,AZ,1,3,0.399793347847799,12/02/2020,12/31/2022,2.07945205479452,0,117,51.2,48.8,2020,https://twitter.com/SenMarkKelly,SenMarkKelly,https://twitter.com/CaptMarkKelly,CaptMarkKelly,02/21/1964,0,White,7,M.S.; Aeronautical Engineering; U.S. Naval Postgraduate School,3,https://www.kelly.senate.gov/,https://bioguide.congress.gov/search/bio/K000377,,
"Kennedy, John Neely",58,Louisiana,LA,0,3,0.785684351248518,01/03/2017,12/31/2022,5.99452054794521,0,117,60.7,39.3,2016,https://twitter.com/SenJohnKennedy,SenJohnKennedy,https://twitter.com/JohnKennedyLA,JohnKennedyLA,11/21/1951,0,White,8,J.D.; University of Virginia School of LAw; 1977,11,https://www.kennedy.senate.gov/,https://bioguide.congress.gov/search/bio/K000393,,
"King, Angus S., Jr.",59,Maine,ME,2,1,0.346033257048853,01/03/2013,12/31/2022,9.9972602739726,0,117,54.3,35.2,2018,https://twitter.com/SenAngusKing,SenAngusKing,N/A,N/A,03/31/1944,0,White,8,J.D.; University of Virginia; 1969,2,https://www.king.senate.gov/,https://bioguide.congress.gov/search/bio/K000383 ,,
"Klobuchar, Amy",60,Minnesota,MN,1,1,0.130504324943533,01/04/2007,12/31/2022,16,0,117,60.3,36.2,2018,https://twitter.com/SenAmyKlobuchar,SenAmyKlobuchar,https://twitter.com/amyklobuchar,amyklobuchar,05/25/1960,1,White,8,"J.D.; University of Chicago, 1985",2,https://www.klobuchar.senate.gov/,https://bioguide.congress.gov/search/bio/K000367 ,,
"Lankford, James",61,Oklahoma,OK,0,3,0.89992933687588,01/03/2015,12/31/2022,7.9972602739726,0,117,67.7,24.6,2016,https://twitter.com/SenatorLankford,SenatorLankford,https://twitter.com/jameslankford,jameslankford,03/04/1968,0,White,7,M.Div.; Southwestern Theological Baptist Seminary; 1994,5,https://www.lankford.senate.gov/,https://bioguide.congress.gov/search/bio/L000575,,
"Leahy, Patrick",62,Vermont,VT,1,3,0.144121081911654,01/14/1975,12/31/2022,47.9945205479452,1,117,61.3,33,2016,https://twitter.com/SenatorLeahy,SenatorLeahy,N/A,N/A,03/31/1940,0,White,8,J.D.; Georgetown University; 1964,2,N/A,https://bioguide.congress.gov/search/bio/L000174,,
"Lee, Mike",63,Utah,UT,0,3,0.753748787807473,01/05/2011,12/31/2022,11.9945205479452,0,117,68,27.4,2016,https://twitter.com/SenMikeLee,SenMikeLee,https://twitter.com/BasedMikeLee,BasedMikeLee,06/04/1971,0,White,8,J.D.; Brigham Young university; 1997,2,https://www.lee.senate.gov/,https://bioguide.congress.gov/search/bio/L000577,,
"Luj<EFBFBD>n, Ben Ray",64,New Mexico,NM,1,2,0.174860888138848,01/03/2021,12/31/2022,1.99178082191781,0,117,51.7,45.6,2020,https://twitter.com/SenatorLujan,SenatorLujan,https://twitter.com/benraylujan,benraylujan,06/07/1972,0,Hispanic,6,B.B.A.; New Mexico Highlands University; 2007,0,https://www.lujan.senate.gov/,https://bioguide.congress.gov/search/bio/L000570 ,,
"Lummis, Cynthia M.",65,Wyoming,WY,0,2,0.893292958108508,01/03/2021,12/31/2022,1.99178082191781,0,117,73.1,26.9,2020,https://twitter.com/SenLummis,SenLummis,https://twitter.com/CynthiaMLummis,CynthiaMLummis,09/10/1954,1,White,8,"J.D.; University of Wyoming College of Law, Laramie, Wyo.; 1985",11,https://www.lummis.senate.gov/,https://bioguide.congress.gov/search/bio/L000571 ,,
"Manchin, Joe, III",66,West Virginia,WV,1,1,0.446686774398077,11/15/2010,12/31/2022,12.1342465753425,0,117,49.6,46.3,2018,https://twitter.com/Sen_JoeManchin,Sen_JoeManchin,https://twitter.com/JoeManchinWV,JoeManchinWV,08/24/1947,0,White,6,B.A.; Business Administration; West Virginia University; 1970,12,https://www.manchin.senate.gov/,https://bioguide.congress.gov/search/bio/M001183 ,,
"Markey, Edward J.",67,Massachusetts,MA,1,2,0.0139659683705929,07/16/2013,12/31/2022,9.46575342465753,0,117,66.2,33,2020,https://twitter.com/SenMarkey,SenMarkey,https://twitter.com/edmarkey,edmarkey,07/11/1946,0,White,8,J.D.; Boston College Law School; 1972,11,https://www.markey.senate.gov/,https://bioguide.congress.gov/search/bio/M000133,,
"Marshall, Roger",68,Kansas,KS,0,2,0.882124792228652,01/03/2021,12/31/2022,1.99178082191781,0,117,53.2,41.8,2020,https://twitter.com/SenatorMarshall,SenatorMarshall,https://twitter.com/RogerMarshallMD,RogerMarshallMD,08/09/1960,0,White,7,M.D.; University of Kansas School of Medicine; 1987,6,https://www.marshall.senate.gov/,https://bioguide.congress.gov/search/bio/M001198,,
"McConnell, Mitch",69,Kentucky,KY,0,2,0.599687533584357,01/03/1985,12/31/2022,38.0164383561644,0,117,57.8,38.2,2020,https://twitter.com/LeaderMcConnell,LeaderMcConnell,N/A,N/A,02/20/1942,0,White,8,J.D.; Kentucky Law School; 1967,11,https://www.mcconnell.senate.gov/,https://bioguide.congress.gov/search/bio/M000355,,
"Menendez, Robert",70,New Jersey,NJ,1,1,0.191515157461704,01/18/2006,12/31/2022,16.9616438356164,0,117,54,42.8,2018,https://twitter.com/SenatorMenendez,SenatorMenendez,N/A,N/A,01/01/1954,0,Hispanic,8,J.D.; Rutgers university of Law; 1979,11,https://www.menendez.senate.gov/,https://bioguide.congress.gov/search/bio/M000639,,
"Merkley, Jeff",71,Oregon,OR,1,2,0.0355414098997263,01/06/2009,12/31/2022,13.9917808219178,0,117,56.9,39.3,2020,https://twitter.com/SenJeffMerkley,SenJeffMerkley,https://twitter.com/jeffmerkley,jeffmerkley,10/24/1956,0,White,7,M.P.A.; Princeton University; 1982,0,https://www.merkley.senate.gov/,https://bioguide.congress.gov/search/bio/M001176,,
"Moran, Jerry",72,Kansas,KS,0,3,0.716270292467902,01/05/2011,12/31/2022,11.9945205479452,0,117,62.4,32.1,2016,https://twitter.com/JerryMoran,JerryMoran,N/A,N/A,05/29/1954,0,White,8,J.D.; Kansas University School of Law; 1981,11,https://www.moran.senate.gov/public/,https://bioguide.congress.gov/search/bio/M000934 ,,
"Murkowski, Lisa",73,Alaska,AK,0,3,0.473296745648617,12/20/2002,12/31/2022,20.0438356164384,0,117,44.3,29.5,2016,https://twitter.com/lisamurkowski,lisamurkowski,https://twitter.com/lisaforsenate,lisaforsenate,05/22/1957,1,White,8,J.D.; Willamette College of Law; 1985,2,https://www.murkowski.senate.gov/,https://bioguide.congress.gov/search/bio/M001153,,
"Murphy, Christopher",74,Connecticut,CT,1,1,0.152635018959264,01/03/2013,12/31/2022,9.9972602739726,0,117,59.5,39.4,2018,https://twitter.com/ChrisMurphyCT,ChrisMurphyCT,N/A,N/A,08/03/1973,0,White,8,J.D.; University of Connecticut; 2002,11,https://www.murphy.senate.gov/,https://bioguide.congress.gov/search/bio/M001169,,
"Murray, Patty",75,Washington,WA,1,3,0.142703588817088,01/05/1993,12/31/2022,30.0054794520548,0,117,59.1,40.9,2016,https://twitter.com/PattyMurray,PattyMurray,https://twitter.com/murraycampaign,murraycampaign,10/11/1950,1,White,6,B.A.; Physical Education; Washington State University; 1972,5,https://www.murray.senate.gov/,https://bioguide.congress.gov/search/bio/M001111,,
"Ossoff, Jon",76,Georgia,GA,1,2,0.303405364928085,01/20/2021,12/31/2022,1.94520547945205,0,117,50.6,49.4,2020,https://twitter.com/SenOssoff,SenOssoff,https://twitter.com/ossoff,ossoff,02/16/1987,0,White,7,M.S.; International Politicla Economy; London School of Economics; 2013,7,https://www.ossoff.senate.gov/,https://bioguide.congress.gov/search/bio/O000174,,
"Padilla, Alex",77,California,CA,1,3,0.0200324383981554,01/20/2021,12/31/2022,1.94520547945205,0,117,N/A,N/A,*,https://twitter.com/SenAlexPadilla,SenAlexPadilla,https://twitter.com/AlexPadilla4CA,AlexPadilla4CA,03/22/1973,0,Hispanic,6,B.S.; Mechanical Engineering; MIT; 1994,9,https://www.padilla.senate.gov/,https://bioguide.congress.gov/search/bio/P000145,appointed in 2020 to replace Kamala Harris ,
"Paul, Rand",78,Kentucky,KY,0,3,0.684883322748808,01/05/2011,12/31/2022,11.9945205479452,0,117,57.3,42.7,2016,https://twitter.com/senrandpaul,senrandpaul,https://twitter.com/RandPaul,RandPaul,01/07/1963,0,White,7,M.D.; Duke University; 1988,6,https://www.paul.senate.gov/,https://bioguide.congress.gov/search/bio/P000603,,
"Peters, Gary C.",79,Michigan,MI,1,2,0.355796587683312,01/06/2015,12/31/2022,7.98904109589041,0,117,49.9,48.2,2020,https://twitter.com/SenGaryPeters,SenGaryPeters,https://twitter.com/garypeters,garypeters,12/01/1958,0,White,8,J.D.; Wayne State University; 1989,2,https://www.peters.senate.gov/,https://bioguide.congress.gov/search/bio/P000595,,
"Portman, Robert",80,Ohio,OH,0,3,0.548120690430407,01/05/2011,12/31/2022,11.9945205479452,1,117,58.3,36.9,2016,https://twitter.com/senrobportman,senrobportman,N/A,N/A,12/19/1955,0,White,8,J.D.; University of Michigan; 1985,2,N/A,https://bioguide.congress.gov/search/bio/P000449,,
"Reed, John F.",81,Rhode Island,RI,1,2,0.145861826443275,01/07/1997,12/31/2022,25.9972602739726,0,117,66.6,33.4,2020,https://twitter.com/SenJackReed,SenJackReed,N/A,N/A,11/12/1949,0,White,8,J.D.; Harvard University; 1982,2,https://www.reed.senate.gov/,https://bioguide.congress.gov/search/bio/R000122,,
"Risch, James E.",82,Idaho,ID,0,2,0.82910906209038,01/06/2009,12/31/2022,13.9917808219178,0,117,62.6,33.2,2020,https://twitter.com/SenatorRisch,SenatorRisch,N/A,N/A,05/03/1943,0,White,8,J.D.; University of Idaho; 1968,2,https://www.risch.senate.gov/,https://bioguide.congress.gov/search/bio/R000584,,
"Romney, Mitt",83,Utah,UT,0,1,0.596688837978771,01/03/2019,12/31/2022,3.99452054794521,0,117,62.6,30.9,2018,https://twitter.com/SenatorRomney,SenatorRomney,https://twitter.com/mittromney,mittromney,03/12/1947,0,White,7,M.B.A.; Harvard Business School; 1975,1,https://www.romney.senate.gov/,https://bioguide.congress.gov/search/bio/R000615,,
"Rosen, Jacky",84,Nevada,NV,1,1,0.308548351377894,01/03/2019,12/31/2022,3.99452054794521,0,117,50.4,45.4,2018,https://twitter.com/SenJackyRosen,SenJackyRosen,https://twitter.com/RosenforNevada,RosenforNevada,08/02/1957,1,White,6,B.A.; Psychology; University of Minnesota; 1979,1,https://www.rosen.senate.gov/,https://bioguide.congress.gov/search/bio/R000608,,
"Rounds, Mike",85,South Dakota,SD,0,2,0.784008560585577,01/06/2015,12/31/2022,7.98904109589041,0,117,65.7,34.3,2020,https://twitter.com/SenatorRounds,SenatorRounds,N/A,N/A,10/24/1954,0,White,6,B.S.; Political Science; South Dakota State University; 1977,1,https://www.rounds.senate.gov/,https://bioguide.congress.gov/search/bio/R000605,,
"Rubio, Marco",86,Florida,FL,0,3,0.831181764071725,01/05/2011,12/31/2022,11.9945205479452,0,117,52,44.3,2016,https://twitter.com/senmarcorubio,senmarcorubio,https://twitter.com/marcorubio,marcorubio,05/28/1971,0,Hispanic,8,J.D.; University of Miami; 1996,2,https://www.rubio.senate.gov/,https://bioguide.congress.gov/search/bio/R000595,,
"Sanders, Bernard",87,Vermont,VT,2,1,0,01/04/2007,12/31/2022,16,0,117,67.4,27.5,2018,https://twitter.com/SenSanders,SenSanders,https://twitter.com/BernieSanders,BernieSanders,09/08/1941,0,White,6,B.A.; Political Science; University of Chicago; 1964,0,https://www.sanders.senate.gov/,https://bioguide.congress.gov/search/bio/S000033,,
"Sasse, Benjamin",88,Nebraska,NE,0,2,0.684229649213868,01/06/2015,12/31/2022,7.98904109589041,1,117,62.7,24.4,2020,https://twitter.com/sensasse,sensasse,https://twitter.com/BenSasse,BenSasse,02/22/1972,0,White,8,PhD in History; Yale University; 2004,5,N/A,https://bioguide.congress.gov/search/bio/S001197,,
"Schatz, Brian",89,Hawaii ,HI,1,3,0.213250458593456,12/27/2012,12/31/2022,10.0164383561644,0,117,73.6,22.2,2016,https://twitter.com/brianschatz,brianschatz,https://twitter.com/SenBrianSchatz,SenBrianSchatz,10/20/1972,0,White,6,B.A.; Philosophy; Pomona College; 1994,5,https://www.schatz.senate.gov/,https://bioguide.congress.gov/search/bio/S001194,,
"Schumer, Charles E.",90,New York,NY,1,3,0.239789022209428,01/06/1999,12/31/2022,24,0,117,70.4,27.4,2016,https://twitter.com/SenSchumer,SenSchumer,https://twitter.com/chuckschumer,chuckschumer,11/23/1950,0,White,8,J.D.; Harvard University; 1974,2,https://www.schumer.senate.gov/,https://bioguide.congress.gov/search/bio/S000148 ,,
"Scott, Rick",91,Florida,FL,0,1,1,01/08/2019,12/31/2022,3.98082191780822,0,117,50.1,49.9,2018,https://twitter.com/SenRickScott,SenRickScott,https://twitter.com/scottforflorida,scottforflorida,12/01/1952,0,White,8,J.D.; Southern Methodist University; 1978,2,https://www.rickscott.senate.gov/,https://bioguide.congress.gov/search/bio/S001217,,
"Scott, Tim",92,South Carolina,SC,0,3,0.781356077518849,01/03/2013,12/31/2022,9.9972602739726,0,117,60.6,37,2016,https://twitter.com/SenatorTimScott,SenatorTimScott,https://twitter.com/votetimscott,votetimscott,09/19/1965,0,African-American,6,B.S.; Political Science; Charleston Southern University; 1988 ,1,https://www.scott.senate.gov/,https://bioguide.congress.gov/search/bio/S001184,,
"Shaheen, Jeanne",93,New Hampshire,NH,1,2,0.2925665319541,01/06/2009,12/31/2022,13.9917808219178,0,117,56.6,41,2020,https://twitter.com/SenatorShaheen,SenatorShaheen,https://twitter.com/JeanneShaheen,JeanneShaheen,01/28/1947,1,White,7,M.S.S.; University of Mississippi; 1973,5,https://www.shaheen.senate.gov/,https://bioguide.congress.gov/search/bio/S001181,,
"Shelby, Richard",94,Alabama,AL,0,3,0.577739000839365,01/06/1987,12/31/2022,36.0082191780822,1,117,64.2,35.8,2016,https://twitter.com/SenShelby,SenShelby,N/A,N/A,05/06/1934,0,White,6,LL.B.; University of Alabama; 1963,2,N/A,https://bioguide.congress.gov/search/bio/S000320,,
"Sinema, Kyrsten",95,Arizona,AZ,2,1,0.500967034663567,01/03/2019,12/31/2022,3.99452054794521,0,117,50,47.6,2018,https://twitter.com/SenatorSinema,SenatorSinema,https://twitter.com/kyrstensinema,kyrstensinema,07/12/1976,1,White,8,PhD in Justice Studies; Arizona State University; 2012,2,https://www.sinema.senate.gov/,https://bioguide.congress.gov/search/bio/S001191,,
"Smith, Tina",96,Minnesota,MN,1,2,0.0756533259297989,01/03/2018,12/31/2022,4.99452054794521,0,117,48.8,43.5,2020,https://twitter.com/SenTinaSmith,SenTinaSmith,https://twitter.com/TinaSmithMN,TinaSmithMN,03/04/1958,1,White,7,M.B.A. Dartmouth College; 1984,1,https://www.smith.senate.gov/,https://bioguide.congress.gov/search/bio/S001203,,
"Stabenow, Debbie",97,Michigan,MI,1,1,0.221949395648287,01/03/2001,12/31/2022,22.0054794520548,0,117,52.3,45.8,2018,https://twitter.com/SenStabenow,SenStabenow,https://twitter.com/stabenow,stabenow,04/29/1950,1,White,7,M.S.W.; Michigan State University; 1975,5,https://www.stabenow.senate.gov/,https://bioguide.congress.gov/search/bio/S000770,,
"Sullivan, Dan",98,Alaska,AK,0,2,0.652100683642255,01/06/2015,12/31/2022,7.98904109589041,0,117,53.9,41.2,2020,https://twitter.com/SenDanSullivan,SenDanSullivan,N/A,N/A,11/13/1964,0,White,8,J.D.; Georgetown University; 1993,2,https://www.sullivan.senate.gov/,https://bioguide.congress.gov/search/bio/S001198,,
"Tester, Jon",99,Montana,MT,1,1,0.377646486433112,01/04/2007,12/31/2022,16,0,117,50.3,46.8,2018,https://twitter.com/SenatorTester,SenatorTester,https://twitter.com/jontester,jontester,08/21/1956,0,White,6,B.A.; Music; University of Providence; 1978,10,https://www.tester.senate.gov/,https://bioguide.congress.gov/search/bio/T000464 ,,
"Thune, John",100,South Dakota,SD,0,3,0.795060855902239,01/04/2005,12/31/2022,18,0,117,71.8,28.2,2016,https://twitter.com/SenJohnThune,SenJohnThune,https://twitter.com/johnthune,johnthune,01/07/1961,0,White,7,M.B.A.; University of South Dakota; 1984,1,https://www.thune.senate.gov/,https://bioguide.congress.gov/search/bio/T000250 ,,
"Tillis, Thom",101,North Carolina,NC,0,2,0.819146177750934,01/06/2015,12/31/2022,7.98904109589041,0,117,48.7,46.9,2020,https://twitter.com/SenThomTillis,SenThomTillis,https://twitter.com/ThomTillis,ThomTillis,08/30/1960,0,White,6,B.S.; Technology Management; University of Maryland; 1996,1,https://www.tillis.senate.gov/,https://bioguide.congress.gov/search/bio/T000476 ,,
"Toomey, Patrick",102,Pennsylvania,PA,0,3,0.607637714921737,01/05/2011,12/31/2022,11.9945205479452,1,117,48.9,47.2,2016,https://twitter.com/SenToomey,SenToomey,https://twitter.com/pattoomey,pattoomey,11/17/1961,0,White,6,A.B.; Government; Harvard College; 1984,1,N/A,https://bioguide.congress.gov/search/bio/T000461 ,,
"Tuberville, Tommy",103,Alabama,AL,0,2,0.808701355452043,01/03/2021,12/31/2022,1.99178082191781,0,117,60.1,39.7,2020,https://twitter.com/SenTuberville,SenTuberville,https://twitter.com/TTuberville,TTuberville,09/18/1954,0,White,6,"B.S., physical education, Southern Arkansas University, 1976",5,https://www.tuberville.senate.gov/,https://bioguide.congress.gov/search/bio/T000278 ,,
"Van Hollen, Chris",104,Maryland,MD,1,3,0.117646768842011,01/03/2017,12/31/2022,5.99452054794521,0,117,60.4,36.4,2016,https://twitter.com/ChrisVanHollen,ChrisVanHollen,N/A,N/A,01/10/1959,0,White,8,J.D.; Georgetown university; 1990,2,https://www.vanhollen.senate.gov/,https://bioguide.congress.gov/search/bio/V000128,,
"Warner, Mark R.",105,Virginia,VA,1,2,0.33022168507113,01/06/2009,12/31/2022,13.9917808219178,0,117,56,44,2020,https://twitter.com/SenatorWarner,SenatorWarner,https://twitter.com/MarkWarner,MarkWarner,12/15/1954,0,White,8,J.D.; Harvard Law School; 1980,1,https://www.warner.senate.gov/,https://bioguide.congress.gov/search/bio/W000805 ,,
"Warnock, Raphael G.",106,Georgia,GA,1,3,0.464158242867696,01/20/2021,12/31/2022,1.94520547945205,0,117,51,49,2020,https://twitter.com/SenatorWarnock,SenatorWarnock,https://twitter.com/ReverendWarnock,ReverendWarnock,07/23/1969,0,African-American,8,PhD in Philosophy; Union Theological Seminary; ,8,https://www.warnock.senate.gov/,https://bioguide.congress.gov/search/bio/W000790,,
"Warren, Elizabeth",107,Massachusetts,MA,1,1,0.0583875007437665,01/03/2013,12/31/2022,9.9972602739726,0,117,60.4,36.2,2018,https://twitter.com/SenWarren,SenWarren,https://twitter.com/ewarren,ewarren,06/22/1949,1,White,8,J.D.; Rutgers University; 1976,2,https://www.warren.senate.gov/,https://bioguide.congress.gov/search/bio/W000817 ,,
"Whitehouse, Sheldon",108,Rhode Island,RI,1,1,0.124737669119195,01/04/2007,12/31/2022,16,0,117,61.6,38.4,2018,https://twitter.com/SenWhitehouse,SenWhitehouse,N/A,N/A,10/20/1955,0,White,8,J.D.; University of Virginia; 1982,2,https://www.whitehouse.senate.gov/,https://bioguide.congress.gov/search/bio/W000802,,
"Wicker, Roger F.",109,Mississippi,MS,0,1,0.763788502839721,12/31/2007,12/31/2022,15.0109589041096,0,117,58.5,39.5,2018,https://twitter.com/SenatorWicker,SenatorWicker,https://twitter.com/RogerWicker,RogerWicker,07/05/1951,0,White,8,J.D.; University of Mississippi; 1975,2,https://www.wicker.senate.gov/,https://bioguide.congress.gov/search/bio/W000437,,
"Wyden, Ron",110,Oregon,OR,1,3,0.0591413132623803,02/05/1996,12/31/2022,26.9205479452055,0,117,56.7,33.6,2016,https://twitter.com/RonWyden,RonWyden,N/A,N/A,05/03/1949,0,White,8,J.D.; University of Oregon; 1974,2,https://www.wyden.senate.gov/,https://bioguide.congress.gov/search/bio/W000779,,
"Young, Todd",111,Indiana,IN,0,3,0.677696674158218,01/05/2011,12/31/2022,11.9945205479452,1,117,52.1,42.4,2016,https://twitter.com/SenToddYoung,SenToddYoung,https://twitter.com/ToddYoungIN,ToddYoungIN,08/24/1972,0,White,8,J.D.; Robert H. McKinney; 2006,2,https://www.young.senate.gov/,https://bioguide.congress.gov/search/bio/Y000064,,
"Jones, Gordon Douglas",5,Alabama,AL,1,2,0.632885678298333,01/03/2018,01/03/2021,3.0027397260274,1,116,49.9,48.4,2017,https://twitter.com/DougJones,DougJones,N/A,N/A,05/04/1954,0,White,8,"J.D.; Samford University, Cumberland School of Law; 1979",2,N/A,https://bioguide.congress.gov/search/bio/J000300/,special election to replace Jeff Sessions,
"Loeffler, Kelly",6,Georgia,GA,0,2,0.904293903291947,01/06/2020,01/20/2021,1.04109589041096,1,116,N/A,N/A,*,https://twitter.com/KLoeffler,KLoeffler,https://twitter.com/senatorloeffler,senatorloeffler,11/27/1970,1,White,7,M.B.A.; Internationla Finance and Marketing; DePaul University Chicago; 1999,1,N/A,https://bioguide.congress.gov/search/bio/L000594,Appointed in 2019 after the resignation of Johnny Isakson but lost the 2020 election,
"McSally, Martha",7,Arizona,AZ,0,2,*,01/03/2015,01/03/2019,1,1,116,N/A,N/A,*,https://twitter.com/MarthaMcSallyAZ,MarthaMcSallyAZ,https://twitter.com/marthamcsally,marthamcsally,03/22/1966,1,White,7,M.P.P.; John F. Kennedy School of Government,3,N/A,https://bioguide.congress.gov/search/bio/M001197,(left office Dec 2 2020),appointed in 2018 after death of John McCain but lot 2020 election
"Perdue, David",8,Georgia,GA,0,2,0.914979462126755,01/06/2015,01/03/2021,5.9972602739726,1,116,53,45.1,2014,https://twitter.com/DavidPerdueGA,DavidPerdueGA,https://twitter.com/sendavidperdue,sendavidperdue,12/10/1949,0,White,7,M.S.; Georgia Institute of Technology; 1976,1,N/A,https://bioguide.congress.gov/search/bio/P000612,,
"Roberts, Charles Patrick",9,Kansas,KS,0,2,0.822995787870405,01/07/1997,01/03/2021,24.0054794520548,1,116,53.3,42.5,2014,https://twitter.com/SenPatRoberts,SenPatRoberts,https://twitter.com/PatRoberts,PatRoberts,04/20/1936,0,White,6,"B.A.; Kansas State university, Manhattan; 1958",7,N/A,https://bioguide.congress.gov/search/bio/R000307,,
"Udall, Tom",10,New Mexico,NM,1,2,0.259828450248573,01/06/2009,01/03/2021,12,1,116,55.4,44.6,2014,https://twitter.com/SenatorTomUdall,SenatorTomUdall,https://twitter.com/tomudall,tomudall,05/18/1948,0,White,8,"J.D.; University of New Mexico School of Law, Albuquerque, N.M.; 1977",2,N/A,https://bioguide.congress.gov/search/bio/U000039,,
"Baldwin, Tammy",11,Wisconsin,WI,1,1,0.176999238019796,01/03/2013,12/31/2022,9.9972602739726,0,117,55.4,44.6,2018,https://twitter.com/SenatorBaldwin,SenatorBaldwin,https://twitter.com/tammybaldwin,tammybaldwin,02/11/1962,1,White,8,"J.D.; University of Wisconsin, Madison; 1989",2,https://www.baldwin.senate.gov/,https://bioguide.congress.gov/search/bio/B001230,,
"Barrasso, John",12,Wyoming,WY,0,1,0.817902617377421,06/22/2007,12/31/2022,15.5369863013699,0,117,67.1,30.1,2018,https://twitter.com/SenJohnBarrasso,SenJohnBarrasso,https://twitter.com/barrassoforwyo,barrassoforwyo,07/21/1952,0,White,7,M.D.; Georgetown University School of Medicine; 1978,6,https://www.barrasso.senate.gov/,https://bioguide.congress.gov/search/bio/B001261,,
"Bennet, Michael F.",13,Colorado,CO,1,3,0.248044568735702,01/21/2009,12/31/2022,13.9506849315069,0,117,49.1,45.4,2016,https://twitter.com/SenatorBennet,SenatorBennet,https://twitter.com/michaelbennet,michaelbennet,11/28/1964,0,White,8,J.D.; Yale Law School; 1993,2,https://www.bennet.senate.gov/,https://bioguide.congress.gov/search/bio/B001267,,
"Blackburn, Marsha",14,Tennessee,TN,0,1,0.93228239890635,01/03/2019,12/31/2022,3.99452054794521,0,117,54.7,43.9,2018,https://twitter.com/MarshaBlackburn,MarshaBlackburn,N/A,N/A,06/06/1952,1,White,6,"B.S.; Home Economics; Mississippi State University, Starkville; 1973",1,https://www.blackburn.senate.gov/,https://bioguide.congress.gov/search/bio/B001243,,
"Blumenthal, Richard",15,Connecticut,CT,1,3,0.0310655954121906,01/03/2010,12/31/2022,13,0,117,62.9,34.9,2016,https://twitter.com/SenBlumenthal,SenBlumenthal,N/A,N/A,02/13/1946,0,White,8,J.D.; Yale University; 1973,2,https://www.blumenthal.senate.gov/,https://bioguide.congress.gov/search/bio/B001277,,
"Blunt, Roy",16,Missouri,MO,0,3,0.584409139223541,01/03/2011,12/31/2022,12,1,117,49.4,46.2,2016,https://twitter.com/RoyBlunt,RoyBlunt,N/A,N/A,01/10/1950,0,White,7,"M.A.; Missouri State University ,Springfield; 1972",5,N/A,https://bioguide.congress.gov/search/bio/B000575,,
"Booker, Cory A.",17,New Jersey,NJ,1,2,0.0455802980872292,10/31/2013,12/31/2022,12,0,117,57.2,40.9,2020,https://twitter.com/senbooker,senbooker,https://twitter.com/CoryBooker,CoryBooker,04/27/1969,0,African-American; Asian-American,8,J.D.; Yale Law School; 1997,2,https://www.booker.senate.gov/,https://bioguide.congress.gov/search/bio/B001288,,
"Boozman, John",18,Arkansas,AR,0,3,0.768699282926499,01/05/2011,12/31/2022,11.9945205479452,0,117,59.8,36.2,2016,https://twitter.com/JohnBoozman,JohnBoozman,N/A,N/A,12/10/1950,0,White,6,Southern College of Optometry; 1977,6,https://www.boozman.senate.gov/,https://bioguide.congress.gov/search/bio/B001236,,
"Braun, Michael",19,Indiana,IN,0,1,0.98106874319906,01/03/2019,12/31/2022,3.99452054794521,0,117,50.9,45,2018,https://twitter.com/SenatorBraun,SenatorBraun,N/A,N/A,03/24/1954,0,White,7,M.B.A.; Harvard Business School; 1978,1,https://www.braun.senate.gov/,https://bioguide.congress.gov/search/bio/B001310,,
"Brown, Sherrod",20,Ohio,OH,1,1,0.0923940264109351,01/04/2007,12/31/2022,16,0,117,53.4,46.6,2018,https://twitter.com/SenSherrodBrown,SenSherrodBrown,https://twitter.com/SherrodBrown,SherrodBrown,11/09/1952,0,White,7,M.a.; Education; Ohio State University; 1981,5,https://www.brown.senate.gov/,https://bioguide.congress.gov/search/bio/B000944,,
"Burr, Richard",21,North Carolina,NC,0,3,0.605472891780936,01/03/2001,12/31/2022,22.0054794520548,1,117,51.1,45.3,2016,https://twitter.com/SenatorBurr,SenatorBurr,N/A,N/A,11/30/1955,0,White,6,B.A.; Communications; Wake Forest University; 1978,1,N/A,https://bioguide.congress.gov/search/bio/B001135,,
"Cantwell, Maria",22,Washington,WA,1,1,0.216591445478212,01/03/2001,12/31/2022,22.0054794520548,0,117,58.4,41.6,2018,https://twitter.com/SenatorCantwell,SenatorCantwell,N/A,N/A,10/13/1958,1,White,6,B.A.; Public Administration; Miami University of Ohio; 1980,1,https://www.cantwell.senate.gov/,https://bioguide.congress.gov/search/bio/C000127,,
"Capito, Shelley Moore",23,West Virginia,WV,0,2,0.61478303011512,01/06/2015,12/31/2022,7.98904109589041,0,117,70.3,27,2020,https://twitter.com/SenCapito,SenCapito,N/A,N/A,11/26/1953,1,White,7,M. Ed.; University of Virginia; 1976,5,https://www.capito.senate.gov/,https://bioguide.congress.gov/search/bio/C001047,,
"Cardin, Benjamin L.",24,Maryland,MD,1,1,0.1994990268606,01/04/2007,12/31/2022,16,0,117,64.9,30.3,2018,https://twitter.com/SenatorCardin,SenatorCardin,N/A,N/A,10/05/1943,0,White,8,J.D.; University of Maryland; 1967,2,https://www.cardin.senate.gov/,https://bioguide.congress.gov/search/bio/C000141,,
"Carper, Thomas R.",25,Delaware,DE,1,1,0.309479384969288,01/03/2001,12/31/2022,22.0054794520548,0,117,60,37.8,2018,https://twitter.com/SenatorCarper,SenatorCarper,N/A,N/A,01/23/1947,0,White,7,M.B.A.; University of Delaware; 1975,3,https://www.carper.senate.gov/,https://bioguide.congress.gov/search/bio/C000174,,
"Casey, Robert P., Jr.",26,Pennsylvania,PA,1,1,0.171897216341815,01/04/2007,12/31/2022,16,0,117,55.7,42.6,2018,https://twitter.com/SenBobCasey,SenBobCasey,https://twitter.com/Bob_Casey,Bob_Casey,04/13/1960,0,White,8,J.D.; Catholic University of America; 1988,2,https://www.casey.senate.gov/,https://bioguide.congress.gov/search/bio/C001070,,
"Cassidy, Bill",27,Louisiana,LA,0,2,0.682348710788942,01/06/2015,12/31/2022,7.98904109589041,0,117,59.3,19,2020,https://twitter.com/SenBillCassidy,SenBillCassidy,https://twitter.com/BillCassidy,BillCassidy,09/28/1957,0,White,7,M.D.; Louisiana State University; 1979,6,https://www.cassidy.senate.gov/,https://bioguide.congress.gov/search/bio/C001075,,
"Collins, Susan M.",28,Maine,ME,0,2,0.448622425849401,01/07/1997,12/31/2022,25.9972602739726,0,117,51,42.4,2020,https://twitter.com/SenatorCollins,SenatorCollins,N/A,N/A,12/07/1952,1,White,6,Bachelor in Government; St. Lawrence University; 1975,0,https://www.collins.senate.gov/,https://bioguide.congress.gov/search/bio/C001035,,
"Coons, Christopher A.",29,Delaware,DE,1,2,0.338422715351401,11/15/2010,12/31/2022,12.1342465753425,0,117,59.4,37.9,2020,https://twitter.com/ChrisCoons,ChrisCoons,N/A,N/A,09/09/1963,0,White,8,J.D.; Yale Law School; 1992,2,https://www.coons.senate.gov/,https://bioguide.congress.gov/search/bio/C001088,,
"Cornyn, John",30,Texas,TX,0,2,0.772226738391321,11/30/2002,12/31/2022,20.0986301369863,0,117,53.5,43.9,2020,https://twitter.com/JohnCornyn,JohnCornyn,N/A,N/A,02/02/1952,0,White,8,J.D.; St. Marys School of Law; 1977,2,https://www.cornyn.senate.gov/,https://bioguide.congress.gov/search/bio/C001056,,
"Cortez Masto, Catherine",31,Nevada,NV,1,3,0.236574567369409,01/03/2017,12/31/2022,5.99452054794521,0,117,47.1,44.7,2016,https://twitter.com/SenCortezMasto,SenCortezMasto,https://twitter.com/CortezMasto,CortezMasto,03/29/1964,1,Hispanic; White,8,J.D.; Gonzaga University School of Law; 1990,2,https://www.cortezmasto.senate.gov/,https://bioguide.congress.gov/search/bio/C001113,,
"Cotton, Tom",32,Arkansas,AR,0,2,0.876390364042756,01/06/2015,12/31/2022,7.98904109589041,0,117,66.5,33.5,2020,https://twitter.com/SenTomCotton,SenTomCotton,https://twitter.com/TomCottonAR,TomCottonAR,05/13/1977,0,White,8,J.D.; Harvard University; 2002,2,https://www.cotton.senate.gov/,https://bioguide.congress.gov/search/bio/C001095,,
"Cramer, Kevin",33,North Dakota,ND,0,1,0.910896298032277,01/03/2019,12/31/2022,3.99452054794521,0,117,55.5,44.5,2018,https://twitter.com/SenKevinCramer,SenKevinCramer,https://twitter.com/kevincramer,kevincramer,01/21/1961,0,White,7,M.A.; Management; University o fMary; 2003,0,https://www.cramer.senate.gov/,https://bioguide.congress.gov/search/bio/C001096,,
"Crapo, Michael",34,Idaho,ID,0,3,0.823331951918519,01/06/1999,12/31/2022,24,0,117,66.1,27.8,2016,https://twitter.com/MikeCrapo,MikeCrapo,N/A,N/A,05/20/1951,0,White,8,J.D.; Harvard University; 1977,2,https://www.crapo.senate.gov/,https://bioguide.congress.gov/search/bio/C000880,,
"Cruz, Ted",35,Texas,TX,0,1,0.944056385174951,01/03/2013,12/31/2022,9.9972602739726,0,117,50.9,48.3,2018,https://twitter.com/SenTedCruz,SenTedCruz,https://twitter.com/tedcruz,tedcruz,12/22/1970,0,Hispanic; White,8,J.D.; Harvard University; 1995,2,https://www.cruz.senate.gov/,https://bioguide.congress.gov/search/bio/C001098,,
"Daines, Steve",36,Montana,MT,0,2,0.859322244752884,01/06/2015,12/31/2022,7.98904109589041,0,117,55,45,2020,https://twitter.com/SteveDaines,SteveDaines,N/A,N/A,08/20/1962,0,White,6,B.S.; Chemical Engineering; Montana State University; 1984,1,https://www.daines.senate.gov/,https://bioguide.congress.gov/search/bio/D000618,,
"Duckworth, Tammy",37,Illinois,IL,1,3,0.0944404184553066,01/03/2017,12/31/2022,5.99452054794521,0,117,54.4,40.2,2016,https://twitter.com/SenDuckworth,SenDuckworth,https://twitter.com/tammyduckworth,tammyduckworth,03/12/1968,1,Asian; White,8,PhD in human services; Capella University School of Public Service Leadership; 2015,3,https://www.duckworth.senate.gov/,https://bioguide.congress.gov/search/bio/D000622,,
"Durbin, Richard J.",38,Illinois,IL,1,2,0.0855733771029607,01/07/1997,12/31/2022,25.9972602739726,0,117,54.9,38.9,2020,https://twitter.com/SenatorDurbin,SenatorDurbin,https://twitter.com/DickDurbin,DickDurbin,11/21/1944,0,White,8,J.D.; Georgetown University; 1969,2,https://www.durbin.senate.gov/,https://bioguide.congress.gov/search/bio/D000563,,
"Ernst, Joni",39,Iowa,IA,0,2,0.826265400967212,01/06/2015,12/31/2022,7.98904109589041,0,117,51.8,45.2,2020,https://twitter.com/SenJoniErnst,SenJoniErnst,https://twitter.com/joniernst,joniernst,07/01/1970,1,White,7,M.P.A.; Columbus State University; 1995,3,https://www.ernst.senate.gov/,https://bioguide.congress.gov/search/bio/E000295,,
"Feinstein, Dianne",40,California,CA,1,1,0.150865658191444,11/10/1992,12/31/2022,30.158904109589,0,117,54.2,45.8,2018,https://twitter.com/SenFeinstein,SenFeinstein,https://twitter.com/DianneFeinstein,DianneFeinstein,06/22/1933,1,White,6,B.A.; History; Stanford University; 1955,0,https://www.feinstein.senate.gov/public/,https://bioguide.congress.gov/search/bio/F000062,,
"Fischer, Debra",41,Nebraska,NE,0,1,0.688576408222131,01/03/2013,12/31/2022,9.9972602739726,0,117,57.7,38.6,2018,https://twitter.com/SenatorFischer,SenatorFischer,N/A,N/A,03/01/1951,1,White,6,B.S.; Education; University of Nebraska; 1988,0,https://www.fischer.senate.gov/,https://bioguide.congress.gov/search/bio/F000463,,
"Gillibrand, Kirsten E.",42,New York,NY,1,1,0.12072202063417,01/27/2009,12/31/2022,13.9342465753425,0,117,67,33,2018,https://twitter.com/SenGillibrand,SenGillibrand,https://twitter.com/gillibrandny,gillibrandny,12/09/1966,1,White,8,J.D.; University of California; 1991,2,https://www.gillibrand.senate.gov/,https://bioguide.congress.gov/search/bio/G000555,,
"Graham, Lindsey",43,South Carolina,SC,0,2,0.619070797359753,01/07/2003,12/31/2022,19.9945205479452,0,117,54.5,44.2,2020,https://twitter.com/LindseyGrahamSC,LindseyGrahamSC,https://twitter.com/grahamblog,grahamblog,07/09/1955,0,White,8,J.D.; University of South Carolina; 1981,2,https://www.lgraham.senate.gov/,https://bioguide.congress.gov/search/bio/G000359 ,,
"Grassley, Chuck",44,Iowa,IA,0,3,0.670073592619545,01/05/1981,12/31/2022,42.013698630137,0,117,60.2,35.7,2016,https://twitter.com/ChuckGrassley,ChuckGrassley,N/A,N/A,09/17/1933,0,White,7,M.A.; Political Science; University of Northern Iowa; 1956,0,https://www.grassley.senate.gov/,https://bioguide.congress.gov/search/bio/G000386,,
"Hagerty, Bill",45,Tennessee,TN,0,2,0.857410027434407,01/03/2021,12/31/2022,1.99178082191781,0,117,62.2,35.2,2020,https://twitter.com/SenatorHagerty,SenatorHagerty,https://twitter.com/billhagertytn,billhagertytn,08/14/1959,0,White,8,J.D.; Vanderbilt Law School; 1984,0,https://www.hagerty.senate.gov/,https://bioguide.congress.gov/search/bio/H000601,,
"Hassan, Margaret Wood",46,New Hampshire,NH,1,3,0.43611907238278,01/03/2017,12/31/2022,5.99452054794521,0,117,48,47.9,2016,https://twitter.com/SenatorHassan,SenatorHassan,https://twitter.com/Maggie_Hassan,Maggie_Hassan,02/27/1958,1,White,8,J.D.; Northeastern University School of law; 1985,11,https://www.hassan.senate.gov/,https://bioguide.congress.gov/search/bio/H001076,,
"Hawley, Josh",47,Missouri,MO,0,1,0.864366195602263,01/03/2019,12/31/2022,3.99452054794521,0,117,51.4,45.6,2018,https://twitter.com/HawleyMO,HawleyMO,N/A,N/A,12/31/1979,0,White,8,J.D.; Yale Law School; 2006,2,https://www.hawley.senate.gov/,https://bioguide.congress.gov/search/bio/H001089,,
"Heinrich, Martin",48,New Mexico,NM,1,1,0.2007037353465,01/03/2013,12/31/2022,9.9972602739726,0,117,54.1,30.5,2018,https://twitter.com/MartinHeinrich,MartinHeinrich,https://twitter.com/senatorheinrich,senatorheinrich,10/17/1971,0,White,6,B.S.; Mechanical Engineering; University of Missouri; 1995,12,https://www.heinrich.senate.gov/,https://bioguide.congress.gov/search/bio/H001046,,
"Hickenlooper, John W.",49,Colorado,CO,1,2,0.335030323955882,01/03/2021,12/31/2022,1.99178082191781,0,117,53.5,44.2,2020,https://twitter.com/SenatorHick,SenatorHick,https://twitter.com/hickenlooper,hickenlooper,02/07/1952,0,White,7,M.A.; Geology; Wesleyan University; 1980,0,https://www.hickenlooper.senate.gov/,https://bioguide.congress.gov/search/bio/H000273,,
"Hirono, Mazie K.",50,Hawaii,HI,1,1,0.0715447123166643,01/03/2013,12/31/2022,9.9972602739726,0,117,71.2,28.8,2018,https://twitter.com/maziehirono,maziehirono,https://twitter.com/mazieforhawaii,mazieforhawaii,11/03/1947,1,Asian,8,J.D.; Georgetown University; 1978,0,https://www.hirono.senate.gov/,https://bioguide.congress.gov/search/bio/H001042,,
"Hoeven, John",51,North Dakota,ND,0,3,0.815683863264003,01/05/2011,12/31/2022,11.9945205479452,0,117,78.6,17,2016,https://twitter.com/SenJohnHoeven,SenJohnHoeven,N/A,N/A,03/13/1957,0,White,7,M.B.A.; Northwestern University; 1981,12,https://www.hoeven.senate.gov/,https://bioguide.congress.gov/search/bio/H001061,,
"Hyde-Smith, Cindy",52,Mississippi,MS,0,2,0.868059764299163,04/09/2018,12/31/2022,4.73150684931507,0,117,54.1,44.1,2020,https://twitter.com/SenHydeSmith,SenHydeSmith,https://twitter.com/cindyhydesmith,cindyhydesmith,05/10/1959,1,White,6,"B.A.; Criminal justice, political science; University of Southern Mississippi; 1981",0,https://www.hydesmith.senate.gov/,https://bioguide.congress.gov/search/bio/H001079 ,,
"Inhofe, James",53,Oklahoma,OK,0,2,0.880238318204784,11/17/1994,12/31/2022,28.1397260273973,1,117,62.9,32.8,2020,https://twitter.com/JimInhofe,JimInhofe,N/A,N/A,11/17/1934,0,White,6,B.A.; Economics; University of Tulsa; 1973,0,N/A,https://bioguide.congress.gov/search/bio/I000024 ,,
"Johnson, Ron",54,Wisconsin,WI,0,3,0.743401705863958,01/05/2011,12/31/2022,11.9945205479452,0,117,50.2,46.8,2016,https://twitter.com/SenRonJohnson,SenRonJohnson,https://twitter.com/ronjohnsonwi,ronjohnsonwi,04/08/1955,0,White,6,B.S.; Business and Accounting; University of Minnesota; 1977,4,https://www.ronjohnson.senate.gov/,https://bioguide.congress.gov/search/bio/J000293,,
"Kaine, Tim",55,Virginia,VA,1,1,0.203600708089391,01/03/2013,12/31/2022,9.9972602739726,0,117,57.1,41.1,2018,https://twitter.com/timkaine,timkaine,N/A,N/A,02/26/1958,0,White,8,J.D.; Harvard University; 1983,11,https://www.kaine.senate.gov/,https://bioguide.congress.gov/search/bio/K000384,,
"Kelly, Mark",56,Arizona,AZ,1,3,0.399793347847799,12/02/2020,12/31/2022,2.07945205479452,0,117,51.2,48.8,2020,https://twitter.com/SenMarkKelly,SenMarkKelly,https://twitter.com/CaptMarkKelly,CaptMarkKelly,02/21/1964,0,White,7,M.S.; Aeronautical Engineering; U.S. Naval Postgraduate School,3,https://www.kelly.senate.gov/,https://bioguide.congress.gov/search/bio/K000377,,
"Kennedy, John Neely",57,Louisiana,LA,0,3,0.785684351248518,01/03/2017,12/31/2022,5.99452054794521,0,117,60.7,39.3,2016,https://twitter.com/SenJohnKennedy,SenJohnKennedy,https://twitter.com/JohnKennedyLA,JohnKennedyLA,11/21/1951,0,White,8,J.D.; University of Virginia School of LAw; 1977,11,https://www.kennedy.senate.gov/,https://bioguide.congress.gov/search/bio/K000393,,
"King, Angus S., Jr.",58,Maine,ME,2,1,0.346033257048853,01/03/2013,12/31/2022,9.9972602739726,0,117,54.3,35.2,2018,https://twitter.com/SenAngusKing,SenAngusKing,N/A,N/A,03/31/1944,0,White,8,J.D.; University of Virginia; 1969,2,https://www.king.senate.gov/,https://bioguide.congress.gov/search/bio/K000383 ,,
"Klobuchar, Amy",59,Minnesota,MN,1,1,0.130504324943533,01/04/2007,12/31/2022,16,0,117,60.3,36.2,2018,https://twitter.com/SenAmyKlobuchar,SenAmyKlobuchar,https://twitter.com/amyklobuchar,amyklobuchar,05/25/1960,1,White,8,"J.D.; University of Chicago, 1985",2,https://www.klobuchar.senate.gov/,https://bioguide.congress.gov/search/bio/K000367 ,,
"Lankford, James",60,Oklahoma,OK,0,3,0.89992933687588,01/03/2015,12/31/2022,7.9972602739726,0,117,67.7,24.6,2016,https://twitter.com/SenatorLankford,SenatorLankford,https://twitter.com/jameslankford,jameslankford,03/04/1968,0,White,7,M.Div.; Southwestern Theological Baptist Seminary; 1994,5,https://www.lankford.senate.gov/,https://bioguide.congress.gov/search/bio/L000575,,
"Leahy, Patrick",61,Vermont,VT,1,3,0.144121081911654,01/14/1975,12/31/2022,47.9945205479452,1,117,61.3,33,2016,https://twitter.com/SenatorLeahy,SenatorLeahy,N/A,N/A,03/31/1940,0,White,8,J.D.; Georgetown University; 1964,2,N/A,https://bioguide.congress.gov/search/bio/L000174,,
"Lee, Mike",62,Utah,UT,0,3,0.753748787807473,01/05/2011,12/31/2022,11.9945205479452,0,117,68,27.4,2016,https://twitter.com/SenMikeLee,SenMikeLee,https://twitter.com/BasedMikeLee,BasedMikeLee,06/04/1971,0,White,8,J.D.; Brigham Young university; 1997,2,https://www.lee.senate.gov/,https://bioguide.congress.gov/search/bio/L000577,,
"Luján, Ben Ray",63,New Mexico,NM,1,2,0.174860888138848,01/03/2021,12/31/2022,1.99178082191781,0,117,51.7,45.6,2020,https://twitter.com/SenatorLujan,SenatorLujan,https://twitter.com/benraylujan,benraylujan,06/07/1972,0,Hispanic,6,B.B.A.; New Mexico Highlands University; 2007,0,https://www.lujan.senate.gov/,https://bioguide.congress.gov/search/bio/L000570 ,,
"Lummis, Cynthia M.",64,Wyoming,WY,0,2,0.893292958108508,01/03/2021,12/31/2022,1.99178082191781,0,117,73.1,26.9,2020,https://twitter.com/SenLummis,SenLummis,https://twitter.com/CynthiaMLummis,CynthiaMLummis,09/10/1954,1,White,8,"J.D.; University of Wyoming College of Law, Laramie, Wyo.; 1985",11,https://www.lummis.senate.gov/,https://bioguide.congress.gov/search/bio/L000571 ,,
"Manchin, Joe, III",65,West Virginia,WV,1,1,0.446686774398077,11/15/2010,12/31/2022,12.1342465753425,0,117,49.6,46.3,2018,https://twitter.com/Sen_JoeManchin,Sen_JoeManchin,https://twitter.com/JoeManchinWV,JoeManchinWV,08/24/1947,0,White,6,B.A.; Business Administration; West Virginia University; 1970,12,https://www.manchin.senate.gov/,https://bioguide.congress.gov/search/bio/M001183 ,,
"Markey, Edward J.",66,Massachusetts,MA,1,2,0.0139659683705929,07/16/2013,12/31/2022,9.46575342465753,0,117,66.2,33,2020,https://twitter.com/SenMarkey,SenMarkey,https://twitter.com/edmarkey,edmarkey,07/11/1946,0,White,8,J.D.; Boston College Law School; 1972,11,https://www.markey.senate.gov/,https://bioguide.congress.gov/search/bio/M000133,,
"Marshall, Roger",67,Kansas,KS,0,2,0.882124792228652,01/03/2021,12/31/2022,1.99178082191781,0,117,53.2,41.8,2020,https://twitter.com/SenatorMarshall,SenatorMarshall,https://twitter.com/RogerMarshallMD,RogerMarshallMD,08/09/1960,0,White,7,M.D.; University of Kansas School of Medicine; 1987,6,https://www.marshall.senate.gov/,https://bioguide.congress.gov/search/bio/M001198,,
"McConnell, Mitch",68,Kentucky,KY,0,2,0.599687533584357,01/03/1985,12/31/2022,38.0164383561644,0,117,57.8,38.2,2020,https://twitter.com/LeaderMcConnell,LeaderMcConnell,N/A,N/A,02/20/1942,0,White,8,J.D.; Kentucky Law School; 1967,11,https://www.mcconnell.senate.gov/,https://bioguide.congress.gov/search/bio/M000355,,
"Menendez, Robert",69,New Jersey,NJ,1,1,0.191515157461704,01/18/2006,12/31/2022,16.9616438356164,0,117,54,42.8,2018,https://twitter.com/SenatorMenendez,SenatorMenendez,N/A,N/A,01/01/1954,0,Hispanic,8,J.D.; Rutgers university of Law; 1979,11,https://www.menendez.senate.gov/,https://bioguide.congress.gov/search/bio/M000639,,
"Merkley, Jeff",70,Oregon,OR,1,2,0.0355414098997263,01/06/2009,12/31/2022,13.9917808219178,0,117,56.9,39.3,2020,https://twitter.com/SenJeffMerkley,SenJeffMerkley,https://twitter.com/jeffmerkley,jeffmerkley,10/24/1956,0,White,7,M.P.A.; Princeton University; 1982,0,https://www.merkley.senate.gov/,https://bioguide.congress.gov/search/bio/M001176,,
"Moran, Jerry",71,Kansas,KS,0,3,0.716270292467902,01/05/2011,12/31/2022,11.9945205479452,0,117,62.4,32.1,2016,https://twitter.com/JerryMoran,JerryMoran,N/A,N/A,05/29/1954,0,White,8,J.D.; Kansas University School of Law; 1981,11,https://www.moran.senate.gov/public/,https://bioguide.congress.gov/search/bio/M000934 ,,
"Murkowski, Lisa",72,Alaska,AK,0,3,0.473296745648617,12/20/2002,12/31/2022,20.0438356164384,0,117,44.3,29.5,2016,https://twitter.com/lisamurkowski,lisamurkowski,https://twitter.com/lisaforsenate,lisaforsenate,05/22/1957,1,White,8,J.D.; Willamette College of Law; 1985,2,https://www.murkowski.senate.gov/,https://bioguide.congress.gov/search/bio/M001153,,
"Murphy, Christopher",73,Connecticut,CT,1,1,0.152635018959264,01/03/2013,12/31/2022,9.9972602739726,0,117,59.5,39.4,2018,https://twitter.com/ChrisMurphyCT,ChrisMurphyCT,N/A,N/A,08/03/1973,0,White,8,J.D.; University of Connecticut; 2002,11,https://www.murphy.senate.gov/,https://bioguide.congress.gov/search/bio/M001169,,
"Murray, Patty",74,Washington,WA,1,3,0.142703588817088,01/05/1993,12/31/2022,30.0054794520548,0,117,59.1,40.9,2016,https://twitter.com/PattyMurray,PattyMurray,https://twitter.com/murraycampaign,murraycampaign,10/11/1950,1,White,6,B.A.; Physical Education; Washington State University; 1972,5,https://www.murray.senate.gov/,https://bioguide.congress.gov/search/bio/M001111,,
"Ossoff, Jon",75,Georgia,GA,1,2,0.303405364928085,01/20/2021,12/31/2022,1.94520547945205,0,117,50.6,49.4,2020,https://twitter.com/SenOssoff,SenOssoff,https://twitter.com/ossoff,ossoff,02/16/1987,0,White,7,M.S.; International Politicla Economy; London School of Economics; 2013,7,https://www.ossoff.senate.gov/,https://bioguide.congress.gov/search/bio/O000174,,
"Padilla, Alex",76,California,CA,1,3,0.0200324383981554,01/20/2021,12/31/2022,1.94520547945205,0,117,N/A,N/A,*,https://twitter.com/SenAlexPadilla,SenAlexPadilla,https://twitter.com/AlexPadilla4CA,AlexPadilla4CA,03/22/1973,0,Hispanic,6,B.S.; Mechanical Engineering; MIT; 1994,9,https://www.padilla.senate.gov/,https://bioguide.congress.gov/search/bio/P000145,appointed in 2020 to replace Kamala Harris ,
"Paul, Rand",77,Kentucky,KY,0,3,0.684883322748808,01/05/2011,12/31/2022,11.9945205479452,0,117,57.3,42.7,2016,https://twitter.com/senrandpaul,senrandpaul,https://twitter.com/RandPaul,RandPaul,01/07/1963,0,White,7,M.D.; Duke University; 1988,6,https://www.paul.senate.gov/,https://bioguide.congress.gov/search/bio/P000603,,
"Peters, Gary C.",78,Michigan,MI,1,2,0.355796587683312,01/06/2015,12/31/2022,7.98904109589041,0,117,49.9,48.2,2020,https://twitter.com/SenGaryPeters,SenGaryPeters,https://twitter.com/garypeters,garypeters,12/01/1958,0,White,8,J.D.; Wayne State University; 1989,2,https://www.peters.senate.gov/,https://bioguide.congress.gov/search/bio/P000595,,
"Portman, Robert",79,Ohio,OH,0,3,0.548120690430407,01/05/2011,12/31/2022,11.9945205479452,1,117,58.3,36.9,2016,https://twitter.com/senrobportman,senrobportman,N/A,N/A,12/19/1955,0,White,8,J.D.; University of Michigan; 1985,2,N/A,https://bioguide.congress.gov/search/bio/P000449,,
"Reed, John F.",80,Rhode Island,RI,1,2,0.145861826443275,01/07/1997,12/31/2022,25.9972602739726,0,117,66.6,33.4,2020,https://twitter.com/SenJackReed,SenJackReed,N/A,N/A,11/12/1949,0,White,8,J.D.; Harvard University; 1982,2,https://www.reed.senate.gov/,https://bioguide.congress.gov/search/bio/R000122,,
"Risch, James E.",81,Idaho,ID,0,2,0.82910906209038,01/06/2009,12/31/2022,13.9917808219178,0,117,62.6,33.2,2020,https://twitter.com/SenatorRisch,SenatorRisch,N/A,N/A,05/03/1943,0,White,8,J.D.; University of Idaho; 1968,2,https://www.risch.senate.gov/,https://bioguide.congress.gov/search/bio/R000584,,
"Romney, Mitt",82,Utah,UT,0,1,0.596688837978771,01/03/2019,12/31/2022,3.99452054794521,0,117,62.6,30.9,2018,https://twitter.com/SenatorRomney,SenatorRomney,https://twitter.com/mittromney,mittromney,03/12/1947,0,White,7,M.B.A.; Harvard Business School; 1975,1,https://www.romney.senate.gov/,https://bioguide.congress.gov/search/bio/R000615,,
"Rosen, Jacky",83,Nevada,NV,1,1,0.308548351377894,01/03/2019,12/31/2022,3.99452054794521,0,117,50.4,45.4,2018,https://twitter.com/SenJackyRosen,SenJackyRosen,https://twitter.com/RosenforNevada,RosenforNevada,08/02/1957,1,White,6,B.A.; Psychology; University of Minnesota; 1979,1,https://www.rosen.senate.gov/,https://bioguide.congress.gov/search/bio/R000608,,
"Rounds, Mike",84,South Dakota,SD,0,2,0.784008560585577,01/06/2015,12/31/2022,7.98904109589041,0,117,65.7,34.3,2020,https://twitter.com/SenatorRounds,SenatorRounds,N/A,N/A,10/24/1954,0,White,6,B.S.; Political Science; South Dakota State University; 1977,1,https://www.rounds.senate.gov/,https://bioguide.congress.gov/search/bio/R000605,,
"Rubio, Marco",85,Florida,FL,0,3,0.831181764071725,01/05/2011,12/31/2022,11.9945205479452,0,117,52,44.3,2016,https://twitter.com/senmarcorubio,senmarcorubio,https://twitter.com/marcorubio,marcorubio,05/28/1971,0,Hispanic,8,J.D.; University of Miami; 1996,2,https://www.rubio.senate.gov/,https://bioguide.congress.gov/search/bio/R000595,,
"Sanders, Bernard",86,Vermont,VT,2,1,0,01/04/2007,12/31/2022,16,0,117,67.4,27.5,2018,https://twitter.com/SenSanders,SenSanders,https://twitter.com/BernieSanders,BernieSanders,09/08/1941,0,White,6,B.A.; Political Science; University of Chicago; 1964,0,https://www.sanders.senate.gov/,https://bioguide.congress.gov/search/bio/S000033,,
"Sasse, Benjamin",87,Nebraska,NE,0,2,0.684229649213868,01/06/2015,12/31/2022,7.98904109589041,1,117,62.7,24.4,2020,https://twitter.com/sensasse,sensasse,https://twitter.com/BenSasse,BenSasse,02/22/1972,0,White,8,PhD in History; Yale University; 2004,5,N/A,https://bioguide.congress.gov/search/bio/S001197,,
"Schatz, Brian",88,Hawaii ,HI,1,3,0.213250458593456,12/27/2012,12/31/2022,10.0164383561644,0,117,73.6,22.2,2016,https://twitter.com/brianschatz,brianschatz,https://twitter.com/SenBrianSchatz,SenBrianSchatz,10/20/1972,0,White,6,B.A.; Philosophy; Pomona College; 1994,5,https://www.schatz.senate.gov/,https://bioguide.congress.gov/search/bio/S001194,,
"Schumer, Charles E.",89,New York,NY,1,3,0.239789022209428,01/06/1999,12/31/2022,24,0,117,70.4,27.4,2016,https://twitter.com/SenSchumer,SenSchumer,https://twitter.com/chuckschumer,chuckschumer,11/23/1950,0,White,8,J.D.; Harvard University; 1974,2,https://www.schumer.senate.gov/,https://bioguide.congress.gov/search/bio/S000148 ,,
"Scott, Rick",90,Florida,FL,0,1,1,01/08/2019,12/31/2022,3.98082191780822,0,117,50.1,49.9,2018,https://twitter.com/SenRickScott,SenRickScott,https://twitter.com/scottforflorida,scottforflorida,12/01/1952,0,White,8,J.D.; Southern Methodist University; 1978,2,https://www.rickscott.senate.gov/,https://bioguide.congress.gov/search/bio/S001217,,
"Scott, Tim",91,South Carolina,SC,0,3,0.781356077518849,01/03/2013,12/31/2022,9.9972602739726,0,117,60.6,37,2016,https://twitter.com/SenatorTimScott,SenatorTimScott,https://twitter.com/votetimscott,votetimscott,09/19/1965,0,African-American,6,B.S.; Political Science; Charleston Southern University; 1988 ,1,https://www.scott.senate.gov/,https://bioguide.congress.gov/search/bio/S001184,,
"Shaheen, Jeanne",92,New Hampshire,NH,1,2,0.2925665319541,01/06/2009,12/31/2022,13.9917808219178,0,117,56.6,41,2020,https://twitter.com/SenatorShaheen,SenatorShaheen,https://twitter.com/JeanneShaheen,JeanneShaheen,01/28/1947,1,White,7,M.S.S.; University of Mississippi; 1973,5,https://www.shaheen.senate.gov/,https://bioguide.congress.gov/search/bio/S001181,,
"Shelby, Richard",93,Alabama,AL,0,3,0.577739000839365,01/06/1987,12/31/2022,36.0082191780822,1,117,64.2,35.8,2016,https://twitter.com/SenShelby,SenShelby,N/A,N/A,05/06/1934,0,White,6,LL.B.; University of Alabama; 1963,2,N/A,https://bioguide.congress.gov/search/bio/S000320,,
"Sinema, Kyrsten",94,Arizona,AZ,2,1,0.500967034663567,01/03/2019,12/31/2022,3.99452054794521,0,117,50,47.6,2018,https://twitter.com/SenatorSinema,SenatorSinema,https://twitter.com/kyrstensinema,kyrstensinema,07/12/1976,1,White,8,PhD in Justice Studies; Arizona State University; 2012,2,https://www.sinema.senate.gov/,https://bioguide.congress.gov/search/bio/S001191,,
"Smith, Tina",95,Minnesota,MN,1,2,0.0756533259297989,01/03/2018,12/31/2022,4.99452054794521,0,117,48.8,43.5,2020,https://twitter.com/SenTinaSmith,SenTinaSmith,https://twitter.com/TinaSmithMN,TinaSmithMN,03/04/1958,1,White,7,M.B.A. Dartmouth College; 1984,1,https://www.smith.senate.gov/,https://bioguide.congress.gov/search/bio/S001203,,
"Stabenow, Debbie",96,Michigan,MI,1,1,0.221949395648287,01/03/2001,12/31/2022,22.0054794520548,0,117,52.3,45.8,2018,https://twitter.com/SenStabenow,SenStabenow,https://twitter.com/stabenow,stabenow,04/29/1950,1,White,7,M.S.W.; Michigan State University; 1975,5,https://www.stabenow.senate.gov/,https://bioguide.congress.gov/search/bio/S000770,,
"Sullivan, Dan",97,Alaska,AK,0,2,0.652100683642255,01/06/2015,12/31/2022,7.98904109589041,0,117,53.9,41.2,2020,https://twitter.com/SenDanSullivan,SenDanSullivan,N/A,N/A,11/13/1964,0,White,8,J.D.; Georgetown University; 1993,2,https://www.sullivan.senate.gov/,https://bioguide.congress.gov/search/bio/S001198,,
"Tester, Jon",98,Montana,MT,1,1,0.377646486433112,01/04/2007,12/31/2022,16,0,117,50.3,46.8,2018,https://twitter.com/SenatorTester,SenatorTester,https://twitter.com/jontester,jontester,08/21/1956,0,White,6,B.A.; Music; University of Providence; 1978,10,https://www.tester.senate.gov/,https://bioguide.congress.gov/search/bio/T000464 ,,
"Thune, John",99,South Dakota,SD,0,3,0.795060855902239,01/04/2005,12/31/2022,18,0,117,71.8,28.2,2016,https://twitter.com/SenJohnThune,SenJohnThune,https://twitter.com/johnthune,johnthune,01/07/1961,0,White,7,M.B.A.; University of South Dakota; 1984,1,https://www.thune.senate.gov/,https://bioguide.congress.gov/search/bio/T000250 ,,
"Tillis, Thom",100,North Carolina,NC,0,2,0.819146177750934,01/06/2015,12/31/2022,7.98904109589041,0,117,48.7,46.9,2020,https://twitter.com/SenThomTillis,SenThomTillis,https://twitter.com/ThomTillis,ThomTillis,08/30/1960,0,White,6,B.S.; Technology Management; University of Maryland; 1996,1,https://www.tillis.senate.gov/,https://bioguide.congress.gov/search/bio/T000476 ,,
"Toomey, Patrick",101,Pennsylvania,PA,0,3,0.607637714921737,01/05/2011,12/31/2022,11.9945205479452,1,117,48.9,47.2,2016,https://twitter.com/SenToomey,SenToomey,https://twitter.com/pattoomey,pattoomey,11/17/1961,0,White,6,A.B.; Government; Harvard College; 1984,1,N/A,https://bioguide.congress.gov/search/bio/T000461 ,,
"Tuberville, Tommy",102,Alabama,AL,0,2,0.808701355452043,01/03/2021,12/31/2022,1.99178082191781,0,117,60.1,39.7,2020,https://twitter.com/SenTuberville,SenTuberville,https://twitter.com/TTuberville,TTuberville,09/18/1954,0,White,6,"B.S., physical education, Southern Arkansas University, 1976",5,https://www.tuberville.senate.gov/,https://bioguide.congress.gov/search/bio/T000278 ,,
"Van Hollen, Chris",103,Maryland,MD,1,3,0.117646768842011,01/03/2017,12/31/2022,5.99452054794521,0,117,60.4,36.4,2016,https://twitter.com/ChrisVanHollen,ChrisVanHollen,N/A,N/A,01/10/1959,0,White,8,J.D.; Georgetown university; 1990,2,https://www.vanhollen.senate.gov/,https://bioguide.congress.gov/search/bio/V000128,,
"Warner, Mark R.",104,Virginia,VA,1,2,0.33022168507113,01/06/2009,12/31/2022,13.9917808219178,0,117,56,44,2020,https://twitter.com/SenatorWarner,SenatorWarner,https://twitter.com/MarkWarner,MarkWarner,12/15/1954,0,White,8,J.D.; Harvard Law School; 1980,1,https://www.warner.senate.gov/,https://bioguide.congress.gov/search/bio/W000805 ,,
"Warnock, Raphael G.",105,Georgia,GA,1,3,0.464158242867696,01/20/2021,12/31/2022,1.94520547945205,0,117,51,49,2020,https://twitter.com/SenatorWarnock,SenatorWarnock,https://twitter.com/ReverendWarnock,ReverendWarnock,07/23/1969,0,African-American,8,PhD in Philosophy; Union Theological Seminary; ,8,https://www.warnock.senate.gov/,https://bioguide.congress.gov/search/bio/W000790,,
"Warren, Elizabeth",106,Massachusetts,MA,1,1,0.0583875007437665,01/03/2013,12/31/2022,9.9972602739726,0,117,60.4,36.2,2018,https://twitter.com/SenWarren,SenWarren,https://twitter.com/ewarren,ewarren,06/22/1949,1,White,8,J.D.; Rutgers University; 1976,2,https://www.warren.senate.gov/,https://bioguide.congress.gov/search/bio/W000817 ,,
"Whitehouse, Sheldon",107,Rhode Island,RI,1,1,0.124737669119195,01/04/2007,12/31/2022,16,0,117,61.6,38.4,2018,https://twitter.com/SenWhitehouse,SenWhitehouse,N/A,N/A,10/20/1955,0,White,8,J.D.; University of Virginia; 1982,2,https://www.whitehouse.senate.gov/,https://bioguide.congress.gov/search/bio/W000802,,
"Wicker, Roger F.",108,Mississippi,MS,0,1,0.763788502839721,12/31/2007,12/31/2022,15.0109589041096,0,117,58.5,39.5,2018,https://twitter.com/SenatorWicker,SenatorWicker,https://twitter.com/RogerWicker,RogerWicker,07/05/1951,0,White,8,J.D.; University of Mississippi; 1975,2,https://www.wicker.senate.gov/,https://bioguide.congress.gov/search/bio/W000437,,
"Wyden, Ron",109,Oregon,OR,1,3,0.0591413132623803,02/05/1996,12/31/2022,26.9205479452055,0,117,56.7,33.6,2016,https://twitter.com/RonWyden,RonWyden,N/A,N/A,05/03/1949,0,White,8,J.D.; University of Oregon; 1974,2,https://www.wyden.senate.gov/,https://bioguide.congress.gov/search/bio/W000779,,
"Young, Todd",110,Indiana,IN,0,3,0.677696674158218,01/05/2011,12/31/2022,11.9945205479452,1,117,52.1,42.4,2016,https://twitter.com/SenToddYoung,SenToddYoung,https://twitter.com/ToddYoungIN,ToddYoungIN,08/24/1972,0,White,8,J.D.; Robert H. McKinney; 2006,2,https://www.young.senate.gov/,https://bioguide.congress.gov/search/bio/Y000064,,
name,id,state,state_short,party,class,ideology,start_serving,end_serving,time_in_office,not_in_office,last_congress,vote_share,next_closest_share,election_year,twitter_url,twitter_handle,alt_account,alt_handle,date_of_birth,female,ethnicity,edu_level,edu_information,occup_level,website_url,bioguide_link,Comments_1,Comments_2
"Alexander, Andrew L., Jr.",1,Tennessee,TN,0,2,0.681815808318192,01/07/2003,01/03/2021,18.0027397260274,1,116,61.9,31.8,2014,https://twitter.com/SenAlexander,SenAlexander,https://twitter.com/LamarAlexander,LamarAlexander,07/03/1940,0,White,8,J.D.; New York University; 1965,2,N/A,https://bioguide.congress.gov/search/bio/A000360,,
"Enzi, Mike",2,Wyoming,WY,0,2,0.719285383539398,01/03/1997,01/03/2021,24,1,116,72.3,17.6,2014,https://twitter.com/senatorenzi,SenatorEnzi,N/A,N/A,02/01/1944,0,White,7,M.B.A.; Retail Marketing; Denver University; 1968,4,N/A,https://bioguide.congress.gov/search/bio/E000285,,
"Gardner, Cory",3,Colorado,CO,0,2,0.719285383539398,01/06/2015,01/03/2021,5.9972602739726,1,116,48.5,46,2014,https://twitter.com/CoryGardner,CoryGardner,https://twitter.com/corygardner,corygardner,08/22/1974,0,White,8,"J.D.; University of Colorado, Boulder; 2001",2,N/A,https://bioguide.congress.gov/search/bio/G000562,,
"Harris, Kamala",4,California,CA,1,3,0.0213759569468058,01/03/2017,01/18/2021,4.04383561643836,1,116,62.4,37.6,2016,https://twitter.com/VP,VP,https://twitter.com/KamalaHarris,KamalaHarris,10/20/1964,1,African-American; Asian-American,8,J.D.; University of California; 1989,2,N/A,https://bioguide.congress.gov/search/bio/H001075,(became VP on Jan 20 2021),
"Isakson, John",5,Georgia,GA,0,3,*,01/03/2005,12/31/2019,14,1,116,55,40.8,2016,https://twitter.com/SenatorIsakson,SenatorIsakson,N/A,N/A,12/28/1944,0,White,6,"University of Georgia, Athens; 1966",1,N/A,https://bioguide.congress.gov/search/bio/I000055,(died in 2019),
"Jones, Gordon Douglas",6,Alabama,AL,1,2,0.632885678298333,01/03/2018,01/03/2021,3.0027397260274,1,116,49.9,48.4,2017,https://twitter.com/DougJones,DougJones,N/A,N/A,05/04/1954,0,White,8,"J.D.; Samford University, Cumberland School of Law; 1979",2,N/A,https://bioguide.congress.gov/search/bio/J000300/,special election to replace Jeff Sessions,
"Loeffler, Kelly",7,Georgia,GA,0,2,0.904293903291947,01/06/2020,01/20/2021,1.04109589041096,1,116,N/A,N/A,*,https://twitter.com/KLoeffler,KLoeffler,https://twitter.com/senatorloeffler,senatorloeffler,11/27/1970,1,White,7,M.B.A.; International Finance and Marketing; DePaul University Chicago; 1999,1,N/A,https://bioguide.congress.gov/search/bio/L000594,Appointed in 2019 after the resignation of Johnny Isakson but lost the 2020 election,
"McSally, Martha",8,Arizona,AZ,0,2,*,01/03/2015,01/03/2019,1,1,116,N/A,N/A,*,https://twitter.com/MarthaMcSallyAZ,MarthaMcSallyAZ,https://twitter.com/marthamcsally,marthamcsally,03/22/1966,1,White,7,M.P.P.; John F. Kennedy School of Government,3,N/A,https://bioguide.congress.gov/search/bio/M001197,(left office Dec 2 2020),appointed in 2018 after death of John McCain but lost the 2020 election
"Perdue, David",9,Georgia,GA,0,2,0.914979462126755,01/06/2015,01/03/2021,5.9972602739726,1,116,53,45.1,2014,https://twitter.com/DavidPerdueGA,DavidPerdueGA,https://twitter.com/sendavidperdue,sendavidperdue,12/10/1949,0,White,7,M.S.; Georgia Institute of Technology; 1976,1,N/A,https://bioguide.congress.gov/search/bio/P000612,,
"Roberts, Charles Patrick",10,Kansas,KS,0,2,0.822995787870405,01/07/1997,01/03/2021,24.0054794520548,1,116,53.3,42.5,2014,https://twitter.com/SenPatRoberts,SenPatRoberts,https://twitter.com/PatRoberts,PatRoberts,04/20/1936,0,White,6,"B.A.; Kansas State University, Manhattan; 1958",7,N/A,https://bioguide.congress.gov/search/bio/R000307,,
"Udall, Tom",11,New Mexico,NM,1,2,0.259828450248573,01/06/2009,01/03/2021,12,1,116,55.4,44.6,2014,https://twitter.com/SenatorTomUdall,SenatorTomUdall,https://twitter.com/tomudall,tomudall,05/18/1948,0,White,8,"J.D.; University of New Mexico School of Law, Albuquerque, N.M.; 1977",2,N/A,https://bioguide.congress.gov/search/bio/U000039,,
60 King, Angus S., Jr. Klobuchar, Amy 59 Maine Minnesota ME MN 2 1 1 0.346033257048853 0.130504324943533 01/03/2013 01/04/2007 12/31/2022 9.9972602739726 16 0 117 54.3 60.3 35.2 36.2 2018 https://twitter.com/SenAngusKing https://twitter.com/SenAmyKlobuchar SenAngusKing SenAmyKlobuchar N/A https://twitter.com/amyklobuchar N/A amyklobuchar 03/31/1944 05/25/1960 0 1 White 8 J.D.; University of Virginia; 1969 J.D.; University of Chicago, 1985 2 https://www.king.senate.gov/ https://www.klobuchar.senate.gov/ https://bioguide.congress.gov/search/bio/K000383 https://bioguide.congress.gov/search/bio/K000367
61 Klobuchar, Amy Lankford, James 60 Minnesota Oklahoma MN OK 1 0 1 3 0.130504324943533 0.89992933687588 01/04/2007 01/03/2015 12/31/2022 16 7.9972602739726 0 117 60.3 67.7 36.2 24.6 2018 2016 https://twitter.com/SenAmyKlobuchar https://twitter.com/SenatorLankford SenAmyKlobuchar SenatorLankford https://twitter.com/amyklobuchar https://twitter.com/jameslankford amyklobuchar jameslankford 05/25/1960 03/04/1968 1 0 White 8 7 J.D.; University of Chicago, 1985 M.Div.; Southwestern Theological Baptist Seminary; 1994 2 5 https://www.klobuchar.senate.gov/ https://www.lankford.senate.gov/ https://bioguide.congress.gov/search/bio/K000367 https://bioguide.congress.gov/search/bio/L000575
62 Lankford, James Leahy, Patrick 61 Oklahoma Vermont OK VT 0 1 3 0.89992933687588 0.144121081911654 01/03/2015 01/14/1975 12/31/2022 7.9972602739726 47.9945205479452 0 1 117 67.7 61.3 24.6 33 2016 https://twitter.com/SenatorLankford https://twitter.com/SenatorLeahy SenatorLankford SenatorLeahy https://twitter.com/jameslankford N/A jameslankford N/A 03/04/1968 03/31/1940 0 White 7 8 M.Div.; Southwestern Theological Baptist Seminary; 1994 J.D.; Georgetown University; 1964 5 2 https://www.lankford.senate.gov/ N/A https://bioguide.congress.gov/search/bio/L000575 https://bioguide.congress.gov/search/bio/L000174
63 Leahy, Patrick Lee, Mike 62 Vermont Utah VT UT 1 0 3 0.144121081911654 0.753748787807473 01/14/1975 01/05/2011 12/31/2022 47.9945205479452 11.9945205479452 1 0 117 61.3 68 33 27.4 2016 https://twitter.com/SenatorLeahy https://twitter.com/SenMikeLee SenatorLeahy SenMikeLee N/A https://twitter.com/BasedMikeLee N/A BasedMikeLee 03/31/1940 06/04/1971 0 White 8 J.D.; Georgetown University; 1964 J.D.; Brigham Young university; 1997 2 N/A https://www.lee.senate.gov/ https://bioguide.congress.gov/search/bio/L000174 https://bioguide.congress.gov/search/bio/L000577
64 Lee, Mike Luján, Ben Ray 63 Utah New Mexico UT NM 0 1 3 2 0.753748787807473 0.174860888138848 01/05/2011 01/03/2021 12/31/2022 11.9945205479452 1.99178082191781 0 117 68 51.7 27.4 45.6 2016 2020 https://twitter.com/SenMikeLee https://twitter.com/SenatorLujan SenMikeLee SenatorLujan https://twitter.com/BasedMikeLee https://twitter.com/benraylujan BasedMikeLee benraylujan 06/04/1971 06/07/1972 0 White Hispanic 8 6 J.D.; Brigham Young university; 1997 B.B.A.; New Mexico Highlands University; 2007 2 0 https://www.lee.senate.gov/ https://www.lujan.senate.gov/ https://bioguide.congress.gov/search/bio/L000577 https://bioguide.congress.gov/search/bio/L000570
65 Luj�n, Ben Ray Lummis, Cynthia M. 64 New Mexico Wyoming NM WY 1 0 2 0.174860888138848 0.893292958108508 01/03/2021 12/31/2022 1.99178082191781 0 117 51.7 73.1 45.6 26.9 2020 https://twitter.com/SenatorLujan https://twitter.com/SenLummis SenatorLujan SenLummis https://twitter.com/benraylujan https://twitter.com/CynthiaMLummis benraylujan CynthiaMLummis 06/07/1972 09/10/1954 0 1 Hispanic White 6 8 B.B.A.; New Mexico Highlands University; 2007 J.D.; University of Wyoming College of Law, Laramie, Wyo.; 1985 0 11 https://www.lujan.senate.gov/ https://www.lummis.senate.gov/ https://bioguide.congress.gov/search/bio/L000570 https://bioguide.congress.gov/search/bio/L000571
66 Lummis, Cynthia M. Manchin, Joe, III 65 Wyoming West Virginia WY WV 0 1 2 1 0.893292958108508 0.446686774398077 01/03/2021 11/15/2010 12/31/2022 1.99178082191781 12.1342465753425 0 117 73.1 49.6 26.9 46.3 2020 2018 https://twitter.com/SenLummis https://twitter.com/Sen_JoeManchin SenLummis Sen_JoeManchin https://twitter.com/CynthiaMLummis https://twitter.com/JoeManchinWV CynthiaMLummis JoeManchinWV 09/10/1954 08/24/1947 1 0 White 8 6 J.D.; University of Wyoming College of Law, Laramie, Wyo.; 1985 B.A.; Business Administration; West Virginia University; 1970 11 12 https://www.lummis.senate.gov/ https://www.manchin.senate.gov/ https://bioguide.congress.gov/search/bio/L000571 https://bioguide.congress.gov/search/bio/M001183
67 Manchin, Joe, III Markey, Edward J. 66 West Virginia Massachusetts WV MA 1 1 2 0.446686774398077 0.0139659683705929 11/15/2010 07/16/2013 12/31/2022 12.1342465753425 9.46575342465753 0 117 49.6 66.2 46.3 33 2018 2020 https://twitter.com/Sen_JoeManchin https://twitter.com/SenMarkey Sen_JoeManchin SenMarkey https://twitter.com/JoeManchinWV https://twitter.com/edmarkey JoeManchinWV edmarkey 08/24/1947 07/11/1946 0 White 6 8 B.A.; Business Administration; West Virginia University; 1970 J.D.; Boston College Law School; 1972 12 11 https://www.manchin.senate.gov/ https://www.markey.senate.gov/ https://bioguide.congress.gov/search/bio/M001183 https://bioguide.congress.gov/search/bio/M000133
68 Markey, Edward J. Marshall, Roger 67 Massachusetts Kansas MA KS 1 0 2 0.0139659683705929 0.882124792228652 07/16/2013 01/03/2021 12/31/2022 9.46575342465753 1.99178082191781 0 117 66.2 53.2 33 41.8 2020 https://twitter.com/SenMarkey https://twitter.com/SenatorMarshall SenMarkey SenatorMarshall https://twitter.com/edmarkey https://twitter.com/RogerMarshallMD edmarkey RogerMarshallMD 07/11/1946 08/09/1960 0 White 8 7 J.D.; Boston College Law School; 1972 M.D.; University of Kansas School of Medicine; 1987 11 6 https://www.markey.senate.gov/ https://www.marshall.senate.gov/ https://bioguide.congress.gov/search/bio/M000133 https://bioguide.congress.gov/search/bio/M001198
69 Marshall, Roger McConnell, Mitch 68 Kansas Kentucky KS KY 0 2 0.882124792228652 0.599687533584357 01/03/2021 01/03/1985 12/31/2022 1.99178082191781 38.0164383561644 0 117 53.2 57.8 41.8 38.2 2020 https://twitter.com/SenatorMarshall https://twitter.com/LeaderMcConnell SenatorMarshall LeaderMcConnell https://twitter.com/RogerMarshallMD N/A RogerMarshallMD N/A 08/09/1960 02/20/1942 0 White 7 8 M.D.; University of Kansas School of Medicine; 1987 J.D.; Kentucky Law School; 1967 6 11 https://www.marshall.senate.gov/ https://www.mcconnell.senate.gov/ https://bioguide.congress.gov/search/bio/M001198 https://bioguide.congress.gov/search/bio/M000355
70 McConnell, Mitch Menendez, Robert 69 Kentucky New Jersey KY NJ 0 1 2 1 0.599687533584357 0.191515157461704 01/03/1985 01/18/2006 12/31/2022 38.0164383561644 16.9616438356164 0 117 57.8 54 38.2 42.8 2020 2018 https://twitter.com/LeaderMcConnell https://twitter.com/SenatorMenendez LeaderMcConnell SenatorMenendez N/A N/A 02/20/1942 01/01/1954 0 White Hispanic 8 J.D.; Kentucky Law School; 1967 J.D.; Rutgers university of Law; 1979 11 https://www.mcconnell.senate.gov/ https://www.menendez.senate.gov/ https://bioguide.congress.gov/search/bio/M000355 https://bioguide.congress.gov/search/bio/M000639
71 Menendez, Robert Merkley, Jeff 70 New Jersey Oregon NJ OR 1 1 2 0.191515157461704 0.0355414098997263 01/18/2006 01/06/2009 12/31/2022 16.9616438356164 13.9917808219178 0 117 54 56.9 42.8 39.3 2018 2020 https://twitter.com/SenatorMenendez https://twitter.com/SenJeffMerkley SenatorMenendez SenJeffMerkley N/A https://twitter.com/jeffmerkley N/A jeffmerkley 01/01/1954 10/24/1956 0 Hispanic White 8 7 J.D.; Rutgers university of Law; 1979 M.P.A.; Princeton University; 1982 11 0 https://www.menendez.senate.gov/ https://www.merkley.senate.gov/ https://bioguide.congress.gov/search/bio/M000639 https://bioguide.congress.gov/search/bio/M001176
72 Merkley, Jeff Moran, Jerry 71 Oregon Kansas OR KS 1 0 2 3 0.0355414098997263 0.716270292467902 01/06/2009 01/05/2011 12/31/2022 13.9917808219178 11.9945205479452 0 117 56.9 62.4 39.3 32.1 2020 2016 https://twitter.com/SenJeffMerkley https://twitter.com/JerryMoran SenJeffMerkley JerryMoran https://twitter.com/jeffmerkley N/A jeffmerkley N/A 10/24/1956 05/29/1954 0 White 7 8 M.P.A.; Princeton University; 1982 J.D.; Kansas University School of Law; 1981 0 11 https://www.merkley.senate.gov/ https://www.moran.senate.gov/public/ https://bioguide.congress.gov/search/bio/M001176 https://bioguide.congress.gov/search/bio/M000934
73 Moran, Jerry Murkowski, Lisa 72 Kansas Alaska KS AK 0 3 0.716270292467902 0.473296745648617 01/05/2011 12/20/2002 12/31/2022 11.9945205479452 20.0438356164384 0 117 62.4 44.3 32.1 29.5 2016 https://twitter.com/JerryMoran https://twitter.com/lisamurkowski JerryMoran lisamurkowski N/A https://twitter.com/lisaforsenate N/A lisaforsenate 05/29/1954 05/22/1957 0 1 White 8 J.D.; Kansas University School of Law; 1981 J.D.; Willamette College of Law; 1985 11 2 https://www.moran.senate.gov/public/ https://www.murkowski.senate.gov/ https://bioguide.congress.gov/search/bio/M000934 https://bioguide.congress.gov/search/bio/M001153
74 Murkowski, Lisa Murphy, Christopher 73 Alaska Connecticut AK CT 0 1 3 1 0.473296745648617 0.152635018959264 12/20/2002 01/03/2013 12/31/2022 20.0438356164384 9.9972602739726 0 117 44.3 59.5 29.5 39.4 2016 2018 https://twitter.com/lisamurkowski https://twitter.com/ChrisMurphyCT lisamurkowski ChrisMurphyCT https://twitter.com/lisaforsenate N/A lisaforsenate N/A 05/22/1957 08/03/1973 1 0 White 8 J.D.; Willamette College of Law; 1985 J.D.; University of Connecticut; 2002 2 11 https://www.murkowski.senate.gov/ https://www.murphy.senate.gov/ https://bioguide.congress.gov/search/bio/M001153 https://bioguide.congress.gov/search/bio/M001169
75 Murphy, Christopher Murray, Patty 74 Connecticut Washington CT WA 1 1 3 0.152635018959264 0.142703588817088 01/03/2013 01/05/1993 12/31/2022 9.9972602739726 30.0054794520548 0 117 59.5 59.1 39.4 40.9 2018 2016 https://twitter.com/ChrisMurphyCT https://twitter.com/PattyMurray ChrisMurphyCT PattyMurray N/A https://twitter.com/murraycampaign N/A murraycampaign 08/03/1973 10/11/1950 0 1 White 8 6 J.D.; University of Connecticut; 2002 B.A.; Physical Education; Washington State University; 1972 11 5 https://www.murphy.senate.gov/ https://www.murray.senate.gov/ https://bioguide.congress.gov/search/bio/M001169 https://bioguide.congress.gov/search/bio/M001111
76 Murray, Patty Ossoff, Jon 75 Washington Georgia WA GA 1 3 2 0.142703588817088 0.303405364928085 01/05/1993 01/20/2021 12/31/2022 30.0054794520548 1.94520547945205 0 117 59.1 50.6 40.9 49.4 2016 2020 https://twitter.com/PattyMurray https://twitter.com/SenOssoff PattyMurray SenOssoff https://twitter.com/murraycampaign https://twitter.com/ossoff murraycampaign ossoff 10/11/1950 02/16/1987 1 0 White 6 7 B.A.; Physical Education; Washington State University; 1972 M.S.; International Politicla Economy; London School of Economics; 2013 5 7 https://www.murray.senate.gov/ https://www.ossoff.senate.gov/ https://bioguide.congress.gov/search/bio/M001111 https://bioguide.congress.gov/search/bio/O000174
77 Ossoff, Jon Padilla, Alex 76 Georgia California GA CA 1 2 3 0.303405364928085 0.0200324383981554 01/20/2021 12/31/2022 1.94520547945205 0 117 50.6 N/A 49.4 N/A 2020 * https://twitter.com/SenOssoff https://twitter.com/SenAlexPadilla SenOssoff SenAlexPadilla https://twitter.com/ossoff https://twitter.com/AlexPadilla4CA ossoff AlexPadilla4CA 02/16/1987 03/22/1973 0 White Hispanic 7 6 M.S.; International Politicla Economy; London School of Economics; 2013 B.S.; Mechanical Engineering; MIT; 1994 7 9 https://www.ossoff.senate.gov/ https://www.padilla.senate.gov/ https://bioguide.congress.gov/search/bio/O000174 https://bioguide.congress.gov/search/bio/P000145 appointed in 2020 to replace Kamala Harris
78 Padilla, Alex Paul, Rand 77 California Kentucky CA KY 1 0 3 0.0200324383981554 0.684883322748808 01/20/2021 01/05/2011 12/31/2022 1.94520547945205 11.9945205479452 0 117 N/A 57.3 N/A 42.7 * 2016 https://twitter.com/SenAlexPadilla https://twitter.com/senrandpaul SenAlexPadilla senrandpaul https://twitter.com/AlexPadilla4CA https://twitter.com/RandPaul AlexPadilla4CA RandPaul 03/22/1973 01/07/1963 0 Hispanic White 6 7 B.S.; Mechanical Engineering; MIT; 1994 M.D.; Duke University; 1988 9 6 https://www.padilla.senate.gov/ https://www.paul.senate.gov/ https://bioguide.congress.gov/search/bio/P000145 https://bioguide.congress.gov/search/bio/P000603 appointed in 2020 to replace Kamala Harris
79 Paul, Rand Peters, Gary C. 78 Kentucky Michigan KY MI 0 1 3 2 0.684883322748808 0.355796587683312 01/05/2011 01/06/2015 12/31/2022 11.9945205479452 7.98904109589041 0 117 57.3 49.9 42.7 48.2 2016 2020 https://twitter.com/senrandpaul https://twitter.com/SenGaryPeters senrandpaul SenGaryPeters https://twitter.com/RandPaul https://twitter.com/garypeters RandPaul garypeters 01/07/1963 12/01/1958 0 White 7 8 M.D.; Duke University; 1988 J.D.; Wayne State University; 1989 6 2 https://www.paul.senate.gov/ https://www.peters.senate.gov/ https://bioguide.congress.gov/search/bio/P000603 https://bioguide.congress.gov/search/bio/P000595
80 Peters, Gary C. Portman, Robert 79 Michigan Ohio MI OH 1 0 2 3 0.355796587683312 0.548120690430407 01/06/2015 01/05/2011 12/31/2022 7.98904109589041 11.9945205479452 0 1 117 49.9 58.3 48.2 36.9 2020 2016 https://twitter.com/SenGaryPeters https://twitter.com/senrobportman SenGaryPeters senrobportman https://twitter.com/garypeters N/A garypeters N/A 12/01/1958 12/19/1955 0 White 8 J.D.; Wayne State University; 1989 J.D.; University of Michigan; 1985 2 https://www.peters.senate.gov/ N/A https://bioguide.congress.gov/search/bio/P000595 https://bioguide.congress.gov/search/bio/P000449
81 Portman, Robert Reed, John F. 80 Ohio Rhode Island OH RI 0 1 3 2 0.548120690430407 0.145861826443275 01/05/2011 01/07/1997 12/31/2022 11.9945205479452 25.9972602739726 1 0 117 58.3 66.6 36.9 33.4 2016 2020 https://twitter.com/senrobportman https://twitter.com/SenJackReed senrobportman SenJackReed N/A N/A 12/19/1955 11/12/1949 0 White 8 J.D.; University of Michigan; 1985 J.D.; Harvard University; 1982 2 N/A https://www.reed.senate.gov/ https://bioguide.congress.gov/search/bio/P000449 https://bioguide.congress.gov/search/bio/R000122
82 Reed, John F. Risch, James E. 81 Rhode Island Idaho RI ID 1 0 2 0.145861826443275 0.82910906209038 01/07/1997 01/06/2009 12/31/2022 25.9972602739726 13.9917808219178 0 117 66.6 62.6 33.4 33.2 2020 https://twitter.com/SenJackReed https://twitter.com/SenatorRisch SenJackReed SenatorRisch N/A N/A 11/12/1949 05/03/1943 0 White 8 J.D.; Harvard University; 1982 J.D.; University of Idaho; 1968 2 https://www.reed.senate.gov/ https://www.risch.senate.gov/ https://bioguide.congress.gov/search/bio/R000122 https://bioguide.congress.gov/search/bio/R000584
83 Risch, James E. Romney, Mitt 82 Idaho Utah ID UT 0 2 1 0.82910906209038 0.596688837978771 01/06/2009 01/03/2019 12/31/2022 13.9917808219178 3.99452054794521 0 117 62.6 33.2 30.9 2020 2018 https://twitter.com/SenatorRisch https://twitter.com/SenatorRomney SenatorRisch SenatorRomney N/A https://twitter.com/mittromney N/A mittromney 05/03/1943 03/12/1947 0 White 8 7 J.D.; University of Idaho; 1968 M.B.A.; Harvard Business School; 1975 2 1 https://www.risch.senate.gov/ https://www.romney.senate.gov/ https://bioguide.congress.gov/search/bio/R000584 https://bioguide.congress.gov/search/bio/R000615
84 Romney, Mitt Rosen, Jacky 83 Utah Nevada UT NV 0 1 1 0.596688837978771 0.308548351377894 01/03/2019 12/31/2022 3.99452054794521 0 117 62.6 50.4 30.9 45.4 2018 https://twitter.com/SenatorRomney https://twitter.com/SenJackyRosen SenatorRomney SenJackyRosen https://twitter.com/mittromney https://twitter.com/RosenforNevada mittromney RosenforNevada 03/12/1947 08/02/1957 0 1 White 7 6 M.B.A.; Harvard Business School; 1975 B.A.; Psychology; University of Minnesota; 1979 1 https://www.romney.senate.gov/ https://www.rosen.senate.gov/ https://bioguide.congress.gov/search/bio/R000615 https://bioguide.congress.gov/search/bio/R000608
85 Rosen, Jacky Rounds, Mike 84 Nevada South Dakota NV SD 1 0 1 2 0.308548351377894 0.784008560585577 01/03/2019 01/06/2015 12/31/2022 3.99452054794521 7.98904109589041 0 117 50.4 65.7 45.4 34.3 2018 2020 https://twitter.com/SenJackyRosen https://twitter.com/SenatorRounds SenJackyRosen SenatorRounds https://twitter.com/RosenforNevada N/A RosenforNevada N/A 08/02/1957 10/24/1954 1 0 White 6 B.A.; Psychology; University of Minnesota; 1979 B.S.; Political Science; South Dakota State University; 1977 1 https://www.rosen.senate.gov/ https://www.rounds.senate.gov/ https://bioguide.congress.gov/search/bio/R000608 https://bioguide.congress.gov/search/bio/R000605
86 Rounds, Mike Rubio, Marco 85 South Dakota Florida SD FL 0 2 3 0.784008560585577 0.831181764071725 01/06/2015 01/05/2011 12/31/2022 7.98904109589041 11.9945205479452 0 117 65.7 52 34.3 44.3 2020 2016 https://twitter.com/SenatorRounds https://twitter.com/senmarcorubio SenatorRounds senmarcorubio N/A https://twitter.com/marcorubio N/A marcorubio 10/24/1954 05/28/1971 0 White Hispanic 6 8 B.S.; Political Science; South Dakota State University; 1977 J.D.; University of Miami; 1996 1 2 https://www.rounds.senate.gov/ https://www.rubio.senate.gov/ https://bioguide.congress.gov/search/bio/R000605 https://bioguide.congress.gov/search/bio/R000595
87 Rubio, Marco Sanders, Bernard 86 Florida Vermont FL VT 0 2 3 1 0.831181764071725 0 01/05/2011 01/04/2007 12/31/2022 11.9945205479452 16 0 117 52 67.4 44.3 27.5 2016 2018 https://twitter.com/senmarcorubio https://twitter.com/SenSanders senmarcorubio SenSanders https://twitter.com/marcorubio https://twitter.com/BernieSanders marcorubio BernieSanders 05/28/1971 09/08/1941 0 Hispanic White 8 6 J.D.; University of Miami; 1996 B.A.; Political Science; University of Chicago; 1964 2 0 https://www.rubio.senate.gov/ https://www.sanders.senate.gov/ https://bioguide.congress.gov/search/bio/R000595 https://bioguide.congress.gov/search/bio/S000033
88 Sanders, Bernard Sasse, Benjamin 87 Vermont Nebraska VT NE 2 0 1 2 0 0.684229649213868 01/04/2007 01/06/2015 12/31/2022 16 7.98904109589041 0 1 117 67.4 62.7 27.5 24.4 2018 2020 https://twitter.com/SenSanders https://twitter.com/sensasse SenSanders sensasse https://twitter.com/BernieSanders https://twitter.com/BenSasse BernieSanders BenSasse 09/08/1941 02/22/1972 0 White 6 8 B.A.; Political Science; University of Chicago; 1964 PhD in History; Yale University; 2004 0 5 https://www.sanders.senate.gov/ N/A https://bioguide.congress.gov/search/bio/S000033 https://bioguide.congress.gov/search/bio/S001197
89 Sasse, Benjamin Schatz, Brian 88 Nebraska Hawaii NE HI 0 1 2 3 0.684229649213868 0.213250458593456 01/06/2015 12/27/2012 12/31/2022 7.98904109589041 10.0164383561644 1 0 117 62.7 73.6 24.4 22.2 2020 2016 https://twitter.com/sensasse https://twitter.com/brianschatz sensasse brianschatz https://twitter.com/BenSasse https://twitter.com/SenBrianSchatz BenSasse SenBrianSchatz 02/22/1972 10/20/1972 0 White 8 6 PhD in History; Yale University; 2004 B.A.; Philosophy; Pomona College; 1994 5 N/A https://www.schatz.senate.gov/ https://bioguide.congress.gov/search/bio/S001197 https://bioguide.congress.gov/search/bio/S001194
90 Schatz, Brian Schumer, Charles E. 89 Hawaii New York HI NY 1 3 0.213250458593456 0.239789022209428 12/27/2012 01/06/1999 12/31/2022 10.0164383561644 24 0 117 73.6 70.4 22.2 27.4 2016 https://twitter.com/brianschatz https://twitter.com/SenSchumer brianschatz SenSchumer https://twitter.com/SenBrianSchatz https://twitter.com/chuckschumer SenBrianSchatz chuckschumer 10/20/1972 11/23/1950 0 White 6 8 B.A.; Philosophy; Pomona College; 1994 J.D.; Harvard University; 1974 5 2 https://www.schatz.senate.gov/ https://www.schumer.senate.gov/ https://bioguide.congress.gov/search/bio/S001194 https://bioguide.congress.gov/search/bio/S000148
91 Schumer, Charles E. Scott, Rick 90 New York Florida NY FL 1 0 3 1 0.239789022209428 1 01/06/1999 01/08/2019 12/31/2022 24 3.98082191780822 0 117 70.4 50.1 27.4 49.9 2016 2018 https://twitter.com/SenSchumer https://twitter.com/SenRickScott SenSchumer SenRickScott https://twitter.com/chuckschumer https://twitter.com/scottforflorida chuckschumer scottforflorida 11/23/1950 12/01/1952 0 White 8 J.D.; Harvard University; 1974 J.D.; Southern Methodist University; 1978 2 https://www.schumer.senate.gov/ https://www.rickscott.senate.gov/ https://bioguide.congress.gov/search/bio/S000148 https://bioguide.congress.gov/search/bio/S001217
92 Scott, Rick Scott, Tim 91 Florida South Carolina FL SC 0 1 3 1 0.781356077518849 01/08/2019 01/03/2013 12/31/2022 3.98082191780822 9.9972602739726 0 117 50.1 60.6 49.9 37 2018 2016 https://twitter.com/SenRickScott https://twitter.com/SenatorTimScott SenRickScott SenatorTimScott https://twitter.com/scottforflorida https://twitter.com/votetimscott scottforflorida votetimscott 12/01/1952 09/19/1965 0 White African-American 8 6 J.D.; Southern Methodist University; 1978 B.S.; Political Science; Charleston Southern University; 1988 2 1 https://www.rickscott.senate.gov/ https://www.scott.senate.gov/ https://bioguide.congress.gov/search/bio/S001217 https://bioguide.congress.gov/search/bio/S001184
93 Scott, Tim Shaheen, Jeanne 92 South Carolina New Hampshire SC NH 0 1 3 2 0.781356077518849 0.2925665319541 01/03/2013 01/06/2009 12/31/2022 9.9972602739726 13.9917808219178 0 117 60.6 56.6 37 41 2016 2020 https://twitter.com/SenatorTimScott https://twitter.com/SenatorShaheen SenatorTimScott SenatorShaheen https://twitter.com/votetimscott https://twitter.com/JeanneShaheen votetimscott JeanneShaheen 09/19/1965 01/28/1947 0 1 African-American White 6 7 B.S.; Political Science; Charleston Southern University; 1988 M.S.S.; University of Mississippi; 1973 1 5 https://www.scott.senate.gov/ https://www.shaheen.senate.gov/ https://bioguide.congress.gov/search/bio/S001184 https://bioguide.congress.gov/search/bio/S001181
94 Shaheen, Jeanne Shelby, Richard 93 New Hampshire Alabama NH AL 1 0 2 3 0.2925665319541 0.577739000839365 01/06/2009 01/06/1987 12/31/2022 13.9917808219178 36.0082191780822 0 1 117 56.6 64.2 41 35.8 2020 2016 https://twitter.com/SenatorShaheen https://twitter.com/SenShelby SenatorShaheen SenShelby https://twitter.com/JeanneShaheen N/A JeanneShaheen N/A 01/28/1947 05/06/1934 1 0 White 7 6 M.S.S.; University of Mississippi; 1973 LL.B.; University of Alabama; 1963 5 2 https://www.shaheen.senate.gov/ N/A https://bioguide.congress.gov/search/bio/S001181 https://bioguide.congress.gov/search/bio/S000320
95 Shelby, Richard Sinema, Kyrsten 94 Alabama Arizona AL AZ 0 2 3 1 0.577739000839365 0.500967034663567 01/06/1987 01/03/2019 12/31/2022 36.0082191780822 3.99452054794521 1 0 117 64.2 50 35.8 47.6 2016 2018 https://twitter.com/SenShelby https://twitter.com/SenatorSinema SenShelby SenatorSinema N/A https://twitter.com/kyrstensinema N/A kyrstensinema 05/06/1934 07/12/1976 0 1 White 6 8 LL.B.; University of Alabama; 1963 PhD in Justice Studies; Arizona State University; 2012 2 N/A https://www.sinema.senate.gov/ https://bioguide.congress.gov/search/bio/S000320 https://bioguide.congress.gov/search/bio/S001191
96 Sinema, Kyrsten Smith, Tina 95 Arizona Minnesota AZ MN 2 1 1 2 0.500967034663567 0.0756533259297989 01/03/2019 01/03/2018 12/31/2022 3.99452054794521 4.99452054794521 0 117 50 48.8 47.6 43.5 2018 2020 https://twitter.com/SenatorSinema https://twitter.com/SenTinaSmith SenatorSinema SenTinaSmith https://twitter.com/kyrstensinema https://twitter.com/TinaSmithMN kyrstensinema TinaSmithMN 07/12/1976 03/04/1958 1 White 8 7 PhD in Justice Studies; Arizona State University; 2012 M.B.A. Dartmouth College; 1984 2 1 https://www.sinema.senate.gov/ https://www.smith.senate.gov/ https://bioguide.congress.gov/search/bio/S001191 https://bioguide.congress.gov/search/bio/S001203
97 Smith, Tina Stabenow, Debbie 96 Minnesota Michigan MN MI 1 2 1 0.0756533259297989 0.221949395648287 01/03/2018 01/03/2001 12/31/2022 4.99452054794521 22.0054794520548 0 117 48.8 52.3 43.5 45.8 2020 2018 https://twitter.com/SenTinaSmith https://twitter.com/SenStabenow SenTinaSmith SenStabenow https://twitter.com/TinaSmithMN https://twitter.com/stabenow TinaSmithMN stabenow 03/04/1958 04/29/1950 1 White 7 M.B.A. Dartmouth College; 1984 M.S.W.; Michigan State University; 1975 1 5 https://www.smith.senate.gov/ https://www.stabenow.senate.gov/ https://bioguide.congress.gov/search/bio/S001203 https://bioguide.congress.gov/search/bio/S000770
98 Stabenow, Debbie Sullivan, Dan 97 Michigan Alaska MI AK 1 0 1 2 0.221949395648287 0.652100683642255 01/03/2001 01/06/2015 12/31/2022 22.0054794520548 7.98904109589041 0 117 52.3 53.9 45.8 41.2 2018 2020 https://twitter.com/SenStabenow https://twitter.com/SenDanSullivan SenStabenow SenDanSullivan https://twitter.com/stabenow N/A stabenow N/A 04/29/1950 11/13/1964 1 0 White 7 8 M.S.W.; Michigan State University; 1975 J.D.; Georgetown University; 1993 5 2 https://www.stabenow.senate.gov/ https://www.sullivan.senate.gov/ https://bioguide.congress.gov/search/bio/S000770 https://bioguide.congress.gov/search/bio/S001198
99 Sullivan, Dan Tester, Jon 98 Alaska Montana AK MT 0 1 2 1 0.652100683642255 0.377646486433112 01/06/2015 01/04/2007 12/31/2022 7.98904109589041 16 0 117 53.9 50.3 41.2 46.8 2020 2018 https://twitter.com/SenDanSullivan https://twitter.com/SenatorTester SenDanSullivan SenatorTester N/A https://twitter.com/jontester N/A jontester 11/13/1964 08/21/1956 0 White 8 6 J.D.; Georgetown University; 1993 B.A.; Music; University of Providence; 1978 2 10 https://www.sullivan.senate.gov/ https://www.tester.senate.gov/ https://bioguide.congress.gov/search/bio/S001198 https://bioguide.congress.gov/search/bio/T000464
100 Tester, Jon Thune, John 99 Montana South Dakota MT SD 1 0 1 3 0.377646486433112 0.795060855902239 01/04/2007 01/04/2005 12/31/2022 16 18 0 117 50.3 71.8 46.8 28.2 2018 2016 https://twitter.com/SenatorTester https://twitter.com/SenJohnThune SenatorTester SenJohnThune https://twitter.com/jontester https://twitter.com/johnthune jontester johnthune 08/21/1956 01/07/1961 0 White 6 7 B.A.; Music; University of Providence; 1978 M.B.A.; University of South Dakota; 1984 10 1 https://www.tester.senate.gov/ https://www.thune.senate.gov/ https://bioguide.congress.gov/search/bio/T000464 https://bioguide.congress.gov/search/bio/T000250
101 Thune, John Tillis, Thom 100 South Dakota North Carolina SD NC 0 3 2 0.795060855902239 0.819146177750934 01/04/2005 01/06/2015 12/31/2022 18 7.98904109589041 0 117 71.8 48.7 28.2 46.9 2016 2020 https://twitter.com/SenJohnThune https://twitter.com/SenThomTillis SenJohnThune SenThomTillis https://twitter.com/johnthune https://twitter.com/ThomTillis johnthune ThomTillis 01/07/1961 08/30/1960 0 White 7 6 M.B.A.; University of South Dakota; 1984 B.S.; Technology Management; University of Maryland; 1996 1 https://www.thune.senate.gov/ https://www.tillis.senate.gov/ https://bioguide.congress.gov/search/bio/T000250 https://bioguide.congress.gov/search/bio/T000476
102 Tillis, Thom Toomey, Patrick 101 North Carolina Pennsylvania NC PA 0 2 3 0.819146177750934 0.607637714921737 01/06/2015 01/05/2011 12/31/2022 7.98904109589041 11.9945205479452 0 1 117 48.7 48.9 46.9 47.2 2020 2016 https://twitter.com/SenThomTillis https://twitter.com/SenToomey SenThomTillis SenToomey https://twitter.com/ThomTillis https://twitter.com/pattoomey ThomTillis pattoomey 08/30/1960 11/17/1961 0 White 6 B.S.; Technology Management; University of Maryland; 1996 A.B.; Government; Harvard College; 1984 1 https://www.tillis.senate.gov/ N/A https://bioguide.congress.gov/search/bio/T000476 https://bioguide.congress.gov/search/bio/T000461
103 Toomey, Patrick Tuberville, Tommy 102 Pennsylvania Alabama PA AL 0 3 2 0.607637714921737 0.808701355452043 01/05/2011 01/03/2021 12/31/2022 11.9945205479452 1.99178082191781 1 0 117 48.9 60.1 47.2 39.7 2016 2020 https://twitter.com/SenToomey https://twitter.com/SenTuberville SenToomey SenTuberville https://twitter.com/pattoomey https://twitter.com/TTuberville pattoomey TTuberville 11/17/1961 09/18/1954 0 White 6 A.B.; Government; Harvard College; 1984 B.S., physical education, Southern Arkansas University, 1976 1 5 N/A https://www.tuberville.senate.gov/ https://bioguide.congress.gov/search/bio/T000461 https://bioguide.congress.gov/search/bio/T000278
104 Tuberville, Tommy Van Hollen, Chris 103 Alabama Maryland AL MD 0 1 2 3 0.808701355452043 0.117646768842011 01/03/2021 01/03/2017 12/31/2022 1.99178082191781 5.99452054794521 0 117 60.1 60.4 39.7 36.4 2020 2016 https://twitter.com/SenTuberville https://twitter.com/ChrisVanHollen SenTuberville ChrisVanHollen https://twitter.com/TTuberville N/A TTuberville N/A 09/18/1954 01/10/1959 0 White 6 8 B.S., physical education, Southern Arkansas University, 1976 J.D.; Georgetown university; 1990 5 2 https://www.tuberville.senate.gov/ https://www.vanhollen.senate.gov/ https://bioguide.congress.gov/search/bio/T000278 https://bioguide.congress.gov/search/bio/V000128
105 Van Hollen, Chris Warner, Mark R. 104 Maryland Virginia MD VA 1 3 2 0.117646768842011 0.33022168507113 01/03/2017 01/06/2009 12/31/2022 5.99452054794521 13.9917808219178 0 117 60.4 56 36.4 44 2016 2020 https://twitter.com/ChrisVanHollen https://twitter.com/SenatorWarner ChrisVanHollen SenatorWarner N/A https://twitter.com/MarkWarner N/A MarkWarner 01/10/1959 12/15/1954 0 White 8 J.D.; Georgetown university; 1990 J.D.; Harvard Law School; 1980 2 1 https://www.vanhollen.senate.gov/ https://www.warner.senate.gov/ https://bioguide.congress.gov/search/bio/V000128 https://bioguide.congress.gov/search/bio/W000805
106 Warner, Mark R. Warnock, Raphael G. 105 Virginia Georgia VA GA 1 2 3 0.33022168507113 0.464158242867696 01/06/2009 01/20/2021 12/31/2022 13.9917808219178 1.94520547945205 0 117 56 51 44 49 2020 https://twitter.com/SenatorWarner https://twitter.com/SenatorWarnock SenatorWarner SenatorWarnock https://twitter.com/MarkWarner https://twitter.com/ReverendWarnock MarkWarner ReverendWarnock 12/15/1954 07/23/1969 0 White African-American 8 J.D.; Harvard Law School; 1980 PhD in Philosophy; Union Theological Seminary; 1 8 https://www.warner.senate.gov/ https://www.warnock.senate.gov/ https://bioguide.congress.gov/search/bio/W000805 https://bioguide.congress.gov/search/bio/W000790
107 Warnock, Raphael G. Warren, Elizabeth 106 Georgia Massachusetts GA MA 1 3 1 0.464158242867696 0.0583875007437665 01/20/2021 01/03/2013 12/31/2022 1.94520547945205 9.9972602739726 0 117 51 60.4 49 36.2 2020 2018 https://twitter.com/SenatorWarnock https://twitter.com/SenWarren SenatorWarnock SenWarren https://twitter.com/ReverendWarnock https://twitter.com/ewarren ReverendWarnock ewarren 07/23/1969 06/22/1949 0 1 African-American White 8 PhD in Philosophy; Union Theological Seminary; J.D.; Rutgers University; 1976 8 2 https://www.warnock.senate.gov/ https://www.warren.senate.gov/ https://bioguide.congress.gov/search/bio/W000790 https://bioguide.congress.gov/search/bio/W000817
108 Warren, Elizabeth Whitehouse, Sheldon 107 Massachusetts Rhode Island MA RI 1 1 0.0583875007437665 0.124737669119195 01/03/2013 01/04/2007 12/31/2022 9.9972602739726 16 0 117 60.4 61.6 36.2 38.4 2018 https://twitter.com/SenWarren https://twitter.com/SenWhitehouse SenWarren SenWhitehouse https://twitter.com/ewarren N/A ewarren N/A 06/22/1949 10/20/1955 1 0 White 8 J.D.; Rutgers University; 1976 J.D.; University of Virginia; 1982 2 https://www.warren.senate.gov/ https://www.whitehouse.senate.gov/ https://bioguide.congress.gov/search/bio/W000817 https://bioguide.congress.gov/search/bio/W000802
109 Whitehouse, Sheldon Wicker, Roger F. 108 Rhode Island Mississippi RI MS 1 0 1 0.124737669119195 0.763788502839721 01/04/2007 12/31/2007 12/31/2022 16 15.0109589041096 0 117 61.6 58.5 38.4 39.5 2018 https://twitter.com/SenWhitehouse https://twitter.com/SenatorWicker SenWhitehouse SenatorWicker N/A https://twitter.com/RogerWicker N/A RogerWicker 10/20/1955 07/05/1951 0 White 8 J.D.; University of Virginia; 1982 J.D.; University of Mississippi; 1975 2 https://www.whitehouse.senate.gov/ https://www.wicker.senate.gov/ https://bioguide.congress.gov/search/bio/W000802 https://bioguide.congress.gov/search/bio/W000437
110 Wicker, Roger F. Wyden, Ron 109 Mississippi Oregon MS OR 0 1 1 3 0.763788502839721 0.0591413132623803 12/31/2007 02/05/1996 12/31/2022 15.0109589041096 26.9205479452055 0 117 58.5 56.7 39.5 33.6 2018 2016 https://twitter.com/SenatorWicker https://twitter.com/RonWyden SenatorWicker RonWyden https://twitter.com/RogerWicker N/A RogerWicker N/A 07/05/1951 05/03/1949 0 White 8 J.D.; University of Mississippi; 1975 J.D.; University of Oregon; 1974 2 https://www.wicker.senate.gov/ https://www.wyden.senate.gov/ https://bioguide.congress.gov/search/bio/W000437 https://bioguide.congress.gov/search/bio/W000779
111 Wyden, Ron Young, Todd 110 Oregon Indiana OR IN 1 0 3 0.0591413132623803 0.677696674158218 02/05/1996 01/05/2011 12/31/2022 26.9205479452055 11.9945205479452 0 1 117 56.7 52.1 33.6 42.4 2016 https://twitter.com/RonWyden https://twitter.com/SenToddYoung RonWyden SenToddYoung N/A https://twitter.com/ToddYoungIN N/A ToddYoungIN 05/03/1949 08/24/1972 0 White 8 J.D.; University of Oregon; 1974 J.D.; Robert H. McKinney; 2006 2 https://www.wyden.senate.gov/ https://www.young.senate.gov/ https://bioguide.congress.gov/search/bio/W000779 https://bioguide.congress.gov/search/bio/Y000064
Young, Todd 111 Indiana IN 0 3 0.677696674158218 01/05/2011 12/31/2022 11.9945205479452 1 117 52.1 42.4 2016 https://twitter.com/SenToddYoung SenToddYoung https://twitter.com/ToddYoungIN ToddYoungIN 08/24/1972 0 White 8 J.D.; Robert H. McKinney; 2006 2 https://www.young.senate.gov/ https://bioguide.congress.gov/search/bio/Y000064

8
data/OUT/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
/ALL-SENATORS-TWEETS.csv
/Pretest-Prep.csv
/Pretest-Results.csv
/Pretest-SENATORS-TWEETS.csv
/SenatorsTweets-Final.csv
/SenatorsTweets-OnlyCov.csv
/Tweets-Classified-Prep.csv
/Tweets-Stub.csv

0
data/OUT/.gitkeep Normal file
View File

3
data/OUT/graphs/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
/Timeline.png
/Wordcloud-All.png
/Wordcloud-Cov.png

View File

@ -1,24 +0,0 @@
/ALL-SENATORS-LONG-LONG.csv
/ALL-SENATORS.csv
/CoryGardner-LONG.csv
/CoryGardner.csv
/DavidPerdueGA-LONG.csv
/DavidPerdueGA.csv
/DougJones-LONG.csv
/DougJones.csv
/KLoeffler-LONG.csv
/KLoeffler.csv
/MarthaMcSallyAZ-LONG.csv
/MarthaMcSallyAZ.csv
/SenAlexander-LONG.csv
/SenAlexander.csv
/SenPatRoberts-LONG.csv
/SenPatRoberts.csv
/SenatorEnzi-LONG.csv
/SenatorEnzi.csv
/SenatorIsakson-LONG.csv
/SenatorIsakson.csv
/SenatorTomUdall-LONG.csv
/SenatorTomUdall.csv
/VP-LONG.csv
/VP.csv

89
funs/CleanTweets.py Normal file
View File

@ -0,0 +1,89 @@
import re
import string


def preprocess_roberta(text): # https://huggingface.co/cardiffnlp/twitter-roberta-base-sep2022
    preprocessed_text = []
    for t in text.split():
        if len(t) > 1:
            t = '@user' if t[0] == '@' and t.count('@') == 1 else t
            t = 'http' if t.startswith('http') else t
        preprocessed_text.append(t)
    return ' '.join(preprocessed_text)


def remove_URL(text):
    try:
        url = re.compile(r'https?://\S+|www\.\S+')
    except: print(text)
    return url.sub(r'', text)


def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', text)


def remove_punct(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)


def remove_nonascii(text):
    return re.sub(r'[^\x00-\x7F]+', '', text)


def remove_spec(text):
    text = re.sub(r'&amp;?', r'and', text)
    text = re.sub(r'&lt;', r'<', text)
    return re.sub(r'&gt;', r'>', text)


def remove_spaces(text): # also new line chars and to lower case
    text = re.sub(r'&lt;', r'<', text)
    text = " ".join(text.splitlines()) # remove newline characters
    text = text.lower()
    text = text.strip()
    return re.sub(r'\s{2,}', ' ', text)


def remove_retw(text):
    text = re.sub(r'(RT|rt)[ ]*@[ ]*[\S]+', '', text)
    return re.sub(r'@[\S]+', '', text)


def preprocess_text(text):
    text = remove_URL(text)
    text = remove_emoji(text)
    text = remove_html(text)
    text = remove_punct(text)
    text = remove_nonascii(text)
    text = remove_spec(text)
    text = remove_spaces(text)
    text = remove_retw(text)
    return text


def preprocess_text_series(series):
    series = series.apply(remove_URL)
    series = series.apply(remove_emoji)
    series = series.apply(remove_html)
    series = series.apply(remove_punct)
    series = series.apply(remove_nonascii)
    series = series.apply(remove_spec)
    series = series.apply(remove_spaces)
    series = series.apply(remove_retw)
    return series


# Check all functions:
input_text = """
Check out this amazing website: https://www.example.com! 😃
<html>This is an HTML tag.</html>
RT @user123: Just received a package from @companyXYZ. It's awesome! 📦
This is a test text with lots of punctuations!!! Can't wait to see more...
"""
processed_text = preprocess_text(input_text)
# print(processed_text)
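preprocess_text_series mirrors preprocess_text for whole pandas columns; below is a minimal usage sketch, not part of the committed file. The DataFrame and column names are placeholders, and preprocess_text_series is assumed to be in scope.

# Sketch only: apply the series-level cleaner to a placeholder DataFrame column.
import pandas as pd

df = pd.DataFrame({"rawContent": [
    "RT @user123: Check https://www.example.com 😃 &amp; more!!!",
    "Second tweet <b>with HTML</b> and a @mention",
]})
df["cleanContent"] = preprocess_text_series(df["rawContent"])
print(df["cleanContent"].tolist())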

View File

@ -3,13 +3,22 @@ import time
import pandas as pd
import snscrape.modules.twitter as sntwitter
def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5000):
def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix, maxTweets = 5000):
    """Scrapes tweets from a specific account in a specific time span using snscrape.modules.twitter.

    Args:
        handle (str): twitter handle of account to be scraped
        keywords (list): list of strings containing the keywords that the tweets shall be searched for
        td (str): tweet file output path
        tweetDFColumns (list): Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet
        ts_beg (str): scrape from ... YYYY-MM-DDTHH:MM:SSZ from datetime: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
        ts_end (str): scrape until ... YYYY-MM-DDTHH:MM:SSZ from datetime: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
        suffix (str): suffix that shall be added to filename after the handle. Example: "-slice1" of handle "handle" will produce the file "Tweets-handle-slice1.csv"
        maxTweets (int, optional): Maximum number of tweets to be scraped. Defaults to 5000.
    """
    i = 0
    currentTime = datetime.now()
    ts_beg = slice_data['beg_time']
    ts_end = slice_data['end_time']
    suffix = slice_data['suffix']
    tweetDataFilePath = td + f"Tweets-{handle}{suffix}.csv"
    # create empty tweetlist that will be filled with tweets of current sen
@ -54,4 +63,55 @@ def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5
    # save short csv
    tweet_df.to_csv(csv_path, encoding='utf-8')
    # sleep 0.5 seconds to not get blocked because of excessive requests
    time.sleep(0.5)
    time.sleep(0.5)

def getHandles(di):
    """grabs accounts from senators-raw.csv

    Args:
        di (str): path to senators-raw.csv

    Returns:
        list: list containing str of senator account handles
    """
    accounts = pd.read_csv(f"{di}senators-raw.csv")["twitter_handle"].tolist()
    alt_accounts = pd.read_csv(f"{di}senators-raw.csv")["alt_handle"].tolist()
    alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields
    accounts.extend(alt_accounts)
    return accounts

def printHandles(accounts):
    """returns string with all accounts in a readable way.

    Args:
        accounts (list): list of str with handles

    Returns:
        str: containing text that can be written to txtfile
    """
    txt = ["Accounts to be scraped:\n"]
    for i, acc in enumerate(accounts): # print 5 accounts per line
        txt.append(f"{acc:^17}") # twitter handle max length = 15 chars
        if i % 5 == 4:
            txt.append(" \n")
    txt.append(f"\n{i} accounts in total.")
    return ''.join(txt)

def scrapeUsers(handle, userDFColumns, maxTweets=1):
    currentTime = datetime.now()
    userList = []
    print(f'{currentTime:<30} Fetching: {handle:>15}')
    query = f'from:{handle}'
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        if i > maxTweets:
            break
        # Get user data and append to singleUserList
        userList = []
        for col in userDFColumns:
            singleUser = eval(f'tweet.user.{col}')
            userList.append(singleUser)
    # Create dataframe using userList and userDFColumns
    #df = pd.DataFrame(userList, columns=userDFColumns)
    return userList
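For orientation, a minimal driver sketch of how the reworked signature might be called after this change; it is not part of the commit. The directory paths, keyword list and column list are placeholders, and scrapeTweets, getHandles and printHandles are assumed to have been imported from this module (its file name is not shown in the diff).

# Sketch only: all values below are placeholders, not values from the repository.
di = "data/IN/"    # directory containing senators-raw.csv
td = "data/OUT/"   # output directory for the per-handle tweet CSV files
keywords = ["covid", "vaccine"]                # placeholder keyword list
tweetDFColumns = ["id", "date", "rawContent"]  # placeholder subset of snscrape Tweet attributes

handles = getHandles(di)
print(printHandles(handles))

for handle in handles:
    # new signature: explicit start/end timestamps plus a filename suffix
    scrapeTweets(handle, keywords, td, tweetDFColumns,
                 ts_beg="2020-01-01T00:00:00Z",
                 ts_end="2020-06-30T00:00:00Z",
                 suffix="-slice1",
                 maxTweets=5000)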

View File

@ -8,6 +8,16 @@ Created on Wed Jun 21 13:58:42 2023
# create slices
def get_Tslices(ts_beg, ts_end, no_slices):
    """Splits the time-period between two points in time into #no_slices and returns start and end time of each slice period.

    Args:
        ts_beg (str): start of the overall period to be sliced, format %Y-%m-%dT%H:%M:%SZ.
        ts_end (str): end of the overall period to be sliced, format %Y-%m-%dT%H:%M:%SZ.
        no_slices (int): number of slices. 24 e.g. will produce 24 start and end dates each.

    Returns:
        list[dict[str:datetime|str]]: One dict per slice containing 'beg_time', 'end_time' and 'suffix' (e.g. -slice1)
    """
    from datetime import datetime
    from datetime import timedelta
    ts_beg = datetime.strptime(ts_beg, '%Y-%m-%dT%H:%M:%SZ')
@ -25,6 +35,16 @@ def get_Tslices(ts_beg, ts_end, no_slices):
# For log time conversions (seconds to days, hours, minutes)
def convertTime(duration):
    """Converts a duration (timedelta) to hours, minutes and seconds.

    Args:
        duration (timedelta): time difference to be converted

    Returns:
        int: hours
        int: minutes
        int: seconds
    """
    days, seconds = duration.days, duration.seconds
    hours = days * 24 + seconds // 3600
    minutes = (seconds % 3600) // 60
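The two helpers are typically used together with the scraper above; the sketch below illustrates the data flow and is not part of the commit. It assumes get_Tslices and convertTime are imported from this module, and the slice count and timestamps are examples only.

# Sketch only: slice the collection period, then time a run with convertTime.
from datetime import datetime

slices = get_Tslices("2020-01-01T00:00:00Z", "2022-12-31T00:00:00Z", 24)
print(slices[0]["beg_time"], slices[0]["end_time"], slices[0]["suffix"])  # first slice, suffix e.g. "-slice1"

start = datetime.now()
# ... scraping work would happen here ...
hours, minutes, seconds = convertTime(datetime.now() - start)
print(f"elapsed: {hours}h {minutes}m {seconds}s")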

0
log/.gitkeep Normal file
View File

View File

@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.39025546515679493,0.40877932761593355,0.9103260869565217,0:10:21,0:00:40
2,0.3057803610952067,0.3502063500978377,0.9103260869565217,0:10:53,0:00:43
3,0.17910970049364833,0.27903796154904464,0.9375,0:10:30,0:00:38
4,0.09279396105943587,0.41342766528301267,0.904891304347826,0:11:03,0:00:43
5,0.06132459050129317,0.4468563502887264,0.9239130434782609,0:12:07,0:00:44
6,0.04195396880810895,0.4350045176675928,0.9266304347826086,0:11:21,0:00:40

View File

@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.6699380816093513,0.6216431430407933,0.6964285714285714,0:01:03,0:00:02
2,0.6649796058024678,0.621175297669002,0.6964285714285714,0:01:03,0:00:01
3,0.642247314964022,0.6377243144171578,0.6964285714285714,0:01:05,0:00:02
4,0.6300328698541436,0.6038827853543418,0.6964285714285714,0:01:04,0:00:02
5,0.544977219509227,0.6619421115943364,0.625,0:01:02,0:00:02
6,0.3951783587357828,0.48477122613361906,0.7857142857142857,0:01:05,0:00:01

View File

@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.5610552686641376,0.4569096086310089,0.9116022099447514,0:37:20,0:00:31
2,0.43647773836513126,0.5441495520680196,0.9005524861878453,0:36:14,0:00:30
3,0.288773139899344,0.43471020716692715,0.9392265193370166,0:36:10,0:00:29
4,0.19330878817686287,0.4555162174395349,0.9281767955801105,0:36:17,0:00:30
5,0.09109889855869348,0.5060150003684702,0.9281767955801105,0:36:13,0:00:30
6,0.05734757932275739,0.6043995772428771,0.9226519337016574,0:36:11,0:00:31

View File

@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.21681843259712502,0.0005426188472483773,1.0,0:01:13,0:00:02
2,0.00016121647037353423,0.0002873415878639207,1.0,0:01:12,0:00:02
3,6.752021149355535e-05,0.00024319994372490328,1.0,0:01:12,0:00:02
4,4.7950222591787355e-05,0.00022139604243420763,1.0,0:01:13,0:00:02
5,3.99839740138679e-05,0.00021302999493855168,1.0,0:01:11,0:00:02
6,3.5356899656214995e-05,0.00020912183117616223,1.0,0:01:13,0:00:02

135
preTestClassification.py Normal file
View File

@ -0,0 +1,135 @@
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
#%%
# prepare
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"
# Name of new datafile generated
senCSVc = "Tweets-Stub.csv"
# Name of pretest files
preTestIDsFake = "pretest-tweets_fake.txt"
preTestIDsNot = "pretest-tweets_not_fake.txt"
# Name of pretest datafile
senCSVPretest = "Pretest.csv"
senCSVPretestPrep = "Pretest-Prep.csv"
senCSVPretestResult = "Pretest-Results.csv"
# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc
senCSVcPretestPath = wd + ud + senCSVPretest
senCSVcPretestPrepPath = wd + ud + senCSVPretestPrep
senCSVcPretestResultPath = wd + ud + senCSVPretestResult
preTestIDsFakePath = wd + di + preTestIDsFake
preTestIDsNotPath = wd + di + preTestIDsNot
import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets
# List of IDs to select
# Read the IDs from a file
preTestIDsFakeL = []
preTestIDsNotL = []
with open(preTestIDsFakePath, "r") as file:
    lines = file.readlines()
    for line in lines:
        tid = line.strip() # Remove the newline character
        preTestIDsFakeL.append(tid)
with open(preTestIDsNotPath, "r") as file:
    lines = file.readlines()
    for line in lines:
        tid = line.strip() # Remove the newline character
        preTestIDsNotL.append(tid)
# Select rows based on the IDs
df = pd.read_csv(senCSVPath, dtype=(object))
#%%
# Create pretest dataframe
dfPreTest = df[df['id'].isin(preTestIDsFakeL)].copy()
dfPreTest['fake'] = True
dfPreTest = pd.concat([dfPreTest, df[df['id'].isin(preTestIDsNotL)]], ignore_index=True)
dfPreTest['fake'] = dfPreTest['fake'].fillna(False)
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(CleanTweets.preprocess_text)
#%%
timeStart = datetime.now() # start counting execution time
max_length = 128
dfPreTest['input_ids'] = dfPreTest['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
# %%
dfPreTest.to_csv(senCSVcPretestPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
dataset = load_dataset("csv", data_files=senCSVcPretestPrepPath)
# %%
results = pipe(KeyDataset(dataset, "text"))
# %%
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
# print(out)
#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    output_labels.append(out['label'])
    output_score.append(out['score'])
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
# Exactly the same output as before, but the content are passed
# as batches to the model
# %%
dfPreTest['output_label'] = output_labels
dfPreTest['output_score'] = output_score
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
timePerTweet = timeTotal / 96
print(f"Total classification execution time: {timeTotal} seconds")
print(f"Time per tweet classification: {timePerTweet}")
print(f"Estimated time for full classification of tweets: {timePerTweet*50183}")
# %%
dfPreTest.to_csv(senCSVcPretestResultPath, encoding='utf-8')
# %%
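Since the pretest rows carry a hand-coded fake flag, one natural follow-up is an agreement check against the pipeline output. The sketch below is an assumption-laden illustration, not part of the script: in particular, the label strings emitted by the struth model are assumed, not taken from the repository.

# Sketch only: rough agreement check between model output and the hand-coded 'fake' flag.
# The label-to-boolean mapping below is an assumption about the model's label names.
label_to_bool = {"fake": True, "real": False}
predicted = dfPreTest['output_label'].str.lower().map(label_to_bool)
agreement = (predicted == dfPreTest['fake']).mean()
print(f"Agreement with hand coding: {agreement:.2%}")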

55
profiler.py Normal file
View File

@ -0,0 +1,55 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 8 14:49:02 2023
@author: michael
"""
import pandas as pd
import pandas_profiling as pp
import numpy
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"
# Name of file that all senator data will be written to
senDataset = "senators-raw.csv"
# Name of new datafile generated
senCSVc = "SenatorsTweets-Final"
senCSVcCov = "SenatorsTweets-OnlyCov"
# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc + ".csv"
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
senSAVcPath = wd + ud + senCSV + ".sav"
senDTAcPath = wd + ud + senCSV + ".dta"
senDatasetPath = wd + di + senDataset
# forming dataframe and printing
df = pd.read_csv(senCSVPath, dtype=(object))
# forming ProfileReport and save
# as output.html file
profileAll = pp.ProfileReport(df, minimal=True)
profileAll.to_file("data/OUT/profiles/AllTweets.html")
df = pd.read_csv(senCSVcCovPath, dtype=(object))
profileAll = pp.ProfileReport(df, minimal=True)
profileAll.to_file("data/OUT/profiles/CovTweets.html")

35
repairmystupidity.py Normal file
View File

@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 14 20:47:22 2023
@author: michael
"""
import pandas as pd
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
falsch = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct.csv"
richtig = wd + ud + "SenatorsTweets-Training.csv"
correct = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct2.csv"
# Name of new datafile generated
senCSVprep = "SenatorsTweets-Training_WORKING-COPY-prepared"
# don't change this one
falsch = pd.read_csv(falsch, dtype=(object), sep=";")
richtig = pd.read_csv(richtig, dtype=(object))
df = pd.merge(falsch,richtig[['tid','rawContent', 'date']],on='tid', how='left')
df.drop(columns=['rawContent_x', 'date_x'], inplace=True)
df.rename(columns={'tid_y':'tid', 'rawContent_y':'rawContent', 'date_y':'date'}, inplace=True)
df = df[['tid','date','topicCovid','fake','rawContent','Unnamed: 6']]
df.rename(columns={'Unnamed: 6':'comment'}, inplace=True)
df.to_csv(correct, encoding='utf-8', sep=";")

613
trainFake.py Normal file
View File

@ -0,0 +1,613 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 12 12:25:18 2023
@author: michael
"""
#from datasets import load_dataset
#from transformers import Trainer
#from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
import numpy as np
from sklearn.model_selection import train_test_split # pip install scikit-learn
import pandas as pd
## Uses snippets from this guide:
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Training CSV dataset
twtCSV = "SenatorsTweets-Training_WORKING-COPY-correct2"
twtCSVtrainCovClass = "SenatorsTweets-train-CovClassification"
twtCSVtrainFakeClass = "SenatorsTweets-train-FakeClassification"
statsTrainingTopicClass = "statsTopicClassification-"
# don't change this one
twtCSVPath = wd + ud + twtCSV + ".csv"
twtCSVtrainCovClassPath = wd + ud + twtCSVtrainCovClass + ".csv"
twtCSVtrainFakeClassPath = wd + ud + twtCSVtrainFakeClass + ".csv"
statsTrainingTopicClassPath = wd + ud + statsTrainingTopicClass
twtCSVtrainCovClassPathTrain = wd + ud + twtCSVtrainCovClass + "TRAIN.csv"
twtCSVtrainFakeClassPathTrain = wd + ud + twtCSVtrainFakeClass + "TRAIN.csv"
twtTSVtrainCovClassPathTrain = wd + ud + "cov-train.tsv"
twtTSVtrainFakeClassPathTrain = wd + ud + "fake-train.tsv"
twtTSVtrainCovClassPathEval = wd + ud + "cov-eval.tsv"
twtTSVtrainFakeClassPathEval = wd + ud + "fake-eval.tsv"
seed = 12355
# Model paths
modCovClassPath = wd + "models/CovClass/"
modFakeClassPath = wd + "models/FakeClass/"
# Candidate models that were tried (rough accuracy noted where tested):
#model_name = 'digitalepidemiologylab/covid-twitter-bert-v2' # accuracy 69
#model_name = 'justinqbui/bertweet-covid19-base-uncased-pretraining-covid-vaccine-tweets' #48
#model_name = "cardiffnlp/tweet-topic-latest-multi"
#model_name = "cardiffnlp/roberta-base-tweet-topic-single-all"
# Final choice for both classifiers:
model_name = "bvrau/covid-twitter-bert-v2-struth"
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
# More models for fake detection:
# https://huggingface.co/justinqbui/bertweet-covid-vaccine-tweets-finetuned
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = 64 # max token sentence length
#%%
# Create training and testing dataset
dfTest = pd.read_csv(twtCSVPath, dtype=(object), delimiter=";")
#dfTest = dfTest[:-900] # remove last 900 rows
#dfTest = dfTest.iloc[:,:-3] # remove last 3 columns
dfTest['text'] = dfTest['rawContent'].apply(CleanTweets.preprocess_roberta)
dfTest.drop(columns=['rawContent'], inplace=True)
# Only keep tweets that are longer than 3 words
dfTest['tweet_proc_length'] = [len(text.split(' ')) for text in dfTest['text']]
dfTest['tweet_proc_length'].value_counts()
dfTest = dfTest[dfTest['tweet_proc_length']>3]
dfTest = dfTest.drop_duplicates(subset=['text'])
dfTest = dfTest.drop(columns=['date', 'Unnamed: 0'])
# Create datasets for each classification
dfCovClass = dfTest
dfFakeClass = dfTest
dfCovClass = dfCovClass.drop(columns=['fake']) # fake column not needed in the covid-topic classification data
dfFakeClass = dfFakeClass[dfFakeClass['topicCovid']=='True'].drop(columns=['topicCovid']) # topicCovid column not needed in the fake-news classification data
#type_map = {'Covid tweet': 'covid tweets', 'Noncovid tweet': 'noncovid tweet'}
dfCovClass.rename(index = str, columns={'topicCovid': 'labels', 'tid': 'id'}, inplace = True)
dfCovClass.labels = dfCovClass.labels.replace({"True": 'Covid', "False": 'NonCovid'})
#type_map = {'fake news tweet': 'fake news tweet', 'non-fake-news-tweet': 'non-fake-news-tweet'}
dfFakeClass.rename(index = str, columns={'fake': 'labels', 'tid': 'id'}, inplace = True)
#%%
# Tokenize tweets
dfCovClass = dfCovClass[dfCovClass['labels'].notna()]
dfFakeClass['labels'].replace({'Check': '','check': '', 'FALSE':''}, inplace=True)
dfFakeClass = dfFakeClass[dfFakeClass['labels'].notna()]
dfCovClass['input_ids'] = dfCovClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
dfFakeClass['input_ids'] = dfFakeClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
def encode_labels(label):
if label == 'Covid':
return 1
elif label == 'NonCovid':
return 0
elif label == 'False':
return 1
elif label == 'True':
return 0
return 0
dfCovClass['labels_encoded'] = dfCovClass['labels'].apply(encode_labels)
dfFakeClass['labels_encoded'] = dfFakeClass['labels'].apply(encode_labels)
dfFakeClass = dfFakeClass[dfFakeClass['labels']!=""]
#dfFakeClass = dfFakeClass[(dfFakeClass['labels']=="Fake") | (dfFakeClass['labels']=="True")]
# get n of classes
print("# of Non-Covid tweets (coded 0):")
print(dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
# 62 non-covid tweets, disproportionate sample for training has to be 124 tweets
print("# of Fake-news tweets (coded 1):")
print(dfFakeClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
# create disproportionate sample - 50/50 of both
#dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique()
#dfCovClass = dfCovClass.groupby('labels_encoded', group_keys=False).apply(lambda x: x.sample(164, random_state=seed))
# After a lot of tests, it seems that a sample in which non-fake-news tweets are overrepresented leads to better results.
# Because of this, and because of performance limitations and time constraints, group 1 (covid topic) will be overrepresented (twice as many), which still doesn't reflect the real proportions of roughly 10:1.
'''dfCovClassa = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(1).sample(frac=1, replace=True).reset_index()
dfCovClassb = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(0).sample(frac=1, replace=True).reset_index()
dfCovClassab= pd.concat([dfCovClassa,dfCovClassb])
dfCovClassab.reset_index(inplace=True)
dfCovClass_train, dfCovClass_test = train_test_split(dfCovClassab, test_size=0.1, random_state=seed, stratify=dfCovClassab['labels_encoded'])
'''
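# Hedged sketch (illustrative only, not used below): an explicit 50/50 training
# sample for the fake-news classifier could be built by oversampling the
# smaller class of dfFakeClass with replacement; `dfBalanced` is an assumed
# name and is not referenced by the rest of the script.
n_max = dfFakeClass['labels_encoded'].value_counts().max()
dfBalanced = (dfFakeClass.groupby('labels_encoded', group_keys=False)
              .apply(lambda g: g.sample(n_max, replace=True, random_state=seed)))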
# create training and validation samples
dfFakeClass_train, dfFakeClass_test = train_test_split(dfFakeClass, test_size=0.1, random_state=seed, stratify=dfFakeClass['labels_encoded'])
# reset index and drop unnecessary columns
dfFakeClass_train.reset_index(drop=True, inplace=True)
dfFakeClass_train.drop(inplace=True, columns=['tweet_proc_length'])
dfFakeClass_train.groupby('labels_encoded', group_keys=False)['id'].nunique()
dfFakeClass_test.reset_index(drop=True, inplace=True)
dfFakeClass_test.drop(inplace=True, columns=['tweet_proc_length'])
dfFakeClass_test.groupby('labels_encoded', group_keys=False)['id'].nunique()
# save dfs as csvs and tsvs, for training and validation
# covid classification datafiles
# rows 0-41 = noncovid, 42-81 covid, therefore:
#dfCovClass = dfCovClass.drop(columns=['tweet_proc_length'])
#dfCovClass.reset_index(inplace=True, drop=True)
#dfCovClass.loc[np.r_[0:31, 42:71], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPathTrain, encoding='utf-8', sep=";")
#dfCovClass.loc[np.r_[0:31, 42:72], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathTrain, encoding='utf-8', sep="\t")
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPath, encoding='utf-8', sep=";")
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathEval, encoding='utf-8', sep="\t")
# fake news classification datafiles
#dfFakeClass = dfFakeClass.drop(columns=['tweet_proc_length'])
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPathTrain, encoding='utf-8', sep=";")
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathTrain, encoding='utf-8', sep="\t")
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPath, encoding='utf-8', sep=";")
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathEval, encoding='utf-8', sep="\t")
#%%
# Prepare trainer
#from transformers import TrainingArguments
#training_args = TrainingArguments(
# report_to = 'wandb',
# output_dir=wd+'results', # output directory/
# overwrite_output_dir = True,
# num_train_epochs=6, # total number of training epochs
# per_device_train_batch_size=8, # batch size per device during training
# per_device_eval_batch_size=16, # batch size for evaluation
# learning_rate=2e-5,
# warmup_steps=1000, # number of warmup steps for learning rate scheduler
# weight_decay=0.01, # strength of weight decay
# logging_dir='./logs3', # directory for storing logs
# logging_steps=1000,
# evaluation_strategy="epoch",
# save_strategy="epoch",
# load_best_model_at_end=True
#)
tokenizer = AutoTokenizer.from_pretrained(model_name)
from transformers import BertForSequenceClassification, AdamW#, BertConfig
#from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
"""
train_dataset = load_dataset('csv', data_files={'train': twtCSVtrainCovClassPathTrain}, encoding = "utf-8")
train_dataset = train_dataset['train']
eval_dataset = load_dataset('csv', data_files={'test': twtCSVtrainCovClassPath}, encoding = "utf-8")
eval_dataset = eval_dataset['test']
"""
batch_size = 1
from torch.utils.data import Dataset
class PandasDataset(Dataset):
def __init__(self, dataframe, tokenizer, max_length):
self.dataframe = dataframe
self.tokenizer = tokenizer
self.max_length = max_length
def __len__(self):
return len(self.dataframe)
def __getitem__(self, index):
row = self.dataframe.iloc[index]
text = row['text']
labels = row['labels_encoded']
encoded = self.tokenizer(text, max_length=self.max_length, padding="max_length", truncation=True)
input_ids = torch.tensor(encoded['input_ids'])
attention_mask = torch.tensor(encoded['attention_mask'])
return {
'input_ids': input_ids,
'attention_mask': attention_mask,
'labels': torch.tensor(labels) # Assuming labels are already encoded
}
train_dataset = PandasDataset(dfFakeClass_train, tokenizer, max_length)
train_dataloader = DataLoader(
train_dataset,
sampler=RandomSampler(train_dataset),
batch_size=batch_size
)
eval_dataset = PandasDataset(dfFakeClass_test, tokenizer, max_length)
validation_dataloader = DataLoader(
eval_dataset,
sampler=SequentialSampler(eval_dataset),
batch_size=batch_size
)
for idx, batch in enumerate(train_dataloader):
print('Batch index: ', idx)
print('Batch size: ', batch['input_ids'].size()) # Access 'input_ids' field
print('Batch label: ', batch['labels']) # Access 'labels' field
break
model = BertForSequenceClassification.from_pretrained(
model_name,
num_labels = 2, # The number of output labels -- 2 for binary classification.
# You can increase this for multi-class tasks.
output_attentions = False, # Whether the model returns attention weights.
output_hidden_states = False, # Whether the model returns all hidden states.
)
#trainer = Trainer(
# model=model, # the instantiated 🤗 Transformers model to be trained
# args=training_args, # training arguments, defined above
# train_dataset=train_dataset, # training dataset
# eval_dataset=eval_dataset # evaluation dataset
#)
# Note: AdamW is a class from the huggingface library (as opposed to pytorch).
# I believe the 'W' stands for 'Weight Decay fix'.
optimizer = AdamW(model.parameters(),
lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
eps = 1e-8 # args.adam_epsilon - default is 1e-8.
)
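# Hedged alternative (depends on the installed transformers version): newer
# releases deprecate transformers.AdamW in favour of torch's own implementation,
# which would be a drop-in replacement here:
# from torch.optim import AdamW
# optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)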
from transformers import get_linear_schedule_with_warmup
# Number of training epochs. The BERT authors recommend between 2 and 4.
# We chose to run for 6
epochs = 6
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps = 0, # Default value in run_glue.py
num_training_steps = total_steps)
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
pred_flat = np.argmax(preds, axis=1).flatten()
labels_flat = labels.flatten()
return np.sum(pred_flat == labels_flat) / len(labels_flat)
import time
import datetime
def format_time(elapsed):
'''
Takes a time in seconds and returns a string hh:mm:ss
'''
# Round to the nearest second.
elapsed_rounded = int(round((elapsed)))
# Format as hh:mm:ss
return str(datetime.timedelta(seconds=elapsed_rounded))
import random
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
# Set the seed value all over the place to make this reproducible.
seed_val = 12355
# If there's a GPU available...
if torch.cuda.is_available():
# Tell PyTorch to use the GPU.
device = torch.device("cuda")
print('There are %d GPU(s) available.' % torch.cuda.device_count())
print('We will use the GPU:', torch.cuda.get_device_name(0))
#model.cuda()
# If not...
else:
print('No GPU available, using the CPU instead.')
device = torch.device("cpu")
device = torch.device("cpu")  # force CPU for this run; the GPU path above is left disabled (model.cuda() is commented out)
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
#%%
# Start training
# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings.
training_stats = []
# Measure the total training time for the whole run.
total_t0 = time.time()
# For each epoch...
for epoch_i in range(0, epochs):
# ========================================
# Training
# ========================================
# Perform one full pass over the training set.
print("")
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
print('{:>5,} batches per epoch will be processed.'.format(len(train_dataloader)))
print('Training...')
# Measure how long the training epoch takes.
t0 = time.time()
model.to(device)
# Reset the total loss for this epoch.
total_train_loss = 0
# Put the model into training mode. Don't be misled--the call to
# `train` just changes the *mode*, it doesn't *perform* the training.
# `dropout` and `batchnorm` layers behave differently during training
# vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
model.train()
# For each batch of training data...
for step, batch in enumerate(train_dataloader):
# Progress update every 10 batches.
if step % 10 == 0 and not step == 0:
# Calculate elapsed time in minutes.
elapsed = format_time(time.time() - t0)
# Report progress.
print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
# Unpack this training batch from our dataloader.
#
# As we unpack the batch, we'll also copy each tensor to the GPU using the
# `to` method.
#
# `batch` contains three pytorch tensors:
# [0]: input ids
# [1]: attention masks
# [2]: labels
print("Batch keys:", batch.keys())
b_input_ids = batch['input_ids'].to(device)
b_input_mask = batch['attention_mask'].to(device)
b_labels = batch['labels'].to(device)
# Always clear any previously calculated gradients before performing a
# backward pass. PyTorch doesn't do this automatically because
# accumulating the gradients is "convenient while training RNNs".
# (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
model.zero_grad()
# Perform a forward pass (evaluate the model on this training batch).
# The documentation for this `model` function is here:
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
# It returns different numbers of values depending on what arguments
# are given and what flags are set. For our usage here, it returns
# the loss (because we provided labels) and the "logits"--the model
# outputs prior to activation.
output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
loss = output[0]
logits = output[1]
# Accumulate the training loss over all of the batches so that we can
# calculate the average loss at the end. `loss` is a Tensor containing a
# single value; the `.item()` function just returns the Python value
# from the tensor.
total_train_loss += loss.item()
# Perform a backward pass to calculate the gradients.
loss.backward()
# Clip the norm of the gradients to 1.0.
# This is to help prevent the "exploding gradients" problem.
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# Update parameters and take a step using the computed gradient.
# The optimizer dictates the "update rule"--how the parameters are
# modified based on their gradients, the learning rate, etc.
optimizer.step()
# Update the learning rate.
scheduler.step()
# Calculate the average loss over all of the batches.
avg_train_loss = total_train_loss / len(train_dataloader)
# Measure how long this epoch took.
training_time = format_time(time.time() - t0)
print("")
print(" Average training loss: {0:.2f}".format(avg_train_loss))
print(" Training epcoh took: {:}".format(training_time))
# ========================================
# Validation
# ========================================
# After the completion of each training epoch, measure our performance on
# our validation set.
print("")
print("Running Validation...")
t0 = time.time()
# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()
# Tracking variables
total_eval_accuracy = 0
total_eval_loss = 0
nb_eval_steps = 0
# Evaluate data for one epoch
for batch in validation_dataloader:
# Unpack this training batch from our dataloader.
#
# As we unpack the batch, we'll also copy each tensor to the GPU using
# the `to` method.
#
# `batch` contains three pytorch tensors:
# [0]: input ids
# [1]: attention masks
# [2]: labels
b_input_ids = batch['input_ids'].to(device)
b_input_mask = batch['attention_mask'].to(device)
b_labels = batch['labels'].to(device)
# Tell pytorch not to bother with constructing the compute graph during
# the forward pass, since this is only needed for backprop (training).
with torch.no_grad():
# Forward pass, calculate logit predictions.
# token_type_ids is the same as the "segment ids", which
# differentiates sentence 1 and 2 in 2-sentence tasks.
# The documentation for this `model` function is here:
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
# Get the "logits" output by the model. The "logits" are the output
# values prior to applying an activation function like the softmax.
output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
loss = output[0]
logits = output[1]
# Accumulate the validation loss.
total_eval_loss += loss.item()
# Move logits and labels to CPU
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
# Calculate the accuracy for this batch of test sentences, and
# accumulate it over all batches.
total_eval_accuracy += flat_accuracy(logits, label_ids)
# Report the final accuracy for this validation run.
avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
print(" Accuracy: {0:.2f}".format(avg_val_accuracy))
# Calculate the average loss over all of the batches.
avg_val_loss = total_eval_loss / len(validation_dataloader)
# Measure how long the validation run took.
validation_time = format_time(time.time() - t0)
print(" Validation Loss: {0:.2f}".format(avg_val_loss))
print(" Validation took: {:}".format(validation_time))
# Record all statistics from this epoch.
training_stats.append(
{
'epoch': epoch_i + 1,
'Training Loss': avg_train_loss,
'Valid. Loss': avg_val_loss,
'Valid. Accur.': avg_val_accuracy,
'Training Time': training_time,
'Validation Time': validation_time
}
)
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
import os
# Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
from datetime import datetime as dt
fTimeFormat = "%Y-%m-%d_%H-%M-%S"
now = dt.now().strftime(fTimeFormat)
output_dir = modFakeClassPath + now + "/"
# Create output directory if needed
if not os.path.exists(output_dir):
os.makedirs(output_dir)
print("Saving model to %s" % output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))
import pandas as pd
# Display floats with two decimal places.
pd.set_option('display.precision', 2)
# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)
# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')
# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])
# Display the table.
df_stats
df_stats.to_csv(output_dir + now + ".csv")
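# Hedged reload sketch (assumption, not in the original script): the model and
# tokenizer saved above can later be restored from `output_dir` for inference:
# from transformers import BertForSequenceClassification, AutoTokenizer
# reloaded_model = BertForSequenceClassification.from_pretrained(output_dir)
# reloaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)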

607
trainTopic.py Normal file

@@ -0,0 +1,607 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 12 12:25:18 2023
@author: michael
"""
#from datasets import load_dataset
#from transformers import Trainer
#from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
import numpy as np
from sklearn.model_selection import train_test_split # pip install scikit-learn
import pandas as pd
## Uses snippets from this guide:
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Training CSV dataset
twtCSV = "SenatorsTweets-Training_WORKING-COPY-correct2"
twtCSVtrainCovClass = "SenatorsTweets-train-CovClassification"
twtCSVtrainFakeClass = "SenatorsTweets-train-FakeClassification"
statsTrainingTopicClass = "statsTopicClassification-"
# don't change this one
twtCSVPath = wd + ud + twtCSV + ".csv"
twtCSVtrainCovClassPath = wd + ud + twtCSVtrainCovClass + ".csv"
twtCSVtrainFakeClassPath = wd + ud + twtCSVtrainFakeClass + ".csv"
statsTrainingTopicClassPath = wd + ud + statsTrainingTopicClass
twtCSVtrainCovClassPathTrain = wd + ud + twtCSVtrainCovClass + "TRAIN.csv"
twtCSVtrainFakeClassPathTrain = wd + ud + twtCSVtrainFakeClass + "TRAIN.csv"
twtTSVtrainCovClassPathTrain = wd + ud + "cov-train.tsv"
twtTSVtrainFakeClassPathTrain = wd + ud + "fake-train.tsv"
twtTSVtrainCovClassPathEval = wd + ud + "cov-eval.tsv"
twtTSVtrainFakeClassPathEval = wd + ud + "fake-eval.tsv"
seed = 12355
# Model paths
modCovClassPath = wd + "models/CovClass/"
modFakeClassPath = wd + "models/FakeClass/"
model_name = "bvrau/covid-twitter-bert-v2-struth"
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
# More models for fake detection:
# https://huggingface.co/justinqbui/bertweet-covid-vaccine-tweets-finetuned
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = 64 # max token sentence length
#%%
# Create training and testing dataset
dfTest = pd.read_csv(twtCSVPath, dtype=(object), delimiter=";")
#dfTest = dfTest[:-900] # remove last 900 rows
#dfTest = dfTest.iloc[:,:-3] # remove last 3 columns
dfTest['text'] = dfTest['rawContent'].apply(CleanTweets.preprocess_roberta)
dfTest.drop(columns=['rawContent'], inplace=True)
# Only keep tweets that are longer than 3 words
dfTest['tweet_proc_length'] = [len(text.split(' ')) for text in dfTest['text']]
dfTest['tweet_proc_length'].value_counts()
dfTest = dfTest[dfTest['tweet_proc_length']>3]
dfTest = dfTest.drop_duplicates(subset=['text'])
dfTest = dfTest.drop(columns=['date', 'Unnamed: 0'])
# Create datasets for each classification
dfCovClass = dfTest
dfFakeClass = dfTest
dfCovClass = dfCovClass.drop(columns=['fake']) # fake column not needed in the covid-topic classification data
dfFakeClass = dfFakeClass[dfFakeClass['topicCovid']=='True'].drop(columns=['topicCovid']) # topicCovid column not needed in the fake-news classification data
#type_map = {'Covid tweet': 'covid tweets', 'Noncovid tweet': 'noncovid tweet'}
dfCovClass.rename(index = str, columns={'topicCovid': 'labels', 'tid': 'id'}, inplace = True)
dfCovClass.labels = dfCovClass.labels.replace({"True": 'Covid', "False": 'NonCovid'})
#type_map = {'fake news tweet': 'fake news tweet', 'non-fake-news-tweet': 'non-fake-news-tweet'}
dfFakeClass.rename(index = str, columns={'fake': 'labels', 'tid': 'id'}, inplace = True)
dfFakeClass.labels = dfFakeClass.labels.replace({"True": 'Fake', "False": 'True'})
#%%
# Tokenize tweets
dfCovClass = dfCovClass[dfCovClass['labels'].notna()]
dfFakeClass = dfFakeClass[dfFakeClass['labels'].notna()]
dfCovClass['input_ids'] = dfCovClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
dfFakeClass['input_ids'] = dfFakeClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
def encode_labels(label):
if label == 'Covid':
return 1
elif label == 'NonCovid':
return 0
elif label == 'Fake':
return 1
elif label == 'True':
return 0
return 0
dfCovClass['labels_encoded'] = dfCovClass['labels'].apply(encode_labels)
dfFakeClass['labels_encoded'] = dfFakeClass['labels'].apply(encode_labels)
# get n of classes
print("# of Non-Covid tweets (coded 0):")
print(dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
# 62 non-covid tweets, disproportionate sample for training has to be 124 tweets
print("# of Fake-news tweets (coded 1):")
print(dfFakeClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
# create disproportionate sample - 50/50 of both
#dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique()
#dfCovClass = dfCovClass.groupby('labels_encoded', group_keys=False).apply(lambda x: x.sample(164, random_state=seed))
# After a lot of tests, it seems that a sample in which non-fake-news tweets are overrepresented leads to better results.
# Because of this, and because of performance limitations and time constraints, group 1 (covid topic) will be overrepresented (twice as many), which still doesn't reflect the real proportions of roughly 10:1.
'''dfCovClassa = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(1).sample(frac=1, replace=True).reset_index()
dfCovClassb = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(0).sample(frac=1, replace=True).reset_index()
dfCovClassab= pd.concat([dfCovClassa,dfCovClassb])
dfCovClassab.reset_index(inplace=True)
dfCovClass_train, dfCovClass_test = train_test_split(dfCovClassab, test_size=0.1, random_state=seed, stratify=dfCovClassab['labels_encoded'])
'''
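# Hedged sketch (illustrative only, not used below): the 2:1 overrepresentation
# of covid-topic tweets described above could be made explicit by resampling
# each class, drawing the covid group at twice the size of the non-covid group;
# `dfTopicBalanced` is an assumed name and is not referenced below.
n_noncov = (dfCovClass['labels_encoded'] == 0).sum()
dfTopicBalanced = pd.concat([
    dfCovClass[dfCovClass['labels_encoded'] == 0].sample(n_noncov, replace=True, random_state=seed),
    dfCovClass[dfCovClass['labels_encoded'] == 1].sample(2 * n_noncov, replace=True, random_state=seed),
]).reset_index(drop=True)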
# create training and validation samples
dfCovClass_train, dfCovClass_test = train_test_split(dfCovClass, test_size=0.1, random_state=seed, stratify=dfCovClass['labels_encoded'])
# reset index and drop unnecessary columns
dfCovClass_train.reset_index(drop=True, inplace=True)
dfCovClass_train.drop(inplace=True, columns=['tweet_proc_length'])
dfCovClass_train.groupby('labels_encoded', group_keys=False)['id'].nunique()
dfCovClass_test.reset_index(drop=True, inplace=True)
dfCovClass_test.drop(inplace=True, columns=['tweet_proc_length'])
dfCovClass_test.groupby('labels_encoded', group_keys=False)['id'].nunique()
# save dfs as csvs and tsvs, for training and validation
# covid classification datafiles
# rows 0-41 = noncovid, 42-81 covid, therefore:
#dfCovClass = dfCovClass.drop(columns=['tweet_proc_length'])
#dfCovClass.reset_index(inplace=True, drop=True)
#dfCovClass.loc[np.r_[0:31, 42:71], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPathTrain, encoding='utf-8', sep=";")
#dfCovClass.loc[np.r_[0:31, 42:72], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathTrain, encoding='utf-8', sep="\t")
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPath, encoding='utf-8', sep=";")
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathEval, encoding='utf-8', sep="\t")
# fake news classification datafiles
#dfFakeClass = dfFakeClass.drop(columns=['tweet_proc_length'])
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPathTrain, encoding='utf-8', sep=";")
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathTrain, encoding='utf-8', sep="\t")
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPath, encoding='utf-8', sep=";")
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathEval, encoding='utf-8', sep="\t")
#%%
# Prepare trainer
#from transformers import TrainingArguments
#training_args = TrainingArguments(
# report_to = 'wandb',
# output_dir=wd+'results', # output directory/
# overwrite_output_dir = True,
# num_train_epochs=6, # total number of training epochs
# per_device_train_batch_size=8, # batch size per device during training
# per_device_eval_batch_size=16, # batch size for evaluation
# learning_rate=2e-5,
# warmup_steps=1000, # number of warmup steps for learning rate scheduler
# weight_decay=0.01, # strength of weight decay
# logging_dir='./logs3', # directory for storing logs
# logging_steps=1000,
# evaluation_strategy="epoch",
# save_strategy="epoch",
# load_best_model_at_end=True
#)
tokenizer = AutoTokenizer.from_pretrained(model_name)
from transformers import BertForSequenceClassification, AdamW#, BertConfig
#from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
"""
train_dataset = load_dataset('csv', data_files={'train': twtCSVtrainCovClassPathTrain}, encoding = "utf-8")
train_dataset = train_dataset['train']
eval_dataset = load_dataset('csv', data_files={'test': twtCSVtrainCovClassPath}, encoding = "utf-8")
eval_dataset = eval_dataset['test']
"""
batch_size = 1
from torch.utils.data import Dataset
class PandasDataset(Dataset):
def __init__(self, dataframe, tokenizer, max_length):
self.dataframe = dataframe
self.tokenizer = tokenizer
self.max_length = max_length
def __len__(self):
return len(self.dataframe)
def __getitem__(self, index):
row = self.dataframe.iloc[index]
text = row['text']
labels = row['labels_encoded']
encoded = self.tokenizer(text, max_length=self.max_length, padding="max_length", truncation=True)
input_ids = torch.tensor(encoded['input_ids'])
attention_mask = torch.tensor(encoded['attention_mask'])
return {
'input_ids': input_ids,
'attention_mask': attention_mask,
'labels': torch.tensor(labels) # Assuming labels are already encoded
}
train_dataset = PandasDataset(dfCovClass_train, tokenizer, max_length)
train_dataloader = DataLoader(
train_dataset,
sampler=RandomSampler(train_dataset),
batch_size=batch_size
)
eval_dataset = PandasDataset(dfCovClass_test, tokenizer, max_length)
validation_dataloader = DataLoader(
eval_dataset,
sampler=SequentialSampler(eval_dataset),
batch_size=batch_size
)
for idx, batch in enumerate(train_dataloader):
print('Batch index: ', idx)
print('Batch size: ', batch['input_ids'].size()) # Access 'input_ids' field
print('Batch label: ', batch['labels']) # Access 'labels' field
break
model = BertForSequenceClassification.from_pretrained(
model_name,
num_labels = 2, # The number of output labels -- 2 for binary classification.
# You can increase this for multi-class tasks.
output_attentions = False, # Whether the model returns attention weights.
output_hidden_states = False, # Whether the model returns all hidden states.
)
#trainer = Trainer(
# model=model, # the instantiated 🤗 Transformers model to be trained
# args=training_args, # training arguments, defined above
# train_dataset=train_dataset, # training dataset
# eval_dataset=eval_dataset # evaluation dataset
#)
# Note: AdamW is a class from the huggingface library (as opposed to pytorch).
# I believe the 'W' stands for 'Weight Decay fix'.
optimizer = AdamW(model.parameters(),
lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
eps = 1e-8 # args.adam_epsilon - default is 1e-8.
)
from transformers import get_linear_schedule_with_warmup
# Number of training epochs. The BERT authors recommend between 2 and 4.
# We chose to run for 6
epochs = 6
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps = 0, # Default value in run_glue.py
num_training_steps = total_steps)
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
pred_flat = np.argmax(preds, axis=1).flatten()
labels_flat = labels.flatten()
return np.sum(pred_flat == labels_flat) / len(labels_flat)
import time
import datetime
def format_time(elapsed):
'''
Takes a time in seconds and returns a string hh:mm:ss
'''
# Round to the nearest second.
elapsed_rounded = int(round((elapsed)))
# Format as hh:mm:ss
return str(datetime.timedelta(seconds=elapsed_rounded))
import random
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
# Set the seed value all over the place to make this reproducible.
seed_val = 12355
# If there's a GPU available...
if torch.cuda.is_available():
# Tell PyTorch to use the GPU.
device = torch.device("cuda")
print('There are %d GPU(s) available.' % torch.cuda.device_count())
print('We will use the GPU:', torch.cuda.get_device_name(0))
#model.cuda()
# If not...
else:
print('No GPU available, using the CPU instead.')
device = torch.device("cpu")
device = torch.device("cpu")  # force CPU for this run; the GPU path above is left disabled (model.cuda() is commented out)
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
#%%
# Start training
# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings.
training_stats = []
# Measure the total training time for the whole run.
total_t0 = time.time()
# For each epoch...
for epoch_i in range(0, epochs):
# ========================================
# Training
# ========================================
# Perform one full pass over the training set.
print("")
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
print('{:>5,} batches per epoch will be processed.'.format(len(train_dataloader)))
print('Training...')
# Measure how long the training epoch takes.
t0 = time.time()
model.to(device)
# Reset the total loss for this epoch.
total_train_loss = 0
# Put the model into training mode. Don't be misled--the call to
# `train` just changes the *mode*, it doesn't *perform* the training.
# `dropout` and `batchnorm` layers behave differently during training
# vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
model.train()
# For each batch of training data...
for step, batch in enumerate(train_dataloader):
# Progress update every 10 batches.
if step % 10 == 0 and not step == 0:
# Calculate elapsed time in minutes.
elapsed = format_time(time.time() - t0)
# Report progress.
print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
# Unpack this training batch from our dataloader.
#
# As we unpack the batch, we'll also copy each tensor to the GPU using the
# `to` method.
#
# `batch` contains three pytorch tensors:
# [0]: input ids
# [1]: attention masks
# [2]: labels
print("Batch keys:", batch.keys())
b_input_ids = batch['input_ids'].to(device)
b_input_mask = batch['attention_mask'].to(device)
b_labels = batch['labels'].to(device)
# Always clear any previously calculated gradients before performing a
# backward pass. PyTorch doesn't do this automatically because
# accumulating the gradients is "convenient while training RNNs".
# (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
model.zero_grad()
# Perform a forward pass (evaluate the model on this training batch).
# The documentation for this `model` function is here:
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
# It returns different numbers of values depending on what arguments
# are given and what flags are set. For our usage here, it returns
# the loss (because we provided labels) and the "logits"--the model
# outputs prior to activation.
output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
loss = output[0]
logits = output[1]
# Accumulate the training loss over all of the batches so that we can
# calculate the average loss at the end. `loss` is a Tensor containing a
# single value; the `.item()` function just returns the Python value
# from the tensor.
total_train_loss += loss.item()
# Perform a backward pass to calculate the gradients.
loss.backward()
# Clip the norm of the gradients to 1.0.
# This is to help prevent the "exploding gradients" problem.
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# Update parameters and take a step using the computed gradient.
# The optimizer dictates the "update rule"--how the parameters are
# modified based on their gradients, the learning rate, etc.
optimizer.step()
# Update the learning rate.
scheduler.step()
# Calculate the average loss over all of the batches.
avg_train_loss = total_train_loss / len(train_dataloader)
# Measure how long this epoch took.
training_time = format_time(time.time() - t0)
print("")
print(" Average training loss: {0:.2f}".format(avg_train_loss))
print(" Training epcoh took: {:}".format(training_time))
# ========================================
# Validation
# ========================================
# After the completion of each training epoch, measure our performance on
# our validation set.
print("")
print("Running Validation...")
t0 = time.time()
# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()
# Tracking variables
total_eval_accuracy = 0
total_eval_loss = 0
nb_eval_steps = 0
# Evaluate data for one epoch
for batch in validation_dataloader:
# Unpack this training batch from our dataloader.
#
# As we unpack the batch, we'll also copy each tensor to the GPU using
# the `to` method.
#
# `batch` contains three pytorch tensors:
# [0]: input ids
# [1]: attention masks
# [2]: labels
b_input_ids = batch['input_ids'].to(device)
b_input_mask = batch['attention_mask'].to(device)
b_labels = batch['labels'].to(device)
# Tell pytorch not to bother with constructing the compute graph during
# the forward pass, since this is only needed for backprop (training).
with torch.no_grad():
# Forward pass, calculate logit predictions.
# token_type_ids is the same as the "segment ids", which
# differentiates sentence 1 and 2 in 2-sentence tasks.
# The documentation for this `model` function is here:
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
# Get the "logits" output by the model. The "logits" are the output
# values prior to applying an activation function like the softmax.
output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
loss = output[0]
logits = output[1]
# Accumulate the validation loss.
total_eval_loss += loss.item()
# Move logits and labels to CPU
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
# Calculate the accuracy for this batch of test sentences, and
# accumulate it over all batches.
total_eval_accuracy += flat_accuracy(logits, label_ids)
# Report the final accuracy for this validation run.
avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
print(" Accuracy: {0:.2f}".format(avg_val_accuracy))
# Calculate the average loss over all of the batches.
avg_val_loss = total_eval_loss / len(validation_dataloader)
# Measure how long the validation run took.
validation_time = format_time(time.time() - t0)
print(" Validation Loss: {0:.2f}".format(avg_val_loss))
print(" Validation took: {:}".format(validation_time))
# Record all statistics from this epoch.
training_stats.append(
{
'epoch': epoch_i + 1,
'Training Loss': avg_train_loss,
'Valid. Loss': avg_val_loss,
'Valid. Accur.': avg_val_accuracy,
'Training Time': training_time,
'Validation Time': validation_time
}
)
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
import os
# Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
from datetime import datetime as dt
fTimeFormat = "%Y-%m-%d_%H-%M-%S"
now = dt.now().strftime(fTimeFormat)
output_dir = modCovClassPath + now + "/"
# Create output directory if needed
if not os.path.exists(output_dir):
os.makedirs(output_dir)
print("Saving model to %s" % output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))
import pandas as pd
# Display floats with two decimal places.
pd.set_option('display.precision', 2)
# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)
# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')
# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])
# Display the table.
df_stats
df_stats.to_csv(output_dir + now + ".csv")
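# Hedged follow-up sketch (assumption, not in the original script): the saved
# statistics CSV can be plotted to inspect the loss curves across epochs;
# matplotlib is assumed to be installed.
# import matplotlib.pyplot as plt
# df_plot = pd.read_csv(output_dir + now + ".csv").set_index('epoch')
# df_plot[['Training Loss', 'Valid. Loss']].plot(marker='o')
# plt.title('Training vs. validation loss')
# plt.show()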