adds link to full package to readme

adds html files to gitignore
data/OUT/profiles/CovTweets.html gelöscht
2023-08-31 01:23:38 +02:00 · 2023-08-31 01:21:31 +02:00 · 2023-08-31 01:20:39 +02:00 · 2023-08-31 01:20:31 +02:00 · 2023-08-30 21:54:13 +02:00 · 2023-08-30 21:53:05 +02:00
29 changed files with 3023 additions and 118 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,6 +2,8 @@
 **/*lock*
 **/*-slice*.csv
 **/*.zip
+**/*.html
+**/*.htm
 /ALL-SENATORS-LONG.csv
 /ALL-SENATORS.csv
 /collect2.py
--- a/.vscode/.gitignore
+++ b/.vscode/.gitignore
@ -0,0 +1 @@
+/settings.json
--- a/ClassificationFake.py
+++ b/ClassificationFake.py
@ -0,0 +1,123 @@
+import numpy as np
+import pandas as pd
+from datetime import datetime
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
+from datasets import load_dataset
+from transformers.pipelines.pt_utils import KeyDataset
+
+
+#%%
+# prepare & define paths
+# install xformers (pip install xformers) for better performance
+###################
+# Setup directories
+# WD Michael
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# datafile input directory
+di = "data/IN/"
+
+# Tweet-datafile output directory
+ud = "data/OUT/"
+
+# Name of file that all senator data will be written to
+senCSV = "Tweets-Classified-Topic-Results.csv"
+
+# Name of Classify datafile
+senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"
+senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"
+
+# don't change this one
+senCSVPath = wd + ud + senCSV
+senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
+senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
+
+import sys
+funs = wd+"funs"
+sys.path.insert(1, funs)
+import CleanTweets
+
+
+#%%
+# get dataframe
+dfClassify = pd.read_csv(senCSVPath, dtype=(object))
+def encode_labels(label):
+    """Swap the string labels 'True' and 'False'.
+
+    Any other value (e.g. a missing label) is mapped to the integer 0.
+    """
+    if label == 'True':
+        flipped = 'False'
+    else:
+        flipped = 'True' if label == 'False' else 0
+    return flipped
+# Invert the topic labels written by ClassificationTopic.py and store the
+# corrected results back into the same file.
+# NOTE(review): the swap is not idempotent - every run of this cell flips the
+# labels in the results file again; make sure the file is in its original
+# state before re-running.
+dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
+# use the configured path instead of a second hard-coded copy of it, so the
+# script keeps working when wd is switched to the server directory
+dfClassify.to_csv(senCSVPath, encoding='utf-8')
+
+# keep only rows whose topic label is 'True'; copy so that the column
+# assignments below operate on an independent frame instead of a slice
+dfClassify = dfClassify[dfClassify['output_label_topicCov']=='True'].copy()
+
+# placeholder column, initialised to False for every row
+dfClassify['fake'] = False
+
+
+#%%
+# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
+# HowTo:
+# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
+# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
+pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
+model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
+tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
+
+# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
+
+dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
+
+
+#%%
+# remove empty rows
+dfClassify.cleanContent.replace('',np.nan,inplace=True)
+dfClassify.dropna(subset=['cleanContent'], inplace=True)
+
+#%%
+timeStart = datetime.now() # start counting execution time
+
+max_length = 128
+dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
+#train.rename(columns={'target': 'labels'}, inplace=True)
+#train.head()
+
+# %%
+dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
+
+#%%
+dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
+
+# %%
+
+#from tqdm.auto import tqdm
+#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
+#    print(out)
+
+#%% 
+output_labels = []
+output_score = []
+for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
+    output_labels.append(out['label'])
+    output_score.append(out['score'])
+    # [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
+    # Exactly the same output as before, but the content are passed
+    # as batches to the model
+# %%
+# attach the pipeline output to the dataframe
+dfClassify['output_label_fake'] = output_labels
+dfClassify['output_score_fake'] = output_score
+
+timeEnd = datetime.now()
+timeTotal = timeEnd - timeStart
+# average over the tweets actually classified (was a hard-coded 96);
+# max(..., 1) avoids a ZeroDivisionError when nothing was classified
+timePerTweet = timeTotal / max(len(output_labels), 1)
+
+# timeTotal is a timedelta, so convert it explicitly to match the "seconds" unit
+print(f"Total classification execution time: {timeTotal.total_seconds():.2f} seconds")
+print(f"Time per tweet classification: {timePerTweet}")
+
+# %%
+dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
+
+# %%
--- a/ClassificationTopic.py
+++ b/ClassificationTopic.py
@ -0,0 +1,123 @@
+import numpy as np
+import pandas as pd
+from datetime import datetime
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
+from datasets import load_dataset
+from transformers.pipelines.pt_utils import KeyDataset
+
+
+#%%
+# prepare & define paths
+# install xformers (pip install xformers) for better performance
+###################
+# Setup directories
+# WD Michael
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# datafile input directory
+di = "data/IN/"
+
+# Tweet-datafile output directory
+ud = "data/OUT/"
+
+# Name of file that all senator data will be written to
+senCSV = "SenatorsTweets-OnlyCov.csv"
+
+# Name of Classify datafile
+senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv"
+senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv"
+
+# don't change this one
+senCSVPath = wd + ud + senCSV
+senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
+senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
+
+import sys
+funs = wd+"funs"
+sys.path.insert(1, funs)
+import CleanTweets
+
+
+#%%
+# get dataframe
+dfClassify = pd.read_csv(senCSVPath, dtype=(object))
+
+# dataframe from csv
+dfClassify['fake'] = False
+
+
+#%%
+# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
+# HowTo:
+# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
+# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
+pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
+model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
+tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
+
+# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
+
+dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
+
+#%%
+# remove empty rows
+dfClassify.cleanContent.replace('',np.nan,inplace=True)
+dfClassify.dropna(subset=['cleanContent'], inplace=True)
+
+#%%
+timeStart = datetime.now() # start counting execution time
+
+max_length = 128
+dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
+#train.rename(columns={'target': 'labels'}, inplace=True)
+#train.head()
+
+# %%
+dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
+
+#%%
+dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
+
+# %%
+
+#from tqdm.auto import tqdm
+#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
+#    print(out)
+
+#%% 
+output_labels = []
+output_score = []
+for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
+    output_labels.append(out['label'])
+    output_score.append(out['score'])
+    # [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
+    # Exactly the same output as before, but the content are passed
+    # as batches to the model
+# %%
+# attach the pipeline output to the dataframe
+dfClassify['output_label_topicCov'] = output_labels
+dfClassify['output_score_topicCov'] = output_score
+
+timeEnd = datetime.now()
+timeTotal = timeEnd - timeStart
+# average over the tweets actually classified (was a hard-coded 96);
+# max(..., 1) avoids a ZeroDivisionError when nothing was classified
+timePerTweet = timeTotal / max(len(output_labels), 1)
+
+# timeTotal is a timedelta, so convert it explicitly to match the "seconds" unit
+print(f"Total classification execution time: {timeTotal.total_seconds():.2f} seconds")
+print(f"Time per tweet classification: {timePerTweet}")
+
+# %%
+dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
+
+# %%
+## corrections
+def encode_labels(label):
+    """Translate the labels 'real'/'fake' to the strings 'True'/'False'.
+
+    Any other value is mapped to the integer 0.
+    """
+    if label == 'real':
+        encoded = 'True'
+    else:
+        encoded = 'False' if label == 'fake' else 0
+    return encoded
+dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
+dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
+# NOTE: this mapping is still inverted; ClassificationFake.py swaps 'True' and 'False' when it loads the results file
+
--- a/README.md
+++ b/README.md
@ -0,0 +1,131 @@
+# Requirements
+
+- python 3.10+
+- snscrape 0.6.2.20230321+ (see git repo in this folder)
+- transformers 4.31.0
+- numpy 1.23.5
+- pandas 2.0.3
+- scikit-learn 1.3.0
+- torch 2.0.1
+
+# About
+
+This collection of scripts scrapes tweets of US senators from 2020-01-01T00:00:00Z to 2023-01-03T00:00:00Z, scrapes account data of the senators, prepares the tweets for the training of an NLP model, and trains two models that classify (1) a tweet's topic as covid or non-covid and (2) a tweet as either a "fake news" tweet or a "non-fake news" tweet.
+Training only works with a prepared dataset in which the tweets are pre-classified.
+More info in the comments of the scripts.
+Due to time constraints, most of the code is procedurally coded and ugly but effective.
+
+# How to
+
+Tested on Ubuntu 22.04. 
+If needed, the virtual environment can be exported and sent to you.
+
+All files in the folder data/IN have to exist in order to execute the scripts.
+Execute in the following order:
+
+01 collect.py (see more for further info on scraping)
+02 collectSenData.py
+03 cleanTweets.py
+04 preTestClassification.py
+05 trainTopic.py
+06 trainFake.py
+07 ClassificationTopic.py
+08 ClassificationFake.py (reads Tweets-Classified-Topic-Results.csv, so it must run after ClassificationTopic.py)
+
+# Files & Folders
+
+Datafiles are not included in the repository but can be found in the full package that can be downloaded from [here](https://ncloud.mischbeck.de/s/T4QcMDSfYSkadYC) (password protected).
+
+```
+├── data
+│   ├── IN
+│   │   ├── counterKeywordsFinal.txt
+│   │   ├── counterKeywords.txt
+│   │   ├── keywords-raw.txt
+│   │   ├── keywords.txt
+│   │   ├── own_keywords.txt
+│   │   ├── pretest-tweets_fake.txt				contains tweet ids for pretest
+│   │   ├── pretest-tweets_not_fake.txt			contains tweet ids for pretest
+│   │   └── senators-raw.csv					senator datafile
+│   ├── OUT
+│   │   ├── ALL-SENATORS-TWEETS.csv
+│   │   ├── graphs
+│   │   │   ├── Timeline.png
+│   │   │   ├── Wordcloud-All.png
+│   │   │   └── Wordcloud-Cov.png
+│   │   ├── Pretest-Prep.csv
+│   │   ├── Pretest-Results.csv
+│   │   ├── Pretest-SENATORS-TWEETS.csv
+│   │   ├── profiles							dataset profiles
+│   │   │   ├── AllTweets.html
+│   │   │   └── CovTweets.html
+│   │   ├── SenatorsTweets-Final.csv
+│   │   ├── SenatorsTweets-OnlyCov.csv
+│   │   ├── SenatorsTweets-train-CovClassification.csv
+│   │   ├── SenatorsTweets-train-CovClassificationTRAIN.csv
+│   │   ├── SenatorsTweets-train-CovClassification.tsv
+│   │   ├── SenatorsTweets-train-FakeClassification.csv
+│   │   ├── SenatorsTweets-train-FakeClassificationTRAIN.csv
+│   │   ├── SenatorsTweets-train-FakeClassification.tsv
+│   │   ├── SenatorsTweets-Training.csv
+│   │   ├── SenatorsTweets-Training_WORKING-COPY.csv
+│   │   ├── topClass-PRETEST-Prep.csv
+│   │   ├── topClass-PRETEST-Results.csv
+│   │   ├── Tweets-All-slices.zip
+│   │   ├── Tweets-Classified-Fake-Prep.csv
+│   │   ├── Tweets-Classified-Fake-Results.csv
+│   │   ├── Tweets-Classified-Prep.csv
+│   │   ├── Tweets-Classified-Topic-Prep.csv
+│   │   ├── Tweets-Classified-Topic-Results.csv
+│   │   └── Tweets-Stub.csv
+├── funs
+│   ├── CleanTweets.py							multiple functions to clean tweet contents for NLP processing
+│   ├── ClearDupes.py							function for deletion of duplicate keywords
+│   ├── __init__.py
+│   ├── Scrape.py								scraper functions to be used for multiprocessing
+│   └── TimeSlice.py							time slice script to slice the time span in 24 slices, speeds up scraping through multiprocessing
+├── log											logs of the scraping process
+│   ├── log_2023-06-23_21-06-10_err.log
+│   ├── log_2023-06-23_21-06-10.log
+│   └── log_2023-06-23_21-06-10_missing.log
+├── models
+│   ├── CovClass								Covid tweet classification model
+│   │   └── 2023-08-15_05-56-50
+│   │       ├── 2023-08-15_05-56-50.csv			training output
+│   │       ├── config.json
+│   │       ├── pytorch_model.bin
+│   │       ├── special_tokens_map.json
+│   │       ├── tokenizer_config.json
+│   │       ├── tokenizer.json
+│   │       └── vocab.txt
+│   └── FakeClass								Fake tweet classification model
+│       └── 2023-08-15_14-35-43
+│           ├── 2023-08-15_14-35-43.csv			training output
+│           ├── config.json
+│           ├── pytorch_model.bin
+│           ├── special_tokens_map.json
+│           ├── tokenizer_config.json
+│           ├── tokenizer.json
+│           └── vocab.txt
+├── snscrape									contains snscrape 0.6.2.20230321+ git repo
+├── ClassificationFake.py						classifies tweets as fake or non-fake, saves:
+│													Tweets-Classified-Fake-Prep.csv		- prepared training dataset
+│													Tweets-Classified-Fake-Results.csv	- Tweets-Classified-Topic-Results.csv with fake classification results
+├── ClassificationTopic.py						classifies tweet topic, saves: 
+│													Tweets-Classified-Topic-Prep.csv 	- prepared training dataset
+│													Tweets-Classified-Topic-Results.csv	- SenatorsTweets-OnlyCov.csv with cov classification results
+├── cleanTweets.py								Curates keywordlists 
+│												Merges senator and tweet datasets
+│												Creates multiple datasets:
+│													SenatorsTweets-Final.csv	- all tweets with keyword columns
+│													SenatorsTweets-OnlyCov.csv	- only covid tweets, filtered by keywordlist
+│													SenatorsTweets-Training.csv	- training dataset, containing ~1800 randomly selected tweets from SenatorsTweets-OnlyCov.csv
+├── collect.py									scrapes tweets, saves to ALL-SENATORS-TWEETS.csv
+├── collectSenData.py							scrapes senator account data, saves to ALL-SENATORS.csv
+├── createGraphs.py								creates wordcloud & timeline graphs
+├── preTestClassification.py					pretest script that uses bvrau/covid-twitter-bert-v2-struth to analyze 100 preclassified tweets
+├── profiler.py									creates dataset profiles
+├── README.md									readme
+├── trainFake.py								training script for the fake tweet classification model
+└── trainTopic.py								training script for the tweet topic classification model
+```
--- a/cleanTweets.py
+++ b/cleanTweets.py
@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 26 20:36:43 2023
+
+@author: michael
+"""
+
+import pandas as pd
+# import pyreadstat
+import numpy as np
+import sys
+
+
+# Seed for training dataset generation
+seed = 86431891
+
+###################
+# Setup directories
+# WD Michael
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# datafile input directory
+di = "data/IN/"
+
+# Tweet-datafile output directory
+ud = "data/OUT/"
+
+# Name of file that all senator data will be written to
+senCSV = "ALL-SENATORS-TWEETS.csv"
+
+# Name of the senator dataset (input file, read from the input directory)
+senDataset = "senators-raw.csv"
+
+# Name of new datafile generated
+senCSVc = "SenatorsTweets-Final"
+senCSVcCov = "SenatorsTweets-OnlyCov"
+senCSVcTrain = "SenatorsTweets-Training"
+
+# don't change this one
+senCSVPath = wd + ud + senCSV
+senCSVcPath = wd + ud + senCSVc + ".csv"
+senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
+senCSVcTrainPath = wd + ud + senCSVcTrain + ".csv"
+senSAVcPath = wd + ud + senCSV + ".sav"
+senDTAcPath = wd + ud + senCSV + ".dta"
+senDatasetPath = wd + di + senDataset
+
+df = pd.read_csv(senCSVPath, dtype=(object))
+
+## Import own functions
+funs = wd+"funs"
+sys.path.insert(1, funs)
+from ClearDupes import deDupe
+
+mixed_columns = df.columns[df.nunique() != len(df)]
+print(mixed_columns)
+
+df = df.drop(columns=['user.url', 'cashtags', 'coordinates', 'hashtags', 'Unnamed: 0', 'user.verified', 'lang', 'renderedContent', 'retweetedTweet', 'sourceLabel', 'sourceUrl', 'source'], index=1)
+del df[df.columns[0]] # remove first col
+
+df['user.created'] = pd.to_datetime(df['user.created'])
+df['date'] = pd.to_datetime(df['date'])
+
+#%%
+# sort and generate id
+df = df.sort_values(by='date').reset_index() # sort df by date before generating id
+df["tid"] = df.index + 1 # create id column
+
+#%%
+# move id column to front 
+cols = list(df.columns.values) # Make a list of all of the columns in the df
+cols.pop(cols.index('tid')) # Remove id from list
+#cols.pop(cols.index('user')) # Remove id from list
+df = df[['tid']+cols] # Create new dataframe with ordered colums
+
+#%%
+###################
+# Keywords
+def _read_keywords(path):
+    """Return the lower-cased, non-empty lines of the keyword file *path*.
+
+    Blank lines are skipped on purpose: an empty keyword would add an empty
+    alternative to the '|'-joined search pattern that is built from these
+    lists, and an empty alternative matches at every position of every tweet.
+    """
+    with open(path, "r") as file:
+        stripped = (line.strip() for line in file)  # removes the newline character
+        return [keyword.lower() for keyword in stripped if keyword]
+
+
+# Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
+deDupe(f"{di}keywords-raw.txt", f"{di}keywords.txt")
+# Read own keywords first, then the raw keyword list.
+# Lower-casing makes the later search case-insensitive.
+keywords = _read_keywords(f"{di}own_keywords.txt") + _read_keywords(f"{di}keywords-raw.txt")
+
+# delete keywords ppe and china that lead to too many false positives
+removeWords = {'ppe', 'china'}
+keywords = [item for item in keywords if item not in removeWords]
+
+# write the final keyword list to file (overwrites the deDupe output above)
+with open(f"{di}keywords.txt", "w") as file:
+    file.writelines(f'{line}\n' for line in keywords)
+print("read keyword files")
+
+# counter keywords
+# converted to a set, which also drops duplicate entries
+counterKeywords = set(_read_keywords(f"{di}counterKeywords.txt"))
+with open(f"{di}counterKeywordsFinal.txt", "w") as file:
+    file.writelines(f'{line}\n' for line in counterKeywords)
+print("read keyword files")
+
+
+#%%
+# overwrite keyword column
+df['keywords'] = np.nan
+df['keywords'] = (
+    df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
+)
+df['counterKeywords'] = np.nan
+df['counterKeywords'] = (
+    df['rawContent'].str.lower().str.findall('|'.join(counterKeywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
+)
+#%%
+# create boolean contains_keyword column
+df['contains_keyword'] = True
+df['contains_counterKeyword'] = True
+mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none'
+df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask
+mask = (df['counterKeywords'].isna()) # select all values in contains_keyword == 'none'
+df.loc[mask,'contains_counterKeyword'] = False # set keywords = contains_keyword under the condition of mask
+
+#%%
+pd.Series(df["user.id"]).is_unique
+
+#%%
+# Merge Datasets
+# get senator data
+cols = [
+    "name",
+    "id",
+    "state_short",
+    "party",
+    "class",
+    "ideology",
+    "start_serving",
+    "end_serving",
+    "time_in_office",
+    "not_in_office",
+    "last_congress",
+    "vote_share",
+    "next_closest_share",
+    "election_year",
+    "twitter_handle",
+    "alt_handle",
+    "date_of_birth",
+    "female",
+    "ethnicity",
+    "edu_level",
+    "edu_information",
+    "occup_level"]
+
+dfSenA = pd.read_csv(senDatasetPath, index_col=False, sep = ",", usecols=cols).reset_index()
+dfSenB = pd.read_csv(senDatasetPath, index_col=False, sep = ",", usecols=cols).reset_index()
+
+dfSenA['alt'] = False
+dfSenB['alt'] = True
+
+dfSenA = dfSenA.rename(columns={'twitter_handle': 'user.username'})
+dfSenB = dfSenB.rename(columns={'alt_handle': 'user.username'})
+dfSenB = dfSenB.dropna(axis=0, subset=['user.username'])
+
+dfSenA['user.username'] = dfSenA['user.username'].apply(str.lower)
+dfSenB['user.username'] = dfSenB['user.username'].apply(str.lower)
+df['user.username'] = df['user.username'].apply(str.lower)
+
+dfSenAll = pd.concat([dfSenA, dfSenB]).reset_index()
+
+# %%
+# see if all senators are present in file
+dfAll = df.merge(dfSenAll, how='left',on='user.username')
+#check merge
+unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique()
+print(unique_usernames)
+# senatorisakson was dropped, is ok
+#%%
+# create covidtweets csv
+dfCov = dfAll[dfAll['contains_counterKeyword']==False]
+dfCov = dfCov[dfCov['contains_keyword']==True]
+dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
+
+#%%
+# create column with tweet length
+
+dfCov['tweetLen'] = dfCov['rawContent'].str.len().copy()
+
+# reset df index and write to id column 
+dfCov.reset_index(drop=True, inplace=True)
+
+#%%
+# Export to csv, sav and dta
+dfAll.to_csv(senCSVcPath, encoding='utf-8')
+dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id')
+# pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb 
+# =============================================================================
+# dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
+# dfAllStata = dfAll.rename(columns={'class':'class_'})
+# dfAllStata.to_stata(senDTAcPath, version=119, convert_strl=['alt'], convert_dates={'date': 'td', 'user_created': 'td'})
+# print(dfAllStata.columns)
+# ====================================================df.id.str.len().value_counts()
+# =========================
+
+# %%
+# Create training dataset
+# Draw 1800 distinct tweets from the covid subset. The seed is set right
+# before sampling so the selection is reproducible across runs.
+# (The former draft cell called the non-existent `pd.dfCov`, which raised an
+# AttributeError whenever the script was run from top to bottom.)
+np.random.seed(seed)
+dfTrain = dfCov.loc[np.random.choice(dfCov.index, 1800, replace=False)]
+# copy so that the label columns are added to an independent frame
+dfTrain = dfTrain[['tid', 'date', 'rawContent']].copy()
+# default labels, to be corrected during manual pre-classification
+dfTrain['topicCovid'] = True
+dfTrain['fake'] = False
+dfTrain.to_csv(senCSVcTrainPath, encoding='utf-8')
--- a/collect.py
+++ b/collect.py
@ -66,7 +66,6 @@ which is the final output.
 import os
 import pandas as pd
 import glob
-import time
 import sys
 from datetime import datetime
 import concurrent.futures
@ -91,7 +90,7 @@ file_alltweets = "ALL-SENATORS-TWEETS.csv"
 path_to_tweetdfs = wd + td

 # Name of logfile
-logfile = wd+"log/log_"
+logfile = f"{wd}log/log_"

 ###################
 # Define Timespan & time-format
@ -149,10 +148,12 @@ tweetDFColumns = [
 ################## do NOT change anything below this line ###################
 #############################################################################

-## Import functions
-from funs.TimeSlice import *
-from funs.ClearDupes import deDupe
-from funs.Scrape import scrapeTweets
+## Import own functions
+funs = wd+"funs"
+sys.path.insert(1, funs)
+from TimeSlice import get_Tslices
+from ClearDupes import deDupe
+from Scrape import scrapeTweets

 ################### 
 # Create logfile & log all outputs
@ -251,7 +252,7 @@ with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.log", "w"
        if file not in tweetfiles:
            fout.write(f'Missing: {file}.\n') # if file is not in tweetfiles, print error message.
        else:
-            fout.write('all slices scraped.')
+            fout.write(f'{file:<30}:all slices scraped.\n')
            
 ## Merge .csv files.
 # check if file_alltweets (previously scraped tweets that have been merged 
@ -272,6 +273,8 @@ if tweetfiles:
                fout.write(f.read())
 os.chdir(wd) # go back to wd

+################### 
+# finish logging
 # Report timing info.
 timeEndMerge = datetime.now()
 print("---")
--- a/collectSenData.py
+++ b/collectSenData.py
@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jun  23 21:49:11 2023
+
+@author: Michael
+
+collectSenData.py scrapes accounts of senators for the following data: the
+number of followers, the number of users the twitter account is following,
+and how long the twitter account has existed.
+
+# Requirements:
+    - snscrape 0.6.2.20230321+
+    - pandas 2.0+
+# IMPORTANT:
+This script uses snscrape Version 0.6.2.20230321.dev50+g0d824ab which is 
+included in 'snscrape/' as a git repository for better reproducibility. Earlier
+versions of snscrape will most likely fail to scrape all tweets because of 
+certain rate limits or other errors that may occur.
+Install snscrape from the local git repo to make sure that it matches the version used here.
+If snscrape shall be installed from the local repo, execute the following lines:
+
+import subprocess
+os.chdir('snscrape/')
+subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
+os.chdir(wd) 
+
+ 
+# How to use:
+"""
+
+import os
+import pandas as pd
+import glob
+import time
+import sys
+from datetime import datetime
+import concurrent.futures
+
+###################
+# Setup directories
+# WD Michael
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# datafile input directory
+di = "data/IN/"
+
+# Tweet-datafile output directory
+ud = "data/OUT/"
+
+# Name of file that all senator data will be written to
+senCSV = "ALL-SENATORS.csv"
+
+# don't change this one
+senCSVPath = wd + ud + senCSV
+
+# Name of logfile
+logfile = wd+"log/UserLog_"
+
+###################
+# Define Timespan & time-format
+# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
+ts_beg = "2020-01-01T00:00:00Z"  # start of scraping
+ts_end = "2023-01-03T00:00:00Z"  # end of straping
+no_slices = 24  # Number of slices / time periods.
+
+# file time format
+fTimeFormat = "%Y-%m-%d_%H-%M-%S"
+
+# Maximum tweets to be scraped by snscrape. Can be left untouched.
+maxTweets = 5000
+
+# Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
+# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
+#   get subparams just like in user where user id can be obtained by user.id 
+userDFColumns = [
+    "id",
+    "username",
+    "followersCount",
+    "friendsCount",
+    "verified",
+    "created"
+]
+
+#############################################################################
+################## do NOT change anything below this line ###################
+#############################################################################
+
+from funs.Scrape import scrapeUsers, getHandles, printHandles
+from funs.TimeSlice import convertTime
+
+
+################### 
+# Create logfile & log all outputs
+#   there are three logfile types to be found in /log.
+#   should be self explanatory.
+logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".log"
+logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".log"
+sys.stderr = open(logfileErrors, "w")
+sys.stdout = open(logfilen, "w")
+
+
+###################
+# Senator Accounts
+# Get accounts & alt-accounts from Senators-Datafile
+accounts = getHandles(di)
+
+# Print accounts to be scraped
+print(printHandles(accounts))
+
+###################
+# Scraping
+# report time:
+timeStartScrape = datetime.now()
+print("Starting scraping at:")
+print(timeStartScrape.strftime(fTimeFormat))
+print("---")
+
+# Iterate over each Twitter account using multiprocessing
+listUsers = []
+# Iterate over each Twitter account using multiprocessing
+with concurrent.futures.ProcessPoolExecutor() as executor:
+    # List to store the scraping tasks
+    tasks = []
+    for handle in accounts:
+        # Schedule the scraping task
+        task = executor.submit(
+            scrapeUsers, handle, userDFColumns 
+        )
+        tasks.append(task)
+    
+    # Wait for all tasks to complete and retrieve results
+    for task in concurrent.futures.as_completed(tasks):
+        result = task.result()
+        listUsers.append(result)
+
+dfUsers = pd.DataFrame(listUsers, columns=userDFColumns)
+dfUsers.to_csv(senCSVPath, encoding='utf-8')
+
+# report time:
+timeEndScrape = datetime.now()
+print("---")
+print("End of scraping at:")
+print(timeEndScrape.strftime(fTimeFormat))
+
+# Report timing info.
+timeEndMerge = datetime.now()
+print("---")
+print("End of scraping at:")
+print(timeEndMerge.strftime(fTimeFormat))
+print("---")
+# calculate times:
+tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # total execution time
+tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) # scraping time
+tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) # merge time
+print(
+    f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
+)
+print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
+print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
+
+print(listUsers)
+# close connection to logfiles.
+sys.stdout.close()
+sys.stderr.close()
--- a/createGraphs.py
+++ b/createGraphs.py
@ -0,0 +1,144 @@
+#%%
+#!/usr/bin/env python3
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
+import string
+#%%
+
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 26 20:36:43 2023
+
+@author: michael
+"""
+
+import pandas as pd
+# import pyreadstat
+# import numpy as np
+
+###################
+# Directory setup
+# Working directory (Michael)
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# Working directory (server)
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# Data file input directory
+di = "data/IN/"
+
+# Tweet data file output directory (joined onto wd below)
+ud = "data/OUT/"
+
+# Tweet file name; alternatives are listed in the trailing comment
+senCSV = "SenatorsTweets-OnlyCov.csv" # SenatorsTweets-Final.csv SenatorsTweets-OnlyCov.csv
+
+# File name of the senator dataset
+senDataset = "senators-raw.csv"
+
+# Tweet data files loaded by this script
+senCSVc = "SenatorsTweets-Final.csv"
+senCSVcCov = "SenatorsTweets-OnlyCov.csv"
+
+# Graph output files (joined onto wd + ud below)
+wcAllTweetsF = "graphs/Wordcloud-All.png"
+wcCovTweetsF = "graphs/Wordcloud-Cov.png"
+TwCovTimeline = "graphs/Timeline.png"
+
+# Derived full paths -- don't change these
+senCSVcPath = f"{wd}{ud}{senCSVc}"
+senCSVcCovPath = f"{wd}{ud}{senCSVcCov}"
+wcAllTweetsFPath = f"{wd}{ud}{wcAllTweetsF}"
+wcCovTweetsFPath = f"{wd}{ud}{wcCovTweetsF}"
+TwCovTimelinePath = f"{wd}{ud}{TwCovTimeline}"
+
+#%%
+# Load both tweet files; dtype=object reads every column as object dtype.
+df = pd.read_csv(senCSVcPath, dtype=(object))
+dfCov = pd.read_csv(senCSVcCovPath, dtype=(object))
+#%%
+def _clean_and_join(frame):
+    """Add a 'cleanContent' column to *frame* and return all cleaned tweets as one string.
+
+    'rawContent' is passed through remove_URL, remove_emoji, remove_html and
+    remove_punct, in that order. The cleaned tweets are then cast to str,
+    joined with single spaces and casefolded.
+    """
+    cleaned = frame['rawContent']
+    for step in (remove_URL, remove_emoji, remove_html, remove_punct):
+        cleaned = cleaned.apply(step)
+    frame['cleanContent'] = cleaned
+    return frame['cleanContent'].astype(str).str.cat(sep=' ').casefold()
+
+
+def _drop_single_u_s(text):
+    """Replace ' u ' and then ' s ' in *text* with a single space."""
+    return text.replace(' u ', ' ').replace(' s ', ' ')
+
+
+# one string holding every cleaned tweet
+str_alltweets = _clean_and_join(df)
+#%%
+# the same for the tweets in dfCov
+str_covtweets = _clean_and_join(dfCov)
+#%%
+# replace single U and S characters
+str_covtweets = _drop_single_u_s(str_covtweets)
+str_alltweets = _drop_single_u_s(str_alltweets)
+
+
+# %%
+def _render_wordcloud(text, out_path):
+    """Generate a word cloud from *text*, show it and save the figure to *out_path*.
+
+    Returns the WordCloud object and the matplotlib figure it was drawn on.
+    """
+    cloud = WordCloud(background_color="white", width=1000, height=1000, repeat=True)
+    cloud.generate(text)
+
+    plt.figure(figsize=(20, 20))
+    plt.axis("off")
+    plt.imshow(cloud, interpolation="bilinear")
+    # Keep a handle on the figure before plt.show(); it is saved through that
+    # handle afterwards.
+    figure = plt.gcf()
+    plt.show()
+    figure.savefig(out_path)
+    return cloud, figure
+
+
+# word cloud over all tweets
+wcA, fig1 = _render_wordcloud(str_alltweets, wcAllTweetsFPath)
+
+# %%
+# word cloud over the tweets in dfCov
+wcC, fig2 = _render_wordcloud(str_covtweets, wcCovTweetsFPath)
+# %%
+# with open('test.txt', 'w') as f:
+#    f.write(str_covtweets)
+# %%
+# Daily tweet counts: one frame for all tweets (df), one for the tweets in dfCov.
+# Every row gets count = 1 so that the groupby below yields tweets per day.
+dfT = pd.DataFrame()
+dfT['date'] = df['date'].copy()
+dfT['count'] = 1
+
+dfCovT = pd.DataFrame()
+dfCovT['date'] = dfCov['date'].copy()
+dfCovT['count'] = 1
+#%%
+# Truncate every timestamp to its day but keep it a real datetime. Formatting
+# the dates as strings here would make matplotlib treat the x-axis as
+# categorical: the two series would no longer share a time scale and the month
+# locators set below would not place ticks on month boundaries.
+dfT['date'] = pd.to_datetime(dfT['date']).dt.normalize()
+dfCovT['date'] = pd.to_datetime(dfCovT['date']).dt.normalize()
+
+#%%
+# Number of tweets per day.
+dfT = dfT.groupby('date').count().reset_index()
+dfCovT = dfCovT.groupby('date').count().reset_index()
+
+#%%
+import matplotlib.dates as mdates
+# The seaborn styles bundled with matplotlib were renamed to "seaborn-v0_8-*"
+# in matplotlib 3.6; use whichever name this installation provides.
+darkgrid = 'seaborn-v0_8-darkgrid'
+if darkgrid not in plt.style.available:
+    darkgrid = 'seaborn-darkgrid'
+plt.style.use(darkgrid)
+
+# n of tweets over time: dfCovT drawn faded (alpha 0.4), dfT drawn solid.
+fig, ax = plt.subplots(figsize=(8, 6))
+ax.plot(dfCovT['date'], dfCovT['count'], marker='', color='tab:blue', linewidth=1, alpha=0.4)
+ax.plot(dfT['date'], dfT['count'], marker='', color='tab:blue', linewidth=1, alpha=1)
+# Major tick every third month, minor tick every month.
+ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
+ax.xaxis.set_minor_locator(mdates.MonthLocator())
+# Label the major ticks in the same YYYY-MM-DD form the string dates used.
+ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
+fig.autofmt_xdate()
+fig.savefig(TwCovTimelinePath)
+
+
+# %%
--- a/data/IN/counterKeywords.txt
+++ b/data/IN/counterKeywords.txt
@ -0,0 +1,23 @@
+opioid
+gun violence
+gun-violence
+CHD
+Coronary heart disease
+addiction
+tobacco
+vaping
+e-cigarette
+shooting
+indigenous women
+overdose
+meth
+cocaine
+separated children
+separating children
+separating families
+Muslim travel ban 
+flu-season
+flu season
+Soleimani
+Muslim Ban
+USMCA trade deal
--- a/data/IN/counterKeywordsFinal.txt
+++ b/data/IN/counterKeywordsFinal.txt
@ -0,0 +1,23 @@
+meth
+gun violence
+flu season
+vaping
+chd
+addiction
+indigenous women
+separating children
+tobacco
+e-cigarette
+muslim ban
+soleimani
+cocaine
+separating families
+muslim travel ban
+usmca trade deal
+shooting
+overdose
+separated children
+coronary heart disease
+gun-violence
+opioid
+flu-season
--- a/data/IN/keywords.txt
+++ b/data/IN/keywords.txt
@ -0,0 +1,190 @@
+plandemic
+scamdemic
+wuhan flu
+wuhanflu
+corona
+coronavirusoutbreak
+pandemic
+epidemic
+vax
+antivax
+antivaxxers
+wearamask
+masksoff
+cdc
+ncov
+sars-cov-2
+socialdistancing
+wear a mask
+lockdown
+covd
+coronavirus
+koronavirus
+corona
+cdc
+wuhancoronavirus
+wuhanlockdown
+ncov
+wuhan
+n95
+kungflu
+epidemic
+outbreak
+sinophobia
+covid-19
+corona virus
+covid
+covid19
+sars-cov-2
+covidー19
+covd
+pandemic
+coronapocalypse
+canceleverything
+coronials
+socialdistancingnow
+social distancing
+socialdistancing
+panicbuy
+panic buy
+panicbuying
+panic buying
+14dayquarantine
+duringmy14dayquarantine
+panic shop
+panic shopping
+panicshop
+inmyquarantinesurvivalkit
+panic-buy
+panic-shop
+coronakindness
+quarantinelife
+chinese virus
+chinesevirus
+stayhomechallenge
+stay home challenge
+sflockdown
+dontbeaspreader
+lockdown
+lock down
+shelteringinplace
+sheltering in place
+staysafestayhome
+stay safe stay home
+trumppandemic
+trump pandemic
+flattenthecurve
+flatten the curve
+china virus
+chinavirus
+quarentinelife
+ppeshortage
+saferathome
+stayathome
+stay at home
+stay home
+stayhome
+getmeppe
+covidiot
+epitwitter
+pandemie
+wear a mask
+wearamask
+kung flu
+covididiot
+covid__19
+omicron
+variant
+vaccine
+travel ban
+corona
+corona
+coronavirus
+coronavirus
+covid
+covid
+covid19
+covid19
+covid-19
+covid-19
+sarscov2
+sarscov2
+sars cov2
+sars cov 2
+covid_19
+covid_19
+ncov
+ncov
+ncov2019
+ncov2019
+2019-ncov
+2019-ncov
+pandemic
+pandemic 2019ncov
+2019ncov
+quarantine
+quarantine
+flatten the curve
+flattening the curve
+flatteningthecurve
+flattenthecurve
+hand sanitizer
+handsanitizer
+lockdown
+lockdown
+social distancing
+socialdistancing
+work from home
+workfromhome
+working from home
+workingfromhome
+n95
+n95
+covidiots
+covidiots
+herd immunity
+herdimmunity
+pneumonia
+pneumonia
+chinese virus
+chinesevirus
+wuhan virus
+wuhanvirus
+kung flu
+kungflu
+wearamask
+wearamask
+wear a mask
+vaccine
+vaccines
+vaccine
+vaccines
+corona vaccine
+corona vaccines
+coronavaccine
+coronavaccines
+face shield
+faceshield
+face shields
+faceshields
+health worker
+healthworker
+health workers
+healthworkers
+stayhomestaysafe
+coronaupdate
+frontlineheroes
+coronawarriors
+homeschool
+homeschooling
+hometasking
+masks4all
+wfh
+wash ur hands
+wash your hands
+washurhands
+washyourhands
+stayathome
+stayhome
+selfisolating
+self isolating
--- a/data/IN/own_keywords.txt
+++ b/data/IN/own_keywords.txt
@ -0,0 +1,20 @@
+plandemic
+scamdemic
+wuhan flu
+wuhanflu
+corona
+coronavirusoutbreak
+pandemic
+epidemic
+vax
+antivax
+antivaxxers
+wearamask
+masksoff
+cdc
+ncov
+sars-cov-2
+socialdistancing
+wear a mask
+lockdown
+covd
--- a/data/IN/pretest-tweets_fake.txt
+++ b/data/IN/pretest-tweets_fake.txt
@ -0,0 +1,50 @@
+1486474031419297799
+1504880316506263552
+1264663210197745665
+1479500294887256069
+1320058585590734852
+1539003407096336388
+1481704942574395392
+1572014646374154240
+1524764580806811649
+1592940763515858944
+1554529221594292224
+1479488991347023876
+1481715928492609541
+1476722414100914179
+1478478958740086790
+1459285859358982148
+1475620600228028432
+1479459200229117955
+1448386057339297797
+1468993886316077063 
+1448369102318362625 
+1444354461799956482
+1431340411193331715
+1583474056011010048
+1450479481278406658
+1396992539010469894
+1396992534623174658
+1417920232333656076
+1439553348122861568
+1598398871990079489
+1502768541979881479
+1337604370981134336
+1417797808707473410
+1601693432292192256
+1598145048989704192
+1599906362380591110
+1325851780496961538
+1468908159330885632
+1468332389923311616
+1339703372505624577
+1468633243654451200
+1488290848907444240
+1491146722625880064
+1481766558313730053
+1503078235373985795
+1485398845718773762 
+1371501907483754497
+1494398809245376513
+1436328255959801865
+1482862501461209089
--- a/data/IN/pretest-tweets_not_fake.txt
+++ b/data/IN/pretest-tweets_not_fake.txt
@ -0,0 +1,50 @@
+1258402212327436288
+1489758168750174209
+1303698927766646785
+1257681474670809090
+1340109389672411136
+1303698924444803072
+1303698926902665218
+1337595387796983809
+1344441446515019777
+1385680800218324992
+1590129838261956608
+1303698928609697796
+1348715183502454793
+1340418291274289153
+1421228572732280835
+1456349962942533637
+1603457599877308416
+1278354646885687296
+1340418294579421188
+1365866032792039425
+1472722005657112578
+1381021635772350464
+1337598897217220609
+1354797645261398016
+1266806429282963456
+1429847265242460161 
+1234272677633953792
+1301581247932772352
+1424832183148204043
+1339255967809212416
+1284831896988454912
+1463528081214394377 
+1453679912938885122
+1583474059148337152
+1519791965113622528
+1470775155110682628
+1464615554103357450
+1337595385565638657
+1436055743418019840
+1572208051830104069
+1433765113891328002
+1482774656075534336
+1310288545886736384
+1353845938566156289
+1396992537202659329
+1455712525362810883
+1340384267327647747
+1338588364459618305
+1376696928692412419
+1340386565399429123
--- a/data/IN/senators-raw.csv
+++ b/data/IN/senators-raw.csv
@ -1,112 +1,111 @@
-name,id,state,state_short,party,class,ideology,start_serving,end_serving,time_in_office,not_in_office,last_congress,vote_share,next_closest_share,election_year,twitter_url,twitter_handle,alt_account,alt_handle,date_of_birth,female, ethnicity,edu_level,edu_information,occup_level,website_url,bioguide_link,Comments_1,Comments_2
-"Alexander, Andrew L., Jr.",1,Tennessee,TN,0,2,0.681815808318192,01/07/2003,01/03/2021,18.0027397260274,1,116,61.9,31.8,2014,https://twitter.com/SenAlexander,SenAlexander,https://twitter.com/LamarAlexander ,LamarAlexander ,07/03/1940,0,White,8,J.D.; New York Univeristy; 1965,2,N/A,https://bioguide.congress.gov/search/bio/A000360,,
-"Enzi, Mike",2,Wyoming,WY,0,2,0.719285383539398,01/03/1997,01/03/2021,24,1,116,72.3,17.6,2014,https://twitter.com/senatorenzi?lang=zh-Hant ,SenatorEnzi,N/A,N/A,02/01/1944,0,White,7,M.B.A.; Retail Marketing; Denver University; 1968,4,N/A,https://bioguide.congress.gov/search/bio/E000285,,
+name,id,state,state_short,party,class,ideology,start_serving,end_serving,time_in_office,not_in_office,last_congress,vote_share,next_closest_share,election_year,twitter_url,twitter_handle,alt_account,alt_handle,date_of_birth,female,ethnicity,edu_level,edu_information,occup_level,website_url,bioguide_link,Comments_1,Comments_2
+"Alexander, Andrew L., Jr.",1,Tennessee,TN,0,2,0.681815808318192,01/07/2003,01/03/2021,18.0027397260274,1,116,61.9,31.8,2014,https://twitter.com/SenAlexander,SenAlexander,https://twitter.com/LamarAlexander,LamarAlexander,07/03/1940,0,White,8,J.D.; New York Univeristy; 1965,2,N/A,https://bioguide.congress.gov/search/bio/A000360,,
+"Enzi, Mike",2,Wyoming,WY,0,2,0.719285383539398,01/03/1997,01/03/2021,24,1,116,72.3,17.6,2014,https://twitter.com/senatorenzi,senatorenzi,N/A,N/A,02/01/1944,0,White,7,M.B.A.; Retail Marketing; Denver University; 1968,4,N/A,https://bioguide.congress.gov/search/bio/E000285,,
 "Gardner, Cory",3,Colorado,CO,0,2,0.719285383539398,01/06/2015,01/03/2021,5.9972602739726,1,116,48.5,46,2014,https://twitter.com/CoryGardner,CoryGardner,https://twitter.com/corygardner,corygardner,08/22/1974,0,White,8,"J.D.; University of Colorado, Boulder; 2001",2,N/A,https://bioguide.congress.gov/search/bio/G000562,,
 "Harris, Kamala",4,California ,CA,1,3,0.0213759569468058,01/03/2017,01/18/2021,4.04383561643836,1,116,62.4,37.6,2016,https://twitter.com/VP,VP,https://twitter.com/KamalaHarris,KamalaHarris,10/20/1964,1,African-American; Asian-American,8,J.D.; University of California; 1989,2,N/A,https://bioguide.congress.gov/search/bio/H001075,(became VP on jan 20 2021),
-"Isakson, John",5,Georgia,GA,0,3,*,01/03/2005,12/31/2019,14,1,116,55,40.8,2016,https://twitter.com/SenatorIsakson ,SenatorIsakson,N/A,N/A,12/28/1944,0,White,6,"University of Georgia, Athens; 1966",1,N/A,https://bioguide.congress.gov/search/bio/I000055,(died in 2019),
-"Jones, Gordon Douglas",6,Alabama,AL,1,2,0.632885678298333,01/03/2018,01/03/2021,3.0027397260274,1,116,49.9,48.4,2017,https://twitter.com/DougJones,DougJones,N/A,N/A,05/04/1954,0,White,8,"J.D.; Samford University, Cumberland School of Law; 1979",2,N/A,https://bioguide.congress.gov/search/bio/J000300/,special election to replace Jeff Sessions,
-"Loeffler, Kelly",7,Georgia,GA,0,2,0.904293903291947,01/06/2020,01/20/2021,1.04109589041096,1,116,N/A,N/A,*,https://twitter.com/KLoeffler,KLoeffler,https://twitter.com/senatorloeffler ,senatorloeffler ,11/27/1970,1,White,7,M.B.A.; Internationla Finance and Marketing; DePaul University Chicago; 1999,1,N/A,https://bioguide.congress.gov/search/bio/L000594,Appointed in 2019 after the resignation of Johnny Isakson but lost the 2020 election,
-"McSally, Martha",8,Arizona,AZ,0,2,*,01/03/2015,01/03/2019,1,1,116,N/A,N/A,*,https://twitter.com/MarthaMcSallyAZ,MarthaMcSallyAZ,https://twitter.com/marthamcsally,marthamcsally,03/22/1966,1,White,7,M.P.P.; John F. Kennedy School of Government,3,N/A,https://bioguide.congress.gov/search/bio/M001197,(left office Dec 2 2020),appointed in 2018 after death of John McCain but lot 2020 election
-"Perdue, David",9,Georgia,GA,0,2,0.914979462126755,01/06/2015,01/03/2021,5.9972602739726,1,116,53,45.1,2014,https://twitter.com/DavidPerdueGA,DavidPerdueGA,https://twitter.com/sendavidperdue,sendavidperdue,12/10/1949,0,White,7,M.S.; Georgia Institute of Technology; 1976,1,N/A,https://bioguide.congress.gov/search/bio/P000612,,
-"Roberts, Charles Patrick",10,Kansas,KS,0,2,0.822995787870405,01/07/1997,01/03/2021,24.0054794520548,1,116,53.3,42.5,2014,https://twitter.com/SenPatRoberts,SenPatRoberts,https://twitter.com/PatRoberts,PatRoberts,04/20/1936,0,White,6,"B.A.; Kansas State university, Manhattan; 1958",7,N/A,https://bioguide.congress.gov/search/bio/R000307,,
-"Udall, Tom",11,New Mexico,NM,1,2,0.259828450248573,01/06/2009,01/03/2021,12,1,116,55.4,44.6,2014,https://twitter.com/SenatorTomUdall,SenatorTomUdall,https://twitter.com/tomudall,tomudall,05/18/1948,0,White,8,"J.D.; University of New Mexico School of Law, Albuquerque, N.M.; 1977",2,N/A,https://bioguide.congress.gov/search/bio/U000039,,
-"Baldwin, Tammy",12,Wisconsin,WI,1,1,0.176999238019796,01/03/2013,12/31/2022,9.9972602739726,0,117,55.4,44.6,2018,https://twitter.com/SenatorBaldwin,SenatorBaldwin,https://twitter.com/tammybaldwin,tammybaldwin,02/11/1962,1,White,8,"J.D.; University of Wisconsin, Madison; 1989",2,https://www.baldwin.senate.gov/,https://bioguide.congress.gov/search/bio/B001230,,
-"Barrasso, John",13,Wyoming,WY,0,1,0.817902617377421,06/22/2007,12/31/2022,15.5369863013699,0,117,67.1,30.1,2018,https://twitter.com/SenJohnBarrasso,SenJohnBarrasso,https://twitter.com/barrassoforwyo,barrassoforwyo,07/21/1952,0,White,7,M.D.; Georgetown University School of Medicine; 1978,6,https://www.barrasso.senate.gov/,https://bioguide.congress.gov/search/bio/B001261,,
-"Bennet, Michael F.",14,Colorado,CO,1,3,0.248044568735702,01/21/2009,12/31/2022,13.9506849315069,0,117,49.1,45.4,2016,https://twitter.com/SenatorBennet,SenatorBennet,https://twitter.com/michaelbennet,michaelbennet,11/28/1964,0,White,8,J.D.; Yale Law School; 1993,2,https://www.bennet.senate.gov/,https://bioguide.congress.gov/search/bio/B001267,,
-"Blackburn, Marsha",15,Tennessee,TN,0,1,0.93228239890635,01/03/2019,12/31/2022,3.99452054794521,0,117,54.7,43.9,2018,https://twitter.com/MarshaBlackburn,MarshaBlackburn,N/A,N/A,06/06/1952,1,White,6,"B.S.; Home Economics; Mississippi State University, Starkville; 1973",1,https://www.blackburn.senate.gov/,https://bioguide.congress.gov/search/bio/B001243,,
-"Blumenthal, Richard",16,Connecticut,CT,1,3,0.0310655954121906,01/03/2010,12/31/2022,13,0,117,62.9,34.9,2016,https://twitter.com/SenBlumenthal,SenBlumenthal,N/A,N/A,02/13/1946,0,White,8,J.D.; Yale University; 1973,2,https://www.blumenthal.senate.gov/,https://bioguide.congress.gov/search/bio/B001277,,
-"Blunt, Roy",17,Missouri,MO,0,3,0.584409139223541,01/03/2011,12/31/2022,12,1,117,49.4,46.2,2016,https://twitter.com/RoyBlunt,RoyBlunt,N/A,N/A,01/10/1950,0,White,7,"M.A.; Missouri State University ,Springfield; 1972",5,N/A,https://bioguide.congress.gov/search/bio/B000575,,
-"Booker, Cory A.",18,New Jersey,NJ,1,2,0.0455802980872292,10/31/2013,12/31/2022,12,0,117,57.2,40.9,2020,https://twitter.com/senbooker,senbooker,https://twitter.com/CoryBooker,CoryBooker,04/27/1969,0,African-American; Asian-American,8,J.D.; Yale Law School; 1997,2,https://www.booker.senate.gov/,https://bioguide.congress.gov/search/bio/B001288,,
-"Boozman, John",19,Arkansas,AR,0,3,0.768699282926499,01/05/2011,12/31/2022,11.9945205479452,0,117,59.8,36.2,2016,https://twitter.com/JohnBoozman,JohnBoozman,N/A,N/A,12/10/1950,0,White,6,Southern College of Optometry; 1977,6,https://www.boozman.senate.gov/,https://bioguide.congress.gov/search/bio/B001236,,
-"Braun, Michael",20,Indiana,IN,0,1,0.98106874319906,01/03/2019,12/31/2022,3.99452054794521,0,117,50.9,45,2018,https://twitter.com/SenatorBraun,SenatorBraun,N/A,N/A,03/24/1954,0,White,7,M.B.A.; Harvard Business School; 1978,1,https://www.braun.senate.gov/,https://bioguide.congress.gov/search/bio/B001310,,
-"Brown, Sherrod",21,Ohio,OH,1,1,0.0923940264109351,01/04/2007,12/31/2022,16,0,117,53.4,46.6,2018,https://twitter.com/SenSherrodBrown,SenSherrodBrown,https://twitter.com/SherrodBrown,SherrodBrown,11/09/1952,0,White,7,M.a.; Education; Ohio State University; 1981,5,https://www.brown.senate.gov/,https://bioguide.congress.gov/search/bio/B000944,,
-"Burr, Richard",22,North Carolina,NC,0,3,0.605472891780936,01/03/2001,12/31/2022,22.0054794520548,1,117,51.1,45.3,2016,https://twitter.com/SenatorBurr,SenatorBurr,N/A,N/A,11/30/1955,0,White,6,B.A.; Communications; Wake Forest University; 1978,1,N/A,https://bioguide.congress.gov/search/bio/B001135,,
-"Cantwell, Maria",23,Washington,WA,1,1,0.216591445478212,01/03/2001,12/31/2022,22.0054794520548,0,117,58.4,41.6,2018,https://twitter.com/SenatorCantwell,SenatorCantwell,N/A,N/A,10/13/1958,1,White,6,B.A.; Public Administration; Miami University of Ohio; 1980,1,https://www.cantwell.senate.gov/,https://bioguide.congress.gov/search/bio/C000127,,
-"Capito, Shelley Moore",24,West Virginia,WV,0,2,0.61478303011512,01/06/2015,12/31/2022,7.98904109589041,0,117,70.3,27,2020,https://twitter.com/SenCapito,SenCapito,N/A,N/A,11/26/1953,1,White,7,M. Ed.; University of Virginia; 1976,5,https://www.capito.senate.gov/,https://bioguide.congress.gov/search/bio/C001047,,
-"Cardin, Benjamin L.",25,Maryland,MD,1,1,0.1994990268606,01/04/2007,12/31/2022,16,0,117,64.9,30.3,2018,https://twitter.com/SenatorCardin,SenatorCardin,N/A,N/A,10/05/1943,0,White,8,J.D.; University of Maryland; 1967,2,https://www.cardin.senate.gov/,https://bioguide.congress.gov/search/bio/C000141,,
-"Carper, Thomas R.",26,Delaware,DE,1,1,0.309479384969288,01/03/2001,12/31/2022,22.0054794520548,0,117,60,37.8,2018,https://twitter.com/SenatorCarper,SenatorCarper,N/A,N/A,01/23/1947,0,White,7,M.B.A.; University of Delaware; 1975,3,https://www.carper.senate.gov/,https://bioguide.congress.gov/search/bio/C000174,,
-"Casey, Robert P., Jr.",27,Pennsylvania,PA,1,1,0.171897216341815,01/04/2007,12/31/2022,16,0,117,55.7,42.6,2018,https://twitter.com/SenBobCasey,SenBobCasey,https://twitter.com/Bob_Casey,Bob_Casey,04/13/1960,0,White,8,J.D.; Catholic University of America; 1988,2,https://www.casey.senate.gov/,https://bioguide.congress.gov/search/bio/C001070,,
-"Cassidy, Bill",28,Louisiana,LA,0,2,0.682348710788942,01/06/2015,12/31/2022,7.98904109589041,0,117,59.3,19,2020,https://twitter.com/SenBillCassidy,SenBillCassidy,https://twitter.com/BillCassidy,BillCassidy,09/28/1957,0,White,7,M.D.; Louisiana State University; 1979,6,https://www.cassidy.senate.gov/,https://bioguide.congress.gov/search/bio/C001075,,
-"Collins, Susan M.",29,Maine,ME,0,2,0.448622425849401,01/07/1997,12/31/2022,25.9972602739726,0,117,51,42.4,2020,https://twitter.com/SenatorCollins,SenatorCollins,N/A,N/A,12/07/1952,1,White,6,Bachelor in Government; St. Lawrence University; 1975,0,https://www.collins.senate.gov/,https://bioguide.congress.gov/search/bio/C001035,,
-"Coons, Christopher A.",30,Delaware,DE,1,2,0.338422715351401,11/15/2010,12/31/2022,12.1342465753425,0,117,59.4,37.9,2020,https://twitter.com/ChrisCoons,ChrisCoons,N/A,N/A,09/09/1963,0,White,8,J.D.; Yale Law School; 1992,2,https://www.coons.senate.gov/,https://bioguide.congress.gov/search/bio/C001088,,
-"Cornyn, John",31,Texas,TX,0,2,0.772226738391321,11/30/2002,12/31/2022,20.0986301369863,0,117,53.5,43.9,2020,https://twitter.com/JohnCornyn,JohnCornyn,N/A,N/A,02/02/1952,0,White,8,J.D.; St. Mary<72>s School of Law; 1977,2,https://www.cornyn.senate.gov/,https://bioguide.congress.gov/search/bio/C001056,,
-"Cortez Masto, Catherine",32,Nevada,NV,1,3,0.236574567369409,01/03/2017,12/31/2022,5.99452054794521,0,117,47.1,44.7,2016,https://twitter.com/SenCortezMasto,SenCortezMasto,https://twitter.com/CortezMasto,CortezMasto,03/29/1964,1,Hispanic; White,8,J.D.; Gonzaga University School of Law; 1990,2,https://www.cortezmasto.senate.gov/,https://bioguide.congress.gov/search/bio/C001113,,
-"Cotton, Tom",33,Arkansas,AR,0,2,0.876390364042756,01/06/2015,12/31/2022,7.98904109589041,0,117,66.5,33.5,2020,https://twitter.com/SenTomCotton,SenTomCotton,https://twitter.com/TomCottonAR,TomCottonAR,05/13/1977,0,White,8,J.D.; Harvard University; 2002,2,https://www.cotton.senate.gov/,https://bioguide.congress.gov/search/bio/C001095,,
-"Cramer, Kevin",34,North Dakota,ND,0,1,0.910896298032277,01/03/2019,12/31/2022,3.99452054794521,0,117,55.5,44.5,2018,https://twitter.com/SenKevinCramer,SenKevinCramer,https://twitter.com/kevincramer,kevincramer,01/21/1961,0,White,7,M.A.; Management; University o fMary; 2003,0,https://www.cramer.senate.gov/,https://bioguide.congress.gov/search/bio/C001096,,
-"Crapo, Michael",35,Idaho,ID,0,3,0.823331951918519,01/06/1999,12/31/2022,24,0,117,66.1,27.8,2016,https://twitter.com/MikeCrapo,MikeCrapo,N/A,N/A,05/20/1951,0,White,8,J.D.; Harvard University; 1977,2,https://www.crapo.senate.gov/,https://bioguide.congress.gov/search/bio/C000880,,
-"Cruz, Ted",36,Texas,TX,0,1,0.944056385174951,01/03/2013,12/31/2022,9.9972602739726,0,117,50.9,48.3,2018,https://twitter.com/SenTedCruz,SenTedCruz,https://twitter.com/tedcruz,tedcruz,12/22/1970,0,Hispanic; White,8,J.D.; Harvard University; 1995,2,https://www.cruz.senate.gov/,https://bioguide.congress.gov/search/bio/C001098,,
-"Daines, Steve",37,Montana,MT,0,2,0.859322244752884,01/06/2015,12/31/2022,7.98904109589041,0,117,55,45,2020,https://twitter.com/SteveDaines,SteveDaines,N/A,N/A,08/20/1962,0,White,6,B.S.; Chemical Engineering; Montana State University; 1984,1,https://www.daines.senate.gov/,https://bioguide.congress.gov/search/bio/D000618,,
-"Duckworth, Tammy",38,Illinois,IL,1,3,0.0944404184553066,01/03/2017,12/31/2022,5.99452054794521,0,117,54.4,40.2,2016,https://twitter.com/SenDuckworth,SenDuckworth,https://twitter.com/tammyduckworth,tammyduckworth,03/12/1968,1,Asian; White,8,PhD in human services; Capella University School of Public Service Leadership; 2015,3,https://www.duckworth.senate.gov/,https://bioguide.congress.gov/search/bio/D000622,,
-"Durbin, Richard J.",39,Illinois,IL,1,2,0.0855733771029607,01/07/1997,12/31/2022,25.9972602739726,0,117,54.9,38.9,2020,https://twitter.com/SenatorDurbin,SenatorDurbin,https://twitter.com/DickDurbin,DickDurbin,11/21/1944,0,White,8,J.D.; Georgetown University; 1969,2,https://www.durbin.senate.gov/,https://bioguide.congress.gov/search/bio/D000563,,
-"Ernst, Joni",40,Iowa,IA,0,2,0.826265400967212,01/06/2015,12/31/2022,7.98904109589041,0,117,51.8,45.2,2020,https://twitter.com/SenJoniErnst,SenJoniErnst,https://twitter.com/joniernst,joniernst,07/01/1970,1,White,7,M.P.A.; Columbus State University; 1995,3,https://www.ernst.senate.gov/,https://bioguide.congress.gov/search/bio/E000295,,
-"Feinstein, Dianne",41,California,CA,1,1,0.150865658191444,11/10/1992,12/31/2022,30.158904109589,0,117,54.2,45.8,2018,https://twitter.com/SenFeinstein,SenFeinstein,https://twitter.com/DianneFeinstein,DianneFeinstein,06/22/1933,1,White,6,B.A.; History; Stanford University; 1955,0,https://www.feinstein.senate.gov/public/,https://bioguide.congress.gov/search/bio/F000062,,
-"Fischer, Debra",42,Nebraska,NE,0,1,0.688576408222131,01/03/2013,12/31/2022,9.9972602739726,0,117,57.7,38.6,2018,https://twitter.com/SenatorFischer,SenatorFischer,N/A,N/A,03/01/1951,1,White,6,B.S.; Education; University of Nebraska; 1988,0,https://www.fischer.senate.gov/,https://bioguide.congress.gov/search/bio/F000463,,
-"Gillibrand, Kirsten E.",43,New York,NY,1,1,0.12072202063417,01/27/2009,12/31/2022,13.9342465753425,0,117,67,33,2018,https://twitter.com/SenGillibrand,SenGillibrand,https://twitter.com/gillibrandny,gillibrandny,12/09/1966,1,White,8,J.D.; University of California; 1991,2,https://www.gillibrand.senate.gov/,https://bioguide.congress.gov/search/bio/G000555,,
-"Graham, Lindsey",44,South Carolina,SC,0,2,0.619070797359753,01/07/2003,12/31/2022,19.9945205479452,0,117,54.5,44.2,2020,https://twitter.com/LindseyGrahamSC,LindseyGrahamSC,https://twitter.com/grahamblog,grahamblog,07/09/1955,0,White,8,J.D.; University of South Carolina; 1981,2,https://www.lgraham.senate.gov/,https://bioguide.congress.gov/search/bio/G000359 ,,
-"Grassley, Chuck",45,Iowa,IA,0,3,0.670073592619545,01/05/1981,12/31/2022,42.013698630137,0,117,60.2,35.7,2016,https://twitter.com/ChuckGrassley,ChuckGrassley,N/A,N/A,09/17/1933,0,White,7,M.A.; Political Science; University of Northern Iowa; 1956,0,https://www.grassley.senate.gov/,https://bioguide.congress.gov/search/bio/G000386,,
-"Hagerty, Bill",46,Tennessee,TN,0,2,0.857410027434407,01/03/2021,12/31/2022,1.99178082191781,0,117,62.2,35.2,2020,https://twitter.com/SenatorHagerty,SenatorHagerty,https://twitter.com/billhagertytn,billhagertytn,08/14/1959,0,White,8,J.D.; Vanderbilt Law School; 1984,0,https://www.hagerty.senate.gov/,https://bioguide.congress.gov/search/bio/H000601,,
-"Hassan, Margaret Wood",47,New Hampshire,NH,1,3,0.43611907238278,01/03/2017,12/31/2022,5.99452054794521,0,117,48,47.9,2016,https://twitter.com/SenatorHassan,SenatorHassan,https://twitter.com/Maggie_Hassan,Maggie_Hassan,02/27/1958,1,White,8,J.D.; Northeastern University School of law; 1985,11,https://www.hassan.senate.gov/,https://bioguide.congress.gov/search/bio/H001076,,
-"Hawley, Josh",48,Missouri,MO,0,1,0.864366195602263,01/03/2019,12/31/2022,3.99452054794521,0,117,51.4,45.6,2018,https://twitter.com/HawleyMO,HawleyMO,N/A,N/A,12/31/1979,0,White,8,J.D.; Yale Law School; 2006,2,https://www.hawley.senate.gov/,https://bioguide.congress.gov/search/bio/H001089,,
-"Heinrich, Martin",49,New Mexico,NM,1,1,0.2007037353465,01/03/2013,12/31/2022,9.9972602739726,0,117,54.1,30.5,2018,https://twitter.com/MartinHeinrich,MartinHeinrich,N/A,N/A,10/17/1971,0,White,6,B.S.; Mechanical Engineering; University of Missouri; 1995,12,https://www.heinrich.senate.gov/,https://bioguide.congress.gov/search/bio/H001046,,
-"Hickenlooper, John W.",50,Colorado,CO,1,2,0.335030323955882,01/03/2021,12/31/2022,1.99178082191781,0,117,53.5,44.2,2020,https://twitter.com/SenatorHick,SenatorHick,https://twitter.com/hickenlooper,hickenlooper,02/07/1952,0,White,7,M.A.; Geology; Wesleyan University; 1980,0,https://www.hickenlooper.senate.gov/,https://bioguide.congress.gov/search/bio/H000273,,
-"Hirono, Mazie K.",51,Hawaii,HI,1,1,0.0715447123166643,01/03/2013,12/31/2022,9.9972602739726,0,117,71.2,28.8,2018,https://twitter.com/maziehirono,maziehirono,https://twitter.com/mazieforhawaii,mazieforhawaii,11/03/1947,1,Asian,8,J.D.; Georgetown University; 1978,0,https://www.hirono.senate.gov/,https://bioguide.congress.gov/search/bio/H001042,,
-"Hoeven, John",52,North Dakota,ND,0,3,0.815683863264003,01/05/2011,12/31/2022,11.9945205479452,0,117,78.6,17,2016,https://twitter.com/SenJohnHoeven,SenJohnHoeven,N/A,N/A,03/13/1957,0,White,7,M.B.A.; Northwestern University; 1981,12,https://www.hoeven.senate.gov/,https://bioguide.congress.gov/search/bio/H001061,,
-"Hyde-Smith, Cindy",53,Mississippi,MS,0,2,0.868059764299163,04/09/2018,12/31/2022,4.73150684931507,0,117,54.1,44.1,2020,https://twitter.com/SenHydeSmith,SenHydeSmith,https://twitter.com/cindyhydesmith,cindyhydesmith,05/10/1959,1,White,6,"B.A.; Criminal justice, political science; University of Southern Mississippi; 1981",0,https://www.hydesmith.senate.gov/,https://bioguide.congress.gov/search/bio/H001079 ,,
-"Inhofe, James",54,Oklahoma,OK,0,2,0.880238318204784,11/17/1994,12/31/2022,28.1397260273973,1,117,62.9,32.8,2020,https://twitter.com/JimInhofe,JimInhofe,N/A,N/A,11/17/1934,0,White,6,B.A.; Economics; University of Tulsa; 1973,0,N/A,https://bioguide.congress.gov/search/bio/I000024 ,,
-"Johnson, Ron",55,Wisconsin,WI,0,3,0.743401705863958,01/05/2011,12/31/2022,11.9945205479452,0,117,50.2,46.8,2016,https://twitter.com/SenRonJohnson,SenRonJohnson,https://twitter.com/ronjohnsonwi,ronjohnsonwi,04/08/1955,0,White,6,B.S.; Business and Accounting; University of Minnesota; 1977,4,https://www.ronjohnson.senate.gov/,https://bioguide.congress.gov/search/bio/J000293,,
-"Kaine, Tim",56,Virginia,VA,1,1,0.203600708089391,01/03/2013,12/31/2022,9.9972602739726,0,117,57.1,41.1,2018,https://twitter.com/timkaine,timkaine,N/A,N/A,02/26/1958,0,White,8,J.D.; Harvard University; 1983,11,https://www.kaine.senate.gov/,https://bioguide.congress.gov/search/bio/K000384,,
-"Kelly, Mark",57,Arizona,AZ,1,3,0.399793347847799,12/02/2020,12/31/2022,2.07945205479452,0,117,51.2,48.8,2020,https://twitter.com/SenMarkKelly,SenMarkKelly,https://twitter.com/CaptMarkKelly,CaptMarkKelly,02/21/1964,0,White,7,M.S.; Aeronautical Engineering; U.S. Naval Postgraduate School,3,https://www.kelly.senate.gov/,https://bioguide.congress.gov/search/bio/K000377,,
-"Kennedy, John Neely",58,Louisiana,LA,0,3,0.785684351248518,01/03/2017,12/31/2022,5.99452054794521,0,117,60.7,39.3,2016,https://twitter.com/SenJohnKennedy,SenJohnKennedy,https://twitter.com/JohnKennedyLA,JohnKennedyLA,11/21/1951,0,White,8,J.D.; University of Virginia School of LAw; 1977,11,https://www.kennedy.senate.gov/,https://bioguide.congress.gov/search/bio/K000393,,
-"King, Angus S., Jr.",59,Maine,ME,2,1,0.346033257048853,01/03/2013,12/31/2022,9.9972602739726,0,117,54.3,35.2,2018,https://twitter.com/SenAngusKing,SenAngusKing,N/A,N/A,03/31/1944,0,White,8,J.D.; University of Virginia; 1969,2,https://www.king.senate.gov/,https://bioguide.congress.gov/search/bio/K000383 ,,
-"Klobuchar, Amy",60,Minnesota,MN,1,1,0.130504324943533,01/04/2007,12/31/2022,16,0,117,60.3,36.2,2018,https://twitter.com/SenAmyKlobuchar,SenAmyKlobuchar,https://twitter.com/amyklobuchar,amyklobuchar,05/25/1960,1,White,8,"J.D.; University of Chicago, 1985",2,https://www.klobuchar.senate.gov/,https://bioguide.congress.gov/search/bio/K000367 ,,
-"Lankford, James",61,Oklahoma,OK,0,3,0.89992933687588,01/03/2015,12/31/2022,7.9972602739726,0,117,67.7,24.6,2016,https://twitter.com/SenatorLankford,SenatorLankford,https://twitter.com/jameslankford,jameslankford,03/04/1968,0,White,7,M.Div.; Southwestern Theological Baptist Seminary; 1994,5,https://www.lankford.senate.gov/,https://bioguide.congress.gov/search/bio/L000575,,
-"Leahy, Patrick",62,Vermont,VT,1,3,0.144121081911654,01/14/1975,12/31/2022,47.9945205479452,1,117,61.3,33,2016,https://twitter.com/SenatorLeahy,SenatorLeahy,N/A,N/A,03/31/1940,0,White,8,J.D.; Georgetown University; 1964,2,N/A,https://bioguide.congress.gov/search/bio/L000174,,
-"Lee, Mike",63,Utah,UT,0,3,0.753748787807473,01/05/2011,12/31/2022,11.9945205479452,0,117,68,27.4,2016,https://twitter.com/SenMikeLee,SenMikeLee,https://twitter.com/BasedMikeLee,BasedMikeLee,06/04/1971,0,White,8,J.D.; Brigham Young university; 1997,2,https://www.lee.senate.gov/,https://bioguide.congress.gov/search/bio/L000577,,
-"Luj<EFBFBD>n, Ben Ray",64,New Mexico,NM,1,2,0.174860888138848,01/03/2021,12/31/2022,1.99178082191781,0,117,51.7,45.6,2020,https://twitter.com/SenatorLujan,SenatorLujan,https://twitter.com/benraylujan,benraylujan,06/07/1972,0,Hispanic,6,B.B.A.; New Mexico Highlands University; 2007,0,https://www.lujan.senate.gov/,https://bioguide.congress.gov/search/bio/L000570 ,,
-"Lummis, Cynthia M.",65,Wyoming,WY,0,2,0.893292958108508,01/03/2021,12/31/2022,1.99178082191781,0,117,73.1,26.9,2020,https://twitter.com/SenLummis,SenLummis,https://twitter.com/CynthiaMLummis,CynthiaMLummis,09/10/1954,1,White,8,"J.D.; University of Wyoming College of Law, Laramie, Wyo.; 1985",11,https://www.lummis.senate.gov/,https://bioguide.congress.gov/search/bio/L000571 ,,
-"Manchin, Joe, III",66,West Virginia,WV,1,1,0.446686774398077,11/15/2010,12/31/2022,12.1342465753425,0,117,49.6,46.3,2018,https://twitter.com/Sen_JoeManchin,Sen_JoeManchin,https://twitter.com/JoeManchinWV,JoeManchinWV,08/24/1947,0,White,6,B.A.; Business Administration; West Virginia University; 1970,12,https://www.manchin.senate.gov/,https://bioguide.congress.gov/search/bio/M001183 ,,
-"Markey, Edward J.",67,Massachusetts,MA,1,2,0.0139659683705929,07/16/2013,12/31/2022,9.46575342465753,0,117,66.2,33,2020,https://twitter.com/SenMarkey,SenMarkey,https://twitter.com/edmarkey,edmarkey,07/11/1946,0,White,8,J.D.; Boston College Law School; 1972,11,https://www.markey.senate.gov/,https://bioguide.congress.gov/search/bio/M000133,,
-"Marshall, Roger",68,Kansas,KS,0,2,0.882124792228652,01/03/2021,12/31/2022,1.99178082191781,0,117,53.2,41.8,2020,https://twitter.com/SenatorMarshall,SenatorMarshall,https://twitter.com/RogerMarshallMD,RogerMarshallMD,08/09/1960,0,White,7,M.D.; University of Kansas School of Medicine; 1987,6,https://www.marshall.senate.gov/,https://bioguide.congress.gov/search/bio/M001198,,
-"McConnell, Mitch",69,Kentucky,KY,0,2,0.599687533584357,01/03/1985,12/31/2022,38.0164383561644,0,117,57.8,38.2,2020,https://twitter.com/LeaderMcConnell,LeaderMcConnell,N/A,N/A,02/20/1942,0,White,8,J.D.; Kentucky Law School; 1967,11,https://www.mcconnell.senate.gov/,https://bioguide.congress.gov/search/bio/M000355,,
-"Menendez, Robert",70,New Jersey,NJ,1,1,0.191515157461704,01/18/2006,12/31/2022,16.9616438356164,0,117,54,42.8,2018,https://twitter.com/SenatorMenendez,SenatorMenendez,N/A,N/A,01/01/1954,0,Hispanic,8,J.D.; Rutgers university of Law; 1979,11,https://www.menendez.senate.gov/,https://bioguide.congress.gov/search/bio/M000639,,
-"Merkley, Jeff",71,Oregon,OR,1,2,0.0355414098997263,01/06/2009,12/31/2022,13.9917808219178,0,117,56.9,39.3,2020,https://twitter.com/SenJeffMerkley,SenJeffMerkley,https://twitter.com/jeffmerkley,jeffmerkley,10/24/1956,0,White,7,M.P.A.; Princeton University; 1982,0,https://www.merkley.senate.gov/,https://bioguide.congress.gov/search/bio/M001176,,
-"Moran, Jerry",72,Kansas,KS,0,3,0.716270292467902,01/05/2011,12/31/2022,11.9945205479452,0,117,62.4,32.1,2016,https://twitter.com/JerryMoran,JerryMoran,N/A,N/A,05/29/1954,0,White,8,J.D.; Kansas University School of Law; 1981,11,https://www.moran.senate.gov/public/,https://bioguide.congress.gov/search/bio/M000934 ,,
-"Murkowski, Lisa",73,Alaska,AK,0,3,0.473296745648617,12/20/2002,12/31/2022,20.0438356164384,0,117,44.3,29.5,2016,https://twitter.com/lisamurkowski,lisamurkowski,https://twitter.com/lisaforsenate,lisaforsenate,05/22/1957,1,White,8,J.D.; Willamette College of Law; 1985,2,https://www.murkowski.senate.gov/,https://bioguide.congress.gov/search/bio/M001153,,
-"Murphy, Christopher",74,Connecticut,CT,1,1,0.152635018959264,01/03/2013,12/31/2022,9.9972602739726,0,117,59.5,39.4,2018,https://twitter.com/ChrisMurphyCT,ChrisMurphyCT,N/A,N/A,08/03/1973,0,White,8,J.D.; University of Connecticut; 2002,11,https://www.murphy.senate.gov/,https://bioguide.congress.gov/search/bio/M001169,,
-"Murray, Patty",75,Washington,WA,1,3,0.142703588817088,01/05/1993,12/31/2022,30.0054794520548,0,117,59.1,40.9,2016,https://twitter.com/PattyMurray,PattyMurray,https://twitter.com/murraycampaign,murraycampaign,10/11/1950,1,White,6,B.A.; Physical Education; Washington State University; 1972,5,https://www.murray.senate.gov/,https://bioguide.congress.gov/search/bio/M001111,,
-"Ossoff, Jon",76,Georgia,GA,1,2,0.303405364928085,01/20/2021,12/31/2022,1.94520547945205,0,117,50.6,49.4,2020,https://twitter.com/SenOssoff,SenOssoff,https://twitter.com/ossoff,ossoff,02/16/1987,0,White,7,M.S.; International Politicla Economy; London School of Economics; 2013,7,https://www.ossoff.senate.gov/,https://bioguide.congress.gov/search/bio/O000174,,
-"Padilla, Alex",77,California,CA,1,3,0.0200324383981554,01/20/2021,12/31/2022,1.94520547945205,0,117,N/A,N/A,*,https://twitter.com/SenAlexPadilla,SenAlexPadilla,https://twitter.com/AlexPadilla4CA,AlexPadilla4CA,03/22/1973,0,Hispanic,6,B.S.; Mechanical Engineering; MIT; 1994,9,https://www.padilla.senate.gov/,https://bioguide.congress.gov/search/bio/P000145,appointed in 2020 to replace Kamala Harris ,
-"Paul, Rand",78,Kentucky,KY,0,3,0.684883322748808,01/05/2011,12/31/2022,11.9945205479452,0,117,57.3,42.7,2016,https://twitter.com/senrandpaul,senrandpaul,https://twitter.com/RandPaul,RandPaul,01/07/1963,0,White,7,M.D.; Duke University; 1988,6,https://www.paul.senate.gov/,https://bioguide.congress.gov/search/bio/P000603,,
-"Peters, Gary C.",79,Michigan,MI,1,2,0.355796587683312,01/06/2015,12/31/2022,7.98904109589041,0,117,49.9,48.2,2020,https://twitter.com/SenGaryPeters,SenGaryPeters,https://twitter.com/garypeters,garypeters,12/01/1958,0,White,8,J.D.; Wayne State University; 1989,2,https://www.peters.senate.gov/,https://bioguide.congress.gov/search/bio/P000595,,
-"Portman, Robert",80,Ohio,OH,0,3,0.548120690430407,01/05/2011,12/31/2022,11.9945205479452,1,117,58.3,36.9,2016,https://twitter.com/senrobportman,senrobportman,N/A,N/A,12/19/1955,0,White,8,J.D.; University of Michigan; 1985,2,N/A,https://bioguide.congress.gov/search/bio/P000449,,
-"Reed, John F.",81,Rhode Island,RI,1,2,0.145861826443275,01/07/1997,12/31/2022,25.9972602739726,0,117,66.6,33.4,2020,https://twitter.com/SenJackReed,SenJackReed,N/A,N/A,11/12/1949,0,White,8,J.D.; Harvard University; 1982,2,https://www.reed.senate.gov/,https://bioguide.congress.gov/search/bio/R000122,,
-"Risch, James E.",82,Idaho,ID,0,2,0.82910906209038,01/06/2009,12/31/2022,13.9917808219178,0,117,62.6,33.2,2020,https://twitter.com/SenatorRisch,SenatorRisch,N/A,N/A,05/03/1943,0,White,8,J.D.; University of Idaho; 1968,2,https://www.risch.senate.gov/,https://bioguide.congress.gov/search/bio/R000584,,
-"Romney, Mitt",83,Utah,UT,0,1,0.596688837978771,01/03/2019,12/31/2022,3.99452054794521,0,117,62.6,30.9,2018,https://twitter.com/SenatorRomney,SenatorRomney,https://twitter.com/mittromney,mittromney,03/12/1947,0,White,7,M.B.A.; Harvard Business School; 1975,1,https://www.romney.senate.gov/,https://bioguide.congress.gov/search/bio/R000615,,
-"Rosen, Jacky",84,Nevada,NV,1,1,0.308548351377894,01/03/2019,12/31/2022,3.99452054794521,0,117,50.4,45.4,2018,https://twitter.com/SenJackyRosen,SenJackyRosen,https://twitter.com/RosenforNevada,RosenforNevada,08/02/1957,1,White,6,B.A.; Psychology; University of Minnesota; 1979,1,https://www.rosen.senate.gov/,https://bioguide.congress.gov/search/bio/R000608,,
-"Rounds, Mike",85,South Dakota,SD,0,2,0.784008560585577,01/06/2015,12/31/2022,7.98904109589041,0,117,65.7,34.3,2020,https://twitter.com/SenatorRounds,SenatorRounds,N/A,N/A,10/24/1954,0,White,6,B.S.; Political Science; South Dakota State University; 1977,1,https://www.rounds.senate.gov/,https://bioguide.congress.gov/search/bio/R000605,,
-"Rubio, Marco",86,Florida,FL,0,3,0.831181764071725,01/05/2011,12/31/2022,11.9945205479452,0,117,52,44.3,2016,https://twitter.com/senmarcorubio,senmarcorubio,https://twitter.com/marcorubio,marcorubio,05/28/1971,0,Hispanic,8,J.D.; University of Miami; 1996,2,https://www.rubio.senate.gov/,https://bioguide.congress.gov/search/bio/R000595,,
-"Sanders, Bernard",87,Vermont,VT,2,1,0,01/04/2007,12/31/2022,16,0,117,67.4,27.5,2018,https://twitter.com/SenSanders,SenSanders,https://twitter.com/BernieSanders,BernieSanders,09/08/1941,0,White,6,B.A.; Political Science; University of Chicago; 1964,0,https://www.sanders.senate.gov/,https://bioguide.congress.gov/search/bio/S000033,,
-"Sasse, Benjamin",88,Nebraska,NE,0,2,0.684229649213868,01/06/2015,12/31/2022,7.98904109589041,1,117,62.7,24.4,2020,https://twitter.com/sensasse,sensasse,https://twitter.com/BenSasse,BenSasse,02/22/1972,0,White,8,PhD in History; Yale University; 2004,5,N/A,https://bioguide.congress.gov/search/bio/S001197,,
-"Schatz, Brian",89,Hawaii ,HI,1,3,0.213250458593456,12/27/2012,12/31/2022,10.0164383561644,0,117,73.6,22.2,2016,https://twitter.com/brianschatz,brianschatz,https://twitter.com/SenBrianSchatz,SenBrianSchatz,10/20/1972,0,White,6,B.A.; Philosophy; Pomona College; 1994,5,https://www.schatz.senate.gov/,https://bioguide.congress.gov/search/bio/S001194,,
-"Schumer, Charles E.",90,New York,NY,1,3,0.239789022209428,01/06/1999,12/31/2022,24,0,117,70.4,27.4,2016,https://twitter.com/SenSchumer,SenSchumer,https://twitter.com/chuckschumer,chuckschumer,11/23/1950,0,White,8,J.D.; Harvard University; 1974,2,https://www.schumer.senate.gov/,https://bioguide.congress.gov/search/bio/S000148 ,,
-"Scott, Rick",91,Florida,FL,0,1,1,01/08/2019,12/31/2022,3.98082191780822,0,117,50.1,49.9,2018,https://twitter.com/SenRickScott,SenRickScott,https://twitter.com/scottforflorida,scottforflorida,12/01/1952,0,White,8,J.D.; Southern Methodist University; 1978,2,https://www.rickscott.senate.gov/,https://bioguide.congress.gov/search/bio/S001217,,
-"Scott, Tim",92,South Carolina,SC,0,3,0.781356077518849,01/03/2013,12/31/2022,9.9972602739726,0,117,60.6,37,2016,https://twitter.com/SenatorTimScott,SenatorTimScott,https://twitter.com/votetimscott,votetimscott,09/19/1965,0,African-American,6,B.S.; Political Science; Charleston Southern University; 1988 ,1,https://www.scott.senate.gov/,https://bioguide.congress.gov/search/bio/S001184,,
-"Shaheen, Jeanne",93,New Hampshire,NH,1,2,0.2925665319541,01/06/2009,12/31/2022,13.9917808219178,0,117,56.6,41,2020,https://twitter.com/SenatorShaheen,SenatorShaheen,https://twitter.com/JeanneShaheen,JeanneShaheen,01/28/1947,1,White,7,M.S.S.; University of Mississippi; 1973,5,https://www.shaheen.senate.gov/,https://bioguide.congress.gov/search/bio/S001181,,
-"Shelby, Richard",94,Alabama,AL,0,3,0.577739000839365,01/06/1987,12/31/2022,36.0082191780822,1,117,64.2,35.8,2016,https://twitter.com/SenShelby,SenShelby,N/A,N/A,05/06/1934,0,White,6,LL.B.; University of Alabama; 1963,2,N/A,https://bioguide.congress.gov/search/bio/S000320,,
-"Sinema, Kyrsten",95,Arizona,AZ,2,1,0.500967034663567,01/03/2019,12/31/2022,3.99452054794521,0,117,50,47.6,2018,https://twitter.com/SenatorSinema,SenatorSinema,https://twitter.com/kyrstensinema,kyrstensinema,07/12/1976,1,White,8,PhD in Justice Studies; Arizona State University; 2012,2,https://www.sinema.senate.gov/,https://bioguide.congress.gov/search/bio/S001191,,
-"Smith, Tina",96,Minnesota,MN,1,2,0.0756533259297989,01/03/2018,12/31/2022,4.99452054794521,0,117,48.8,43.5,2020,https://twitter.com/SenTinaSmith,SenTinaSmith,https://twitter.com/TinaSmithMN,TinaSmithMN,03/04/1958,1,White,7,M.B.A. Dartmouth College; 1984,1,https://www.smith.senate.gov/,https://bioguide.congress.gov/search/bio/S001203,,
-"Stabenow, Debbie",97,Michigan,MI,1,1,0.221949395648287,01/03/2001,12/31/2022,22.0054794520548,0,117,52.3,45.8,2018,https://twitter.com/SenStabenow,SenStabenow,https://twitter.com/stabenow,stabenow,04/29/1950,1,White,7,M.S.W.; Michigan State University; 1975,5,https://www.stabenow.senate.gov/,https://bioguide.congress.gov/search/bio/S000770,,
-"Sullivan, Dan",98,Alaska,AK,0,2,0.652100683642255,01/06/2015,12/31/2022,7.98904109589041,0,117,53.9,41.2,2020,https://twitter.com/SenDanSullivan,SenDanSullivan,N/A,N/A,11/13/1964,0,White,8,J.D.; Georgetown University; 1993,2,https://www.sullivan.senate.gov/,https://bioguide.congress.gov/search/bio/S001198,,
-"Tester, Jon",99,Montana,MT,1,1,0.377646486433112,01/04/2007,12/31/2022,16,0,117,50.3,46.8,2018,https://twitter.com/SenatorTester,SenatorTester,https://twitter.com/jontester,jontester,08/21/1956,0,White,6,B.A.; Music; University of Providence; 1978,10,https://www.tester.senate.gov/,https://bioguide.congress.gov/search/bio/T000464 ,,
-"Thune, John",100,South Dakota,SD,0,3,0.795060855902239,01/04/2005,12/31/2022,18,0,117,71.8,28.2,2016,https://twitter.com/SenJohnThune,SenJohnThune,https://twitter.com/johnthune,johnthune,01/07/1961,0,White,7,M.B.A.; University of South Dakota; 1984,1,https://www.thune.senate.gov/,https://bioguide.congress.gov/search/bio/T000250 ,,
-"Tillis, Thom",101,North Carolina,NC,0,2,0.819146177750934,01/06/2015,12/31/2022,7.98904109589041,0,117,48.7,46.9,2020,https://twitter.com/SenThomTillis,SenThomTillis,https://twitter.com/ThomTillis,ThomTillis,08/30/1960,0,White,6,B.S.; Technology Management; University of Maryland; 1996,1,https://www.tillis.senate.gov/,https://bioguide.congress.gov/search/bio/T000476 ,,
-"Toomey, Patrick",102,Pennsylvania,PA,0,3,0.607637714921737,01/05/2011,12/31/2022,11.9945205479452,1,117,48.9,47.2,2016,https://twitter.com/SenToomey,SenToomey,https://twitter.com/pattoomey,pattoomey,11/17/1961,0,White,6,A.B.; Government; Harvard College; 1984,1,N/A,https://bioguide.congress.gov/search/bio/T000461 ,,
-"Tuberville, Tommy",103,Alabama,AL,0,2,0.808701355452043,01/03/2021,12/31/2022,1.99178082191781,0,117,60.1,39.7,2020,https://twitter.com/SenTuberville,SenTuberville,https://twitter.com/TTuberville,TTuberville,09/18/1954,0,White,6,"B.S., physical education, Southern Arkansas University, 1976",5,https://www.tuberville.senate.gov/,https://bioguide.congress.gov/search/bio/T000278 ,,
-"Van Hollen, Chris",104,Maryland,MD,1,3,0.117646768842011,01/03/2017,12/31/2022,5.99452054794521,0,117,60.4,36.4,2016,https://twitter.com/ChrisVanHollen,ChrisVanHollen,N/A,N/A,01/10/1959,0,White,8,J.D.; Georgetown university; 1990,2,https://www.vanhollen.senate.gov/,https://bioguide.congress.gov/search/bio/V000128,,
-"Warner, Mark R.",105,Virginia,VA,1,2,0.33022168507113,01/06/2009,12/31/2022,13.9917808219178,0,117,56,44,2020,https://twitter.com/SenatorWarner,SenatorWarner,https://twitter.com/MarkWarner,MarkWarner,12/15/1954,0,White,8,J.D.; Harvard Law School; 1980,1,https://www.warner.senate.gov/,https://bioguide.congress.gov/search/bio/W000805 ,,
-"Warnock, Raphael G.",106,Georgia,GA,1,3,0.464158242867696,01/20/2021,12/31/2022,1.94520547945205,0,117,51,49,2020,https://twitter.com/SenatorWarnock,SenatorWarnock,https://twitter.com/ReverendWarnock,ReverendWarnock,07/23/1969,0,African-American,8,PhD in Philosophy; Union Theological Seminary; ,8,https://www.warnock.senate.gov/,https://bioguide.congress.gov/search/bio/W000790,,
-"Warren, Elizabeth",107,Massachusetts,MA,1,1,0.0583875007437665,01/03/2013,12/31/2022,9.9972602739726,0,117,60.4,36.2,2018,https://twitter.com/SenWarren,SenWarren,https://twitter.com/ewarren,ewarren,06/22/1949,1,White,8,J.D.; Rutgers University; 1976,2,https://www.warren.senate.gov/,https://bioguide.congress.gov/search/bio/W000817 ,,
-"Whitehouse, Sheldon",108,Rhode Island,RI,1,1,0.124737669119195,01/04/2007,12/31/2022,16,0,117,61.6,38.4,2018,https://twitter.com/SenWhitehouse,SenWhitehouse,N/A,N/A,10/20/1955,0,White,8,J.D.; University of Virginia; 1982,2,https://www.whitehouse.senate.gov/,https://bioguide.congress.gov/search/bio/W000802,,
-"Wicker, Roger F.",109,Mississippi,MS,0,1,0.763788502839721,12/31/2007,12/31/2022,15.0109589041096,0,117,58.5,39.5,2018,https://twitter.com/SenatorWicker,SenatorWicker,https://twitter.com/RogerWicker,RogerWicker,07/05/1951,0,White,8,J.D.; University of Mississippi; 1975,2,https://www.wicker.senate.gov/,https://bioguide.congress.gov/search/bio/W000437,,
-"Wyden, Ron",110,Oregon,OR,1,3,0.0591413132623803,02/05/1996,12/31/2022,26.9205479452055,0,117,56.7,33.6,2016,https://twitter.com/RonWyden,RonWyden,N/A,N/A,05/03/1949,0,White,8,J.D.; University of Oregon; 1974,2,https://www.wyden.senate.gov/,https://bioguide.congress.gov/search/bio/W000779,,
-"Young, Todd",111,Indiana,IN,0,3,0.677696674158218,01/05/2011,12/31/2022,11.9945205479452,1,117,52.1,42.4,2016,https://twitter.com/SenToddYoung,SenToddYoung,https://twitter.com/ToddYoungIN,ToddYoungIN,08/24/1972,0,White,8,J.D.; Robert H. McKinney; 2006,2,https://www.young.senate.gov/,https://bioguide.congress.gov/search/bio/Y000064,,
+"Jones, Gordon Douglas",5,Alabama,AL,1,2,0.632885678298333,01/03/2018,01/03/2021,3.0027397260274,1,116,49.9,48.4,2017,https://twitter.com/DougJones,DougJones,N/A,N/A,05/04/1954,0,White,8,"J.D.; Samford University, Cumberland School of Law; 1979",2,N/A,https://bioguide.congress.gov/search/bio/J000300/,special election to replace Jeff Sessions,
+"Loeffler, Kelly",6,Georgia,GA,0,2,0.904293903291947,01/06/2020,01/20/2021,1.04109589041096,1,116,N/A,N/A,*,https://twitter.com/KLoeffler,KLoeffler,https://twitter.com/senatorloeffler,senatorloeffler,11/27/1970,1,White,7,M.B.A.; Internationla Finance and Marketing; DePaul University Chicago; 1999,1,N/A,https://bioguide.congress.gov/search/bio/L000594,Appointed in 2019 after the resignation of Johnny Isakson but lost the 2020 election,
+"McSally, Martha",7,Arizona,AZ,0,2,*,01/03/2015,01/03/2019,1,1,116,N/A,N/A,*,https://twitter.com/MarthaMcSallyAZ,MarthaMcSallyAZ,https://twitter.com/marthamcsally,marthamcsally,03/22/1966,1,White,7,M.P.P.; John F. Kennedy School of Government,3,N/A,https://bioguide.congress.gov/search/bio/M001197,(left office Dec 2 2020),appointed in 2018 after death of John McCain but lot 2020 election
+"Perdue, David",8,Georgia,GA,0,2,0.914979462126755,01/06/2015,01/03/2021,5.9972602739726,1,116,53,45.1,2014,https://twitter.com/DavidPerdueGA,DavidPerdueGA,https://twitter.com/sendavidperdue,sendavidperdue,12/10/1949,0,White,7,M.S.; Georgia Institute of Technology; 1976,1,N/A,https://bioguide.congress.gov/search/bio/P000612,,
+"Roberts, Charles Patrick",9,Kansas,KS,0,2,0.822995787870405,01/07/1997,01/03/2021,24.0054794520548,1,116,53.3,42.5,2014,https://twitter.com/SenPatRoberts,SenPatRoberts,https://twitter.com/PatRoberts,PatRoberts,04/20/1936,0,White,6,"B.A.; Kansas State university, Manhattan; 1958",7,N/A,https://bioguide.congress.gov/search/bio/R000307,,
+"Udall, Tom",10,New Mexico,NM,1,2,0.259828450248573,01/06/2009,01/03/2021,12,1,116,55.4,44.6,2014,https://twitter.com/SenatorTomUdall,SenatorTomUdall,https://twitter.com/tomudall,tomudall,05/18/1948,0,White,8,"J.D.; University of New Mexico School of Law, Albuquerque, N.M.; 1977",2,N/A,https://bioguide.congress.gov/search/bio/U000039,,
+"Baldwin, Tammy",11,Wisconsin,WI,1,1,0.176999238019796,01/03/2013,12/31/2022,9.9972602739726,0,117,55.4,44.6,2018,https://twitter.com/SenatorBaldwin,SenatorBaldwin,https://twitter.com/tammybaldwin,tammybaldwin,02/11/1962,1,White,8,"J.D.; University of Wisconsin, Madison; 1989",2,https://www.baldwin.senate.gov/,https://bioguide.congress.gov/search/bio/B001230,,
+"Barrasso, John",12,Wyoming,WY,0,1,0.817902617377421,06/22/2007,12/31/2022,15.5369863013699,0,117,67.1,30.1,2018,https://twitter.com/SenJohnBarrasso,SenJohnBarrasso,https://twitter.com/barrassoforwyo,barrassoforwyo,07/21/1952,0,White,7,M.D.; Georgetown University School of Medicine; 1978,6,https://www.barrasso.senate.gov/,https://bioguide.congress.gov/search/bio/B001261,,
+"Bennet, Michael F.",13,Colorado,CO,1,3,0.248044568735702,01/21/2009,12/31/2022,13.9506849315069,0,117,49.1,45.4,2016,https://twitter.com/SenatorBennet,SenatorBennet,https://twitter.com/michaelbennet,michaelbennet,11/28/1964,0,White,8,J.D.; Yale Law School; 1993,2,https://www.bennet.senate.gov/,https://bioguide.congress.gov/search/bio/B001267,,
+"Blackburn, Marsha",14,Tennessee,TN,0,1,0.93228239890635,01/03/2019,12/31/2022,3.99452054794521,0,117,54.7,43.9,2018,https://twitter.com/MarshaBlackburn,MarshaBlackburn,N/A,N/A,06/06/1952,1,White,6,"B.S.; Home Economics; Mississippi State University, Starkville; 1973",1,https://www.blackburn.senate.gov/,https://bioguide.congress.gov/search/bio/B001243,,
+"Blumenthal, Richard",15,Connecticut,CT,1,3,0.0310655954121906,01/03/2010,12/31/2022,13,0,117,62.9,34.9,2016,https://twitter.com/SenBlumenthal,SenBlumenthal,N/A,N/A,02/13/1946,0,White,8,J.D.; Yale University; 1973,2,https://www.blumenthal.senate.gov/,https://bioguide.congress.gov/search/bio/B001277,,
+"Blunt, Roy",16,Missouri,MO,0,3,0.584409139223541,01/03/2011,12/31/2022,12,1,117,49.4,46.2,2016,https://twitter.com/RoyBlunt,RoyBlunt,N/A,N/A,01/10/1950,0,White,7,"M.A.; Missouri State University ,Springfield; 1972",5,N/A,https://bioguide.congress.gov/search/bio/B000575,,
+"Booker, Cory A.",17,New Jersey,NJ,1,2,0.0455802980872292,10/31/2013,12/31/2022,12,0,117,57.2,40.9,2020,https://twitter.com/senbooker,senbooker,https://twitter.com/CoryBooker,CoryBooker,04/27/1969,0,African-American; Asian-American,8,J.D.; Yale Law School; 1997,2,https://www.booker.senate.gov/,https://bioguide.congress.gov/search/bio/B001288,,
+"Boozman, John",18,Arkansas,AR,0,3,0.768699282926499,01/05/2011,12/31/2022,11.9945205479452,0,117,59.8,36.2,2016,https://twitter.com/JohnBoozman,JohnBoozman,N/A,N/A,12/10/1950,0,White,6,Southern College of Optometry; 1977,6,https://www.boozman.senate.gov/,https://bioguide.congress.gov/search/bio/B001236,,
+"Braun, Michael",19,Indiana,IN,0,1,0.98106874319906,01/03/2019,12/31/2022,3.99452054794521,0,117,50.9,45,2018,https://twitter.com/SenatorBraun,SenatorBraun,N/A,N/A,03/24/1954,0,White,7,M.B.A.; Harvard Business School; 1978,1,https://www.braun.senate.gov/,https://bioguide.congress.gov/search/bio/B001310,,
+"Brown, Sherrod",20,Ohio,OH,1,1,0.0923940264109351,01/04/2007,12/31/2022,16,0,117,53.4,46.6,2018,https://twitter.com/SenSherrodBrown,SenSherrodBrown,https://twitter.com/SherrodBrown,SherrodBrown,11/09/1952,0,White,7,M.a.; Education; Ohio State University; 1981,5,https://www.brown.senate.gov/,https://bioguide.congress.gov/search/bio/B000944,,
+"Burr, Richard",21,North Carolina,NC,0,3,0.605472891780936,01/03/2001,12/31/2022,22.0054794520548,1,117,51.1,45.3,2016,https://twitter.com/SenatorBurr,SenatorBurr,N/A,N/A,11/30/1955,0,White,6,B.A.; Communications; Wake Forest University; 1978,1,N/A,https://bioguide.congress.gov/search/bio/B001135,,
+"Cantwell, Maria",22,Washington,WA,1,1,0.216591445478212,01/03/2001,12/31/2022,22.0054794520548,0,117,58.4,41.6,2018,https://twitter.com/SenatorCantwell,SenatorCantwell,N/A,N/A,10/13/1958,1,White,6,B.A.; Public Administration; Miami University of Ohio; 1980,1,https://www.cantwell.senate.gov/,https://bioguide.congress.gov/search/bio/C000127,,
+"Capito, Shelley Moore",23,West Virginia,WV,0,2,0.61478303011512,01/06/2015,12/31/2022,7.98904109589041,0,117,70.3,27,2020,https://twitter.com/SenCapito,SenCapito,N/A,N/A,11/26/1953,1,White,7,M. Ed.; University of Virginia; 1976,5,https://www.capito.senate.gov/,https://bioguide.congress.gov/search/bio/C001047,,
+"Cardin, Benjamin L.",24,Maryland,MD,1,1,0.1994990268606,01/04/2007,12/31/2022,16,0,117,64.9,30.3,2018,https://twitter.com/SenatorCardin,SenatorCardin,N/A,N/A,10/05/1943,0,White,8,J.D.; University of Maryland; 1967,2,https://www.cardin.senate.gov/,https://bioguide.congress.gov/search/bio/C000141,,
+"Carper, Thomas R.",25,Delaware,DE,1,1,0.309479384969288,01/03/2001,12/31/2022,22.0054794520548,0,117,60,37.8,2018,https://twitter.com/SenatorCarper,SenatorCarper,N/A,N/A,01/23/1947,0,White,7,M.B.A.; University of Delaware; 1975,3,https://www.carper.senate.gov/,https://bioguide.congress.gov/search/bio/C000174,,
+"Casey, Robert P., Jr.",26,Pennsylvania,PA,1,1,0.171897216341815,01/04/2007,12/31/2022,16,0,117,55.7,42.6,2018,https://twitter.com/SenBobCasey,SenBobCasey,https://twitter.com/Bob_Casey,Bob_Casey,04/13/1960,0,White,8,J.D.; Catholic University of America; 1988,2,https://www.casey.senate.gov/,https://bioguide.congress.gov/search/bio/C001070,,
+"Cassidy, Bill",27,Louisiana,LA,0,2,0.682348710788942,01/06/2015,12/31/2022,7.98904109589041,0,117,59.3,19,2020,https://twitter.com/SenBillCassidy,SenBillCassidy,https://twitter.com/BillCassidy,BillCassidy,09/28/1957,0,White,7,M.D.; Louisiana State University; 1979,6,https://www.cassidy.senate.gov/,https://bioguide.congress.gov/search/bio/C001075,,
+"Collins, Susan M.",28,Maine,ME,0,2,0.448622425849401,01/07/1997,12/31/2022,25.9972602739726,0,117,51,42.4,2020,https://twitter.com/SenatorCollins,SenatorCollins,N/A,N/A,12/07/1952,1,White,6,Bachelor in Government; St. Lawrence University; 1975,0,https://www.collins.senate.gov/,https://bioguide.congress.gov/search/bio/C001035,,
+"Coons, Christopher A.",29,Delaware,DE,1,2,0.338422715351401,11/15/2010,12/31/2022,12.1342465753425,0,117,59.4,37.9,2020,https://twitter.com/ChrisCoons,ChrisCoons,N/A,N/A,09/09/1963,0,White,8,J.D.; Yale Law School; 1992,2,https://www.coons.senate.gov/,https://bioguide.congress.gov/search/bio/C001088,,
+"Cornyn, John",30,Texas,TX,0,2,0.772226738391321,11/30/2002,12/31/2022,20.0986301369863,0,117,53.5,43.9,2020,https://twitter.com/JohnCornyn,JohnCornyn,N/A,N/A,02/02/1952,0,White,8,J.D.; St. Mary’s School of Law; 1977,2,https://www.cornyn.senate.gov/,https://bioguide.congress.gov/search/bio/C001056,,
+"Cortez Masto, Catherine",31,Nevada,NV,1,3,0.236574567369409,01/03/2017,12/31/2022,5.99452054794521,0,117,47.1,44.7,2016,https://twitter.com/SenCortezMasto,SenCortezMasto,https://twitter.com/CortezMasto,CortezMasto,03/29/1964,1,Hispanic; White,8,J.D.; Gonzaga University School of Law; 1990,2,https://www.cortezmasto.senate.gov/,https://bioguide.congress.gov/search/bio/C001113,,
+"Cotton, Tom",32,Arkansas,AR,0,2,0.876390364042756,01/06/2015,12/31/2022,7.98904109589041,0,117,66.5,33.5,2020,https://twitter.com/SenTomCotton,SenTomCotton,https://twitter.com/TomCottonAR,TomCottonAR,05/13/1977,0,White,8,J.D.; Harvard University; 2002,2,https://www.cotton.senate.gov/,https://bioguide.congress.gov/search/bio/C001095,,
+"Cramer, Kevin",33,North Dakota,ND,0,1,0.910896298032277,01/03/2019,12/31/2022,3.99452054794521,0,117,55.5,44.5,2018,https://twitter.com/SenKevinCramer,SenKevinCramer,https://twitter.com/kevincramer,kevincramer,01/21/1961,0,White,7,M.A.; Management; University o fMary; 2003,0,https://www.cramer.senate.gov/,https://bioguide.congress.gov/search/bio/C001096,,
+"Crapo, Michael",34,Idaho,ID,0,3,0.823331951918519,01/06/1999,12/31/2022,24,0,117,66.1,27.8,2016,https://twitter.com/MikeCrapo,MikeCrapo,N/A,N/A,05/20/1951,0,White,8,J.D.; Harvard University; 1977,2,https://www.crapo.senate.gov/,https://bioguide.congress.gov/search/bio/C000880,,
+"Cruz, Ted",35,Texas,TX,0,1,0.944056385174951,01/03/2013,12/31/2022,9.9972602739726,0,117,50.9,48.3,2018,https://twitter.com/SenTedCruz,SenTedCruz,https://twitter.com/tedcruz,tedcruz,12/22/1970,0,Hispanic; White,8,J.D.; Harvard University; 1995,2,https://www.cruz.senate.gov/,https://bioguide.congress.gov/search/bio/C001098,,
+"Daines, Steve",36,Montana,MT,0,2,0.859322244752884,01/06/2015,12/31/2022,7.98904109589041,0,117,55,45,2020,https://twitter.com/SteveDaines,SteveDaines,N/A,N/A,08/20/1962,0,White,6,B.S.; Chemical Engineering; Montana State University; 1984,1,https://www.daines.senate.gov/,https://bioguide.congress.gov/search/bio/D000618,,
+"Duckworth, Tammy",37,Illinois,IL,1,3,0.0944404184553066,01/03/2017,12/31/2022,5.99452054794521,0,117,54.4,40.2,2016,https://twitter.com/SenDuckworth,SenDuckworth,https://twitter.com/tammyduckworth,tammyduckworth,03/12/1968,1,Asian; White,8,PhD in human services; Capella University School of Public Service Leadership; 2015,3,https://www.duckworth.senate.gov/,https://bioguide.congress.gov/search/bio/D000622,,
+"Durbin, Richard J.",38,Illinois,IL,1,2,0.0855733771029607,01/07/1997,12/31/2022,25.9972602739726,0,117,54.9,38.9,2020,https://twitter.com/SenatorDurbin,SenatorDurbin,https://twitter.com/DickDurbin,DickDurbin,11/21/1944,0,White,8,J.D.; Georgetown University; 1969,2,https://www.durbin.senate.gov/,https://bioguide.congress.gov/search/bio/D000563,,
+"Ernst, Joni",39,Iowa,IA,0,2,0.826265400967212,01/06/2015,12/31/2022,7.98904109589041,0,117,51.8,45.2,2020,https://twitter.com/SenJoniErnst,SenJoniErnst,https://twitter.com/joniernst,joniernst,07/01/1970,1,White,7,M.P.A.; Columbus State University; 1995,3,https://www.ernst.senate.gov/,https://bioguide.congress.gov/search/bio/E000295,,
+"Feinstein, Dianne",40,California,CA,1,1,0.150865658191444,11/10/1992,12/31/2022,30.158904109589,0,117,54.2,45.8,2018,https://twitter.com/SenFeinstein,SenFeinstein,https://twitter.com/DianneFeinstein,DianneFeinstein,06/22/1933,1,White,6,B.A.; History; Stanford University; 1955,0,https://www.feinstein.senate.gov/public/,https://bioguide.congress.gov/search/bio/F000062,,
+"Fischer, Debra",41,Nebraska,NE,0,1,0.688576408222131,01/03/2013,12/31/2022,9.9972602739726,0,117,57.7,38.6,2018,https://twitter.com/SenatorFischer,SenatorFischer,N/A,N/A,03/01/1951,1,White,6,B.S.; Education; University of Nebraska; 1988,0,https://www.fischer.senate.gov/,https://bioguide.congress.gov/search/bio/F000463,,
+"Gillibrand, Kirsten E.",42,New York,NY,1,1,0.12072202063417,01/27/2009,12/31/2022,13.9342465753425,0,117,67,33,2018,https://twitter.com/SenGillibrand,SenGillibrand,https://twitter.com/gillibrandny,gillibrandny,12/09/1966,1,White,8,J.D.; University of California; 1991,2,https://www.gillibrand.senate.gov/,https://bioguide.congress.gov/search/bio/G000555,,
+"Graham, Lindsey",43,South Carolina,SC,0,2,0.619070797359753,01/07/2003,12/31/2022,19.9945205479452,0,117,54.5,44.2,2020,https://twitter.com/LindseyGrahamSC,LindseyGrahamSC,https://twitter.com/grahamblog,grahamblog,07/09/1955,0,White,8,J.D.; University of South Carolina; 1981,2,https://www.lgraham.senate.gov/,https://bioguide.congress.gov/search/bio/G000359 ,,
+"Grassley, Chuck",44,Iowa,IA,0,3,0.670073592619545,01/05/1981,12/31/2022,42.013698630137,0,117,60.2,35.7,2016,https://twitter.com/ChuckGrassley,ChuckGrassley,N/A,N/A,09/17/1933,0,White,7,M.A.; Political Science; University of Northern Iowa; 1956,0,https://www.grassley.senate.gov/,https://bioguide.congress.gov/search/bio/G000386,,
+"Hagerty, Bill",45,Tennessee,TN,0,2,0.857410027434407,01/03/2021,12/31/2022,1.99178082191781,0,117,62.2,35.2,2020,https://twitter.com/SenatorHagerty,SenatorHagerty,https://twitter.com/billhagertytn,billhagertytn,08/14/1959,0,White,8,J.D.; Vanderbilt Law School; 1984,0,https://www.hagerty.senate.gov/,https://bioguide.congress.gov/search/bio/H000601,,
+"Hassan, Margaret Wood",46,New Hampshire,NH,1,3,0.43611907238278,01/03/2017,12/31/2022,5.99452054794521,0,117,48,47.9,2016,https://twitter.com/SenatorHassan,SenatorHassan,https://twitter.com/Maggie_Hassan,Maggie_Hassan,02/27/1958,1,White,8,J.D.; Northeastern University School of law; 1985,11,https://www.hassan.senate.gov/,https://bioguide.congress.gov/search/bio/H001076,,
+"Hawley, Josh",47,Missouri,MO,0,1,0.864366195602263,01/03/2019,12/31/2022,3.99452054794521,0,117,51.4,45.6,2018,https://twitter.com/HawleyMO,HawleyMO,N/A,N/A,12/31/1979,0,White,8,J.D.; Yale Law School; 2006,2,https://www.hawley.senate.gov/,https://bioguide.congress.gov/search/bio/H001089,,
+"Heinrich, Martin",48,New Mexico,NM,1,1,0.2007037353465,01/03/2013,12/31/2022,9.9972602739726,0,117,54.1,30.5,2018,https://twitter.com/MartinHeinrich,MartinHeinrich,https://twitter.com/senatorheinrich,senatorheinrich,10/17/1971,0,White,6,B.S.; Mechanical Engineering; University of Missouri; 1995,12,https://www.heinrich.senate.gov/,https://bioguide.congress.gov/search/bio/H001046,,
+"Hickenlooper, John W.",49,Colorado,CO,1,2,0.335030323955882,01/03/2021,12/31/2022,1.99178082191781,0,117,53.5,44.2,2020,https://twitter.com/SenatorHick,SenatorHick,https://twitter.com/hickenlooper,hickenlooper,02/07/1952,0,White,7,M.A.; Geology; Wesleyan University; 1980,0,https://www.hickenlooper.senate.gov/,https://bioguide.congress.gov/search/bio/H000273,,
+"Hirono, Mazie K.",50,Hawaii,HI,1,1,0.0715447123166643,01/03/2013,12/31/2022,9.9972602739726,0,117,71.2,28.8,2018,https://twitter.com/maziehirono,maziehirono,https://twitter.com/mazieforhawaii,mazieforhawaii,11/03/1947,1,Asian,8,J.D.; Georgetown University; 1978,0,https://www.hirono.senate.gov/,https://bioguide.congress.gov/search/bio/H001042,,
+"Hoeven, John",51,North Dakota,ND,0,3,0.815683863264003,01/05/2011,12/31/2022,11.9945205479452,0,117,78.6,17,2016,https://twitter.com/SenJohnHoeven,SenJohnHoeven,N/A,N/A,03/13/1957,0,White,7,M.B.A.; Northwestern University; 1981,12,https://www.hoeven.senate.gov/,https://bioguide.congress.gov/search/bio/H001061,,
+"Hyde-Smith, Cindy",52,Mississippi,MS,0,2,0.868059764299163,04/09/2018,12/31/2022,4.73150684931507,0,117,54.1,44.1,2020,https://twitter.com/SenHydeSmith,SenHydeSmith,https://twitter.com/cindyhydesmith,cindyhydesmith,05/10/1959,1,White,6,"B.A.; Criminal justice, political science; University of Southern Mississippi; 1981",0,https://www.hydesmith.senate.gov/,https://bioguide.congress.gov/search/bio/H001079 ,,
+"Inhofe, James",53,Oklahoma,OK,0,2,0.880238318204784,11/17/1994,12/31/2022,28.1397260273973,1,117,62.9,32.8,2020,https://twitter.com/JimInhofe,JimInhofe,N/A,N/A,11/17/1934,0,White,6,B.A.; Economics; University of Tulsa; 1973,0,N/A,https://bioguide.congress.gov/search/bio/I000024 ,,
+"Johnson, Ron",54,Wisconsin,WI,0,3,0.743401705863958,01/05/2011,12/31/2022,11.9945205479452,0,117,50.2,46.8,2016,https://twitter.com/SenRonJohnson,SenRonJohnson,https://twitter.com/ronjohnsonwi,ronjohnsonwi,04/08/1955,0,White,6,B.S.; Business and Accounting; University of Minnesota; 1977,4,https://www.ronjohnson.senate.gov/,https://bioguide.congress.gov/search/bio/J000293,,
+"Kaine, Tim",55,Virginia,VA,1,1,0.203600708089391,01/03/2013,12/31/2022,9.9972602739726,0,117,57.1,41.1,2018,https://twitter.com/timkaine,timkaine,N/A,N/A,02/26/1958,0,White,8,J.D.; Harvard University; 1983,11,https://www.kaine.senate.gov/,https://bioguide.congress.gov/search/bio/K000384,,
+"Kelly, Mark",56,Arizona,AZ,1,3,0.399793347847799,12/02/2020,12/31/2022,2.07945205479452,0,117,51.2,48.8,2020,https://twitter.com/SenMarkKelly,SenMarkKelly,https://twitter.com/CaptMarkKelly,CaptMarkKelly,02/21/1964,0,White,7,M.S.; Aeronautical Engineering; U.S. Naval Postgraduate School,3,https://www.kelly.senate.gov/,https://bioguide.congress.gov/search/bio/K000377,,
+"Kennedy, John Neely",57,Louisiana,LA,0,3,0.785684351248518,01/03/2017,12/31/2022,5.99452054794521,0,117,60.7,39.3,2016,https://twitter.com/SenJohnKennedy,SenJohnKennedy,https://twitter.com/JohnKennedyLA,JohnKennedyLA,11/21/1951,0,White,8,J.D.; University of Virginia School of LAw; 1977,11,https://www.kennedy.senate.gov/,https://bioguide.congress.gov/search/bio/K000393,,
+"King, Angus S., Jr.",58,Maine,ME,2,1,0.346033257048853,01/03/2013,12/31/2022,9.9972602739726,0,117,54.3,35.2,2018,https://twitter.com/SenAngusKing,SenAngusKing,N/A,N/A,03/31/1944,0,White,8,J.D.; University of Virginia; 1969,2,https://www.king.senate.gov/,https://bioguide.congress.gov/search/bio/K000383 ,,
+"Klobuchar, Amy",59,Minnesota,MN,1,1,0.130504324943533,01/04/2007,12/31/2022,16,0,117,60.3,36.2,2018,https://twitter.com/SenAmyKlobuchar,SenAmyKlobuchar,https://twitter.com/amyklobuchar,amyklobuchar,05/25/1960,1,White,8,"J.D.; University of Chicago, 1985",2,https://www.klobuchar.senate.gov/,https://bioguide.congress.gov/search/bio/K000367 ,,
+"Lankford, James",60,Oklahoma,OK,0,3,0.89992933687588,01/03/2015,12/31/2022,7.9972602739726,0,117,67.7,24.6,2016,https://twitter.com/SenatorLankford,SenatorLankford,https://twitter.com/jameslankford,jameslankford,03/04/1968,0,White,7,M.Div.; Southwestern Theological Baptist Seminary; 1994,5,https://www.lankford.senate.gov/,https://bioguide.congress.gov/search/bio/L000575,,
+"Leahy, Patrick",61,Vermont,VT,1,3,0.144121081911654,01/14/1975,12/31/2022,47.9945205479452,1,117,61.3,33,2016,https://twitter.com/SenatorLeahy,SenatorLeahy,N/A,N/A,03/31/1940,0,White,8,J.D.; Georgetown University; 1964,2,N/A,https://bioguide.congress.gov/search/bio/L000174,,
+"Lee, Mike",62,Utah,UT,0,3,0.753748787807473,01/05/2011,12/31/2022,11.9945205479452,0,117,68,27.4,2016,https://twitter.com/SenMikeLee,SenMikeLee,https://twitter.com/BasedMikeLee,BasedMikeLee,06/04/1971,0,White,8,J.D.; Brigham Young university; 1997,2,https://www.lee.senate.gov/,https://bioguide.congress.gov/search/bio/L000577,,
+"Luján, Ben Ray",63,New Mexico,NM,1,2,0.174860888138848,01/03/2021,12/31/2022,1.99178082191781,0,117,51.7,45.6,2020,https://twitter.com/SenatorLujan,SenatorLujan,https://twitter.com/benraylujan,benraylujan,06/07/1972,0,Hispanic,6,B.B.A.; New Mexico Highlands University; 2007,0,https://www.lujan.senate.gov/,https://bioguide.congress.gov/search/bio/L000570 ,,
+"Lummis, Cynthia M.",64,Wyoming,WY,0,2,0.893292958108508,01/03/2021,12/31/2022,1.99178082191781,0,117,73.1,26.9,2020,https://twitter.com/SenLummis,SenLummis,https://twitter.com/CynthiaMLummis,CynthiaMLummis,09/10/1954,1,White,8,"J.D.; University of Wyoming College of Law, Laramie, Wyo.; 1985",11,https://www.lummis.senate.gov/,https://bioguide.congress.gov/search/bio/L000571 ,,
+"Manchin, Joe, III",65,West Virginia,WV,1,1,0.446686774398077,11/15/2010,12/31/2022,12.1342465753425,0,117,49.6,46.3,2018,https://twitter.com/Sen_JoeManchin,Sen_JoeManchin,https://twitter.com/JoeManchinWV,JoeManchinWV,08/24/1947,0,White,6,B.A.; Business Administration; West Virginia University; 1970,12,https://www.manchin.senate.gov/,https://bioguide.congress.gov/search/bio/M001183 ,,
+"Markey, Edward J.",66,Massachusetts,MA,1,2,0.0139659683705929,07/16/2013,12/31/2022,9.46575342465753,0,117,66.2,33,2020,https://twitter.com/SenMarkey,SenMarkey,https://twitter.com/edmarkey,edmarkey,07/11/1946,0,White,8,J.D.; Boston College Law School; 1972,11,https://www.markey.senate.gov/,https://bioguide.congress.gov/search/bio/M000133,,
+"Marshall, Roger",67,Kansas,KS,0,2,0.882124792228652,01/03/2021,12/31/2022,1.99178082191781,0,117,53.2,41.8,2020,https://twitter.com/SenatorMarshall,SenatorMarshall,https://twitter.com/RogerMarshallMD,RogerMarshallMD,08/09/1960,0,White,7,M.D.; University of Kansas School of Medicine; 1987,6,https://www.marshall.senate.gov/,https://bioguide.congress.gov/search/bio/M001198,,
+"McConnell, Mitch",68,Kentucky,KY,0,2,0.599687533584357,01/03/1985,12/31/2022,38.0164383561644,0,117,57.8,38.2,2020,https://twitter.com/LeaderMcConnell,LeaderMcConnell,N/A,N/A,02/20/1942,0,White,8,J.D.; Kentucky Law School; 1967,11,https://www.mcconnell.senate.gov/,https://bioguide.congress.gov/search/bio/M000355,,
+"Menendez, Robert",69,New Jersey,NJ,1,1,0.191515157461704,01/18/2006,12/31/2022,16.9616438356164,0,117,54,42.8,2018,https://twitter.com/SenatorMenendez,SenatorMenendez,N/A,N/A,01/01/1954,0,Hispanic,8,J.D.; Rutgers university of Law; 1979,11,https://www.menendez.senate.gov/,https://bioguide.congress.gov/search/bio/M000639,,
+"Merkley, Jeff",70,Oregon,OR,1,2,0.0355414098997263,01/06/2009,12/31/2022,13.9917808219178,0,117,56.9,39.3,2020,https://twitter.com/SenJeffMerkley,SenJeffMerkley,https://twitter.com/jeffmerkley,jeffmerkley,10/24/1956,0,White,7,M.P.A.; Princeton University; 1982,0,https://www.merkley.senate.gov/,https://bioguide.congress.gov/search/bio/M001176,,
+"Moran, Jerry",71,Kansas,KS,0,3,0.716270292467902,01/05/2011,12/31/2022,11.9945205479452,0,117,62.4,32.1,2016,https://twitter.com/JerryMoran,JerryMoran,N/A,N/A,05/29/1954,0,White,8,J.D.; Kansas University School of Law; 1981,11,https://www.moran.senate.gov/public/,https://bioguide.congress.gov/search/bio/M000934 ,,
+"Murkowski, Lisa",72,Alaska,AK,0,3,0.473296745648617,12/20/2002,12/31/2022,20.0438356164384,0,117,44.3,29.5,2016,https://twitter.com/lisamurkowski,lisamurkowski,https://twitter.com/lisaforsenate,lisaforsenate,05/22/1957,1,White,8,J.D.; Willamette College of Law; 1985,2,https://www.murkowski.senate.gov/,https://bioguide.congress.gov/search/bio/M001153,,
+"Murphy, Christopher",73,Connecticut,CT,1,1,0.152635018959264,01/03/2013,12/31/2022,9.9972602739726,0,117,59.5,39.4,2018,https://twitter.com/ChrisMurphyCT,ChrisMurphyCT,N/A,N/A,08/03/1973,0,White,8,J.D.; University of Connecticut; 2002,11,https://www.murphy.senate.gov/,https://bioguide.congress.gov/search/bio/M001169,,
+"Murray, Patty",74,Washington,WA,1,3,0.142703588817088,01/05/1993,12/31/2022,30.0054794520548,0,117,59.1,40.9,2016,https://twitter.com/PattyMurray,PattyMurray,https://twitter.com/murraycampaign,murraycampaign,10/11/1950,1,White,6,B.A.; Physical Education; Washington State University; 1972,5,https://www.murray.senate.gov/,https://bioguide.congress.gov/search/bio/M001111,,
+"Ossoff, Jon",75,Georgia,GA,1,2,0.303405364928085,01/20/2021,12/31/2022,1.94520547945205,0,117,50.6,49.4,2020,https://twitter.com/SenOssoff,SenOssoff,https://twitter.com/ossoff,ossoff,02/16/1987,0,White,7,M.S.; International Politicla Economy; London School of Economics; 2013,7,https://www.ossoff.senate.gov/,https://bioguide.congress.gov/search/bio/O000174,,
+"Padilla, Alex",76,California,CA,1,3,0.0200324383981554,01/20/2021,12/31/2022,1.94520547945205,0,117,N/A,N/A,*,https://twitter.com/SenAlexPadilla,SenAlexPadilla,https://twitter.com/AlexPadilla4CA,AlexPadilla4CA,03/22/1973,0,Hispanic,6,B.S.; Mechanical Engineering; MIT; 1994,9,https://www.padilla.senate.gov/,https://bioguide.congress.gov/search/bio/P000145,appointed in 2020 to replace Kamala Harris ,
+"Paul, Rand",77,Kentucky,KY,0,3,0.684883322748808,01/05/2011,12/31/2022,11.9945205479452,0,117,57.3,42.7,2016,https://twitter.com/senrandpaul,senrandpaul,https://twitter.com/RandPaul,RandPaul,01/07/1963,0,White,7,M.D.; Duke University; 1988,6,https://www.paul.senate.gov/,https://bioguide.congress.gov/search/bio/P000603,,
+"Peters, Gary C.",78,Michigan,MI,1,2,0.355796587683312,01/06/2015,12/31/2022,7.98904109589041,0,117,49.9,48.2,2020,https://twitter.com/SenGaryPeters,SenGaryPeters,https://twitter.com/garypeters,garypeters,12/01/1958,0,White,8,J.D.; Wayne State University; 1989,2,https://www.peters.senate.gov/,https://bioguide.congress.gov/search/bio/P000595,,
+"Portman, Robert",79,Ohio,OH,0,3,0.548120690430407,01/05/2011,12/31/2022,11.9945205479452,1,117,58.3,36.9,2016,https://twitter.com/senrobportman,senrobportman,N/A,N/A,12/19/1955,0,White,8,J.D.; University of Michigan; 1985,2,N/A,https://bioguide.congress.gov/search/bio/P000449,,
+"Reed, John F.",80,Rhode Island,RI,1,2,0.145861826443275,01/07/1997,12/31/2022,25.9972602739726,0,117,66.6,33.4,2020,https://twitter.com/SenJackReed,SenJackReed,N/A,N/A,11/12/1949,0,White,8,J.D.; Harvard University; 1982,2,https://www.reed.senate.gov/,https://bioguide.congress.gov/search/bio/R000122,,
+"Risch, James E.",81,Idaho,ID,0,2,0.82910906209038,01/06/2009,12/31/2022,13.9917808219178,0,117,62.6,33.2,2020,https://twitter.com/SenatorRisch,SenatorRisch,N/A,N/A,05/03/1943,0,White,8,J.D.; University of Idaho; 1968,2,https://www.risch.senate.gov/,https://bioguide.congress.gov/search/bio/R000584,,
+"Romney, Mitt",82,Utah,UT,0,1,0.596688837978771,01/03/2019,12/31/2022,3.99452054794521,0,117,62.6,30.9,2018,https://twitter.com/SenatorRomney,SenatorRomney,https://twitter.com/mittromney,mittromney,03/12/1947,0,White,7,M.B.A.; Harvard Business School; 1975,1,https://www.romney.senate.gov/,https://bioguide.congress.gov/search/bio/R000615,,
+"Rosen, Jacky",83,Nevada,NV,1,1,0.308548351377894,01/03/2019,12/31/2022,3.99452054794521,0,117,50.4,45.4,2018,https://twitter.com/SenJackyRosen,SenJackyRosen,https://twitter.com/RosenforNevada,RosenforNevada,08/02/1957,1,White,6,B.A.; Psychology; University of Minnesota; 1979,1,https://www.rosen.senate.gov/,https://bioguide.congress.gov/search/bio/R000608,,
+"Rounds, Mike",84,South Dakota,SD,0,2,0.784008560585577,01/06/2015,12/31/2022,7.98904109589041,0,117,65.7,34.3,2020,https://twitter.com/SenatorRounds,SenatorRounds,N/A,N/A,10/24/1954,0,White,6,B.S.; Political Science; South Dakota State University; 1977,1,https://www.rounds.senate.gov/,https://bioguide.congress.gov/search/bio/R000605,,
+"Rubio, Marco",85,Florida,FL,0,3,0.831181764071725,01/05/2011,12/31/2022,11.9945205479452,0,117,52,44.3,2016,https://twitter.com/senmarcorubio,senmarcorubio,https://twitter.com/marcorubio,marcorubio,05/28/1971,0,Hispanic,8,J.D.; University of Miami; 1996,2,https://www.rubio.senate.gov/,https://bioguide.congress.gov/search/bio/R000595,,
+"Sanders, Bernard",86,Vermont,VT,2,1,0,01/04/2007,12/31/2022,16,0,117,67.4,27.5,2018,https://twitter.com/SenSanders,SenSanders,https://twitter.com/BernieSanders,BernieSanders,09/08/1941,0,White,6,B.A.; Political Science; University of Chicago; 1964,0,https://www.sanders.senate.gov/,https://bioguide.congress.gov/search/bio/S000033,,
+"Sasse, Benjamin",87,Nebraska,NE,0,2,0.684229649213868,01/06/2015,12/31/2022,7.98904109589041,1,117,62.7,24.4,2020,https://twitter.com/sensasse,sensasse,https://twitter.com/BenSasse,BenSasse,02/22/1972,0,White,8,PhD in History; Yale University; 2004,5,N/A,https://bioguide.congress.gov/search/bio/S001197,,
+"Schatz, Brian",88,Hawaii ,HI,1,3,0.213250458593456,12/27/2012,12/31/2022,10.0164383561644,0,117,73.6,22.2,2016,https://twitter.com/brianschatz,brianschatz,https://twitter.com/SenBrianSchatz,SenBrianSchatz,10/20/1972,0,White,6,B.A.; Philosophy; Pomona College; 1994,5,https://www.schatz.senate.gov/,https://bioguide.congress.gov/search/bio/S001194,,
+"Schumer, Charles E.",89,New York,NY,1,3,0.239789022209428,01/06/1999,12/31/2022,24,0,117,70.4,27.4,2016,https://twitter.com/SenSchumer,SenSchumer,https://twitter.com/chuckschumer,chuckschumer,11/23/1950,0,White,8,J.D.; Harvard University; 1974,2,https://www.schumer.senate.gov/,https://bioguide.congress.gov/search/bio/S000148 ,,
+"Scott, Rick",90,Florida,FL,0,1,1,01/08/2019,12/31/2022,3.98082191780822,0,117,50.1,49.9,2018,https://twitter.com/SenRickScott,SenRickScott,https://twitter.com/scottforflorida,scottforflorida,12/01/1952,0,White,8,J.D.; Southern Methodist University; 1978,2,https://www.rickscott.senate.gov/,https://bioguide.congress.gov/search/bio/S001217,,
+"Scott, Tim",91,South Carolina,SC,0,3,0.781356077518849,01/03/2013,12/31/2022,9.9972602739726,0,117,60.6,37,2016,https://twitter.com/SenatorTimScott,SenatorTimScott,https://twitter.com/votetimscott,votetimscott,09/19/1965,0,African-American,6,B.S.; Political Science; Charleston Southern University; 1988 ,1,https://www.scott.senate.gov/,https://bioguide.congress.gov/search/bio/S001184,,
+"Shaheen, Jeanne",92,New Hampshire,NH,1,2,0.2925665319541,01/06/2009,12/31/2022,13.9917808219178,0,117,56.6,41,2020,https://twitter.com/SenatorShaheen,SenatorShaheen,https://twitter.com/JeanneShaheen,JeanneShaheen,01/28/1947,1,White,7,M.S.S.; University of Mississippi; 1973,5,https://www.shaheen.senate.gov/,https://bioguide.congress.gov/search/bio/S001181,,
+"Shelby, Richard",93,Alabama,AL,0,3,0.577739000839365,01/06/1987,12/31/2022,36.0082191780822,1,117,64.2,35.8,2016,https://twitter.com/SenShelby,SenShelby,N/A,N/A,05/06/1934,0,White,6,LL.B.; University of Alabama; 1963,2,N/A,https://bioguide.congress.gov/search/bio/S000320,,
+"Sinema, Kyrsten",94,Arizona,AZ,2,1,0.500967034663567,01/03/2019,12/31/2022,3.99452054794521,0,117,50,47.6,2018,https://twitter.com/SenatorSinema,SenatorSinema,https://twitter.com/kyrstensinema,kyrstensinema,07/12/1976,1,White,8,PhD in Justice Studies; Arizona State University; 2012,2,https://www.sinema.senate.gov/,https://bioguide.congress.gov/search/bio/S001191,,
+"Smith, Tina",95,Minnesota,MN,1,2,0.0756533259297989,01/03/2018,12/31/2022,4.99452054794521,0,117,48.8,43.5,2020,https://twitter.com/SenTinaSmith,SenTinaSmith,https://twitter.com/TinaSmithMN,TinaSmithMN,03/04/1958,1,White,7,M.B.A. Dartmouth College; 1984,1,https://www.smith.senate.gov/,https://bioguide.congress.gov/search/bio/S001203,,
+"Stabenow, Debbie",96,Michigan,MI,1,1,0.221949395648287,01/03/2001,12/31/2022,22.0054794520548,0,117,52.3,45.8,2018,https://twitter.com/SenStabenow,SenStabenow,https://twitter.com/stabenow,stabenow,04/29/1950,1,White,7,M.S.W.; Michigan State University; 1975,5,https://www.stabenow.senate.gov/,https://bioguide.congress.gov/search/bio/S000770,,
+"Sullivan, Dan",97,Alaska,AK,0,2,0.652100683642255,01/06/2015,12/31/2022,7.98904109589041,0,117,53.9,41.2,2020,https://twitter.com/SenDanSullivan,SenDanSullivan,N/A,N/A,11/13/1964,0,White,8,J.D.; Georgetown University; 1993,2,https://www.sullivan.senate.gov/,https://bioguide.congress.gov/search/bio/S001198,,
+"Tester, Jon",98,Montana,MT,1,1,0.377646486433112,01/04/2007,12/31/2022,16,0,117,50.3,46.8,2018,https://twitter.com/SenatorTester,SenatorTester,https://twitter.com/jontester,jontester,08/21/1956,0,White,6,B.A.; Music; University of Providence; 1978,10,https://www.tester.senate.gov/,https://bioguide.congress.gov/search/bio/T000464 ,,
+"Thune, John",99,South Dakota,SD,0,3,0.795060855902239,01/04/2005,12/31/2022,18,0,117,71.8,28.2,2016,https://twitter.com/SenJohnThune,SenJohnThune,https://twitter.com/johnthune,johnthune,01/07/1961,0,White,7,M.B.A.; University of South Dakota; 1984,1,https://www.thune.senate.gov/,https://bioguide.congress.gov/search/bio/T000250 ,,
+"Tillis, Thom",100,North Carolina,NC,0,2,0.819146177750934,01/06/2015,12/31/2022,7.98904109589041,0,117,48.7,46.9,2020,https://twitter.com/SenThomTillis,SenThomTillis,https://twitter.com/ThomTillis,ThomTillis,08/30/1960,0,White,6,B.S.; Technology Management; University of Maryland; 1996,1,https://www.tillis.senate.gov/,https://bioguide.congress.gov/search/bio/T000476 ,,
+"Toomey, Patrick",101,Pennsylvania,PA,0,3,0.607637714921737,01/05/2011,12/31/2022,11.9945205479452,1,117,48.9,47.2,2016,https://twitter.com/SenToomey,SenToomey,https://twitter.com/pattoomey,pattoomey,11/17/1961,0,White,6,A.B.; Government; Harvard College; 1984,1,N/A,https://bioguide.congress.gov/search/bio/T000461 ,,
+"Tuberville, Tommy",102,Alabama,AL,0,2,0.808701355452043,01/03/2021,12/31/2022,1.99178082191781,0,117,60.1,39.7,2020,https://twitter.com/SenTuberville,SenTuberville,https://twitter.com/TTuberville,TTuberville,09/18/1954,0,White,6,"B.S., physical education, Southern Arkansas University, 1976",5,https://www.tuberville.senate.gov/,https://bioguide.congress.gov/search/bio/T000278 ,,
+"Van Hollen, Chris",103,Maryland,MD,1,3,0.117646768842011,01/03/2017,12/31/2022,5.99452054794521,0,117,60.4,36.4,2016,https://twitter.com/ChrisVanHollen,ChrisVanHollen,N/A,N/A,01/10/1959,0,White,8,J.D.; Georgetown university; 1990,2,https://www.vanhollen.senate.gov/,https://bioguide.congress.gov/search/bio/V000128,,
+"Warner, Mark R.",104,Virginia,VA,1,2,0.33022168507113,01/06/2009,12/31/2022,13.9917808219178,0,117,56,44,2020,https://twitter.com/SenatorWarner,SenatorWarner,https://twitter.com/MarkWarner,MarkWarner,12/15/1954,0,White,8,J.D.; Harvard Law School; 1980,1,https://www.warner.senate.gov/,https://bioguide.congress.gov/search/bio/W000805 ,,
+"Warnock, Raphael G.",105,Georgia,GA,1,3,0.464158242867696,01/20/2021,12/31/2022,1.94520547945205,0,117,51,49,2020,https://twitter.com/SenatorWarnock,SenatorWarnock,https://twitter.com/ReverendWarnock,ReverendWarnock,07/23/1969,0,African-American,8,PhD in Philosophy; Union Theological Seminary; ,8,https://www.warnock.senate.gov/,https://bioguide.congress.gov/search/bio/W000790,,
+"Warren, Elizabeth",106,Massachusetts,MA,1,1,0.0583875007437665,01/03/2013,12/31/2022,9.9972602739726,0,117,60.4,36.2,2018,https://twitter.com/SenWarren,SenWarren,https://twitter.com/ewarren,ewarren,06/22/1949,1,White,8,J.D.; Rutgers University; 1976,2,https://www.warren.senate.gov/,https://bioguide.congress.gov/search/bio/W000817 ,,
+"Whitehouse, Sheldon",107,Rhode Island,RI,1,1,0.124737669119195,01/04/2007,12/31/2022,16,0,117,61.6,38.4,2018,https://twitter.com/SenWhitehouse,SenWhitehouse,N/A,N/A,10/20/1955,0,White,8,J.D.; University of Virginia; 1982,2,https://www.whitehouse.senate.gov/,https://bioguide.congress.gov/search/bio/W000802,,
+"Wicker, Roger F.",108,Mississippi,MS,0,1,0.763788502839721,12/31/2007,12/31/2022,15.0109589041096,0,117,58.5,39.5,2018,https://twitter.com/SenatorWicker,SenatorWicker,https://twitter.com/RogerWicker,RogerWicker,07/05/1951,0,White,8,J.D.; University of Mississippi; 1975,2,https://www.wicker.senate.gov/,https://bioguide.congress.gov/search/bio/W000437,,
+"Wyden, Ron",109,Oregon,OR,1,3,0.0591413132623803,02/05/1996,12/31/2022,26.9205479452055,0,117,56.7,33.6,2016,https://twitter.com/RonWyden,RonWyden,N/A,N/A,05/03/1949,0,White,8,J.D.; University of Oregon; 1974,2,https://www.wyden.senate.gov/,https://bioguide.congress.gov/search/bio/W000779,,
+"Young, Todd",110,Indiana,IN,0,3,0.677696674158218,01/05/2011,12/31/2022,11.9945205479452,1,117,52.1,42.4,2016,https://twitter.com/SenToddYoung,SenToddYoung,https://twitter.com/ToddYoungIN,ToddYoungIN,08/24/1972,0,White,8,J.D.; Robert H. McKinney; 2006,2,https://www.young.senate.gov/,https://bioguide.congress.gov/search/bio/Y000064,,
--- a/data/OUT/.gitignore
+++ b/data/OUT/.gitignore
@ -0,0 +1,8 @@
+/ALL-SENATORS-TWEETS.csv
+/Pretest-Prep.csv
+/Pretest-Results.csv
+/Pretest-SENATORS-TWEETS.csv
+/SenatorsTweets-Final.csv
+/SenatorsTweets-OnlyCov.csv
+/Tweets-Classified-Prep.csv
+/Tweets-Stub.csv
--- a/data/OUT/graphs/.gitignore
+++ b/data/OUT/graphs/.gitignore
@ -0,0 +1,3 @@
+/Timeline.png
+/Wordcloud-All.png
+/Wordcloud-Cov.png
--- a/funs/CleanTweets.py
+++ b/funs/CleanTweets.py
@ -0,0 +1,89 @@
+import re
+import string
+
+def preprocess_roberta(text): # https://huggingface.co/cardiffnlp/twitter-roberta-base-sep2022
+    """Mask user mentions and links the way the twitter-roberta models expect.
+
+    Args:
+        text (str): raw tweet text.
+
+    Returns:
+        str: whitespace-normalised text in which every mention token is
+            '@user' and every token starting with 'http' is 'http'.
+    """
+    def _mask(token):
+        # single-character tokens (e.g. a lone '@') are passed through untouched
+        if len(token) <= 1:
+            return token
+        # only tokens with exactly one '@' at the front count as a mention
+        if token.startswith('@') and token.count('@') == 1:
+            return '@user'
+        if token.startswith('http'):
+            return 'http'
+        return token
+
+    return ' '.join(_mask(token) for token in text.split())
+
+def remove_URL(text):
+    """Strip http(s):// and www. links from a text.
+
+    Args:
+        text (str): raw tweet text.
+
+    Returns:
+        str: text with every link replaced by the empty string.
+    """
+    # The pattern is a constant, so compiling it cannot fail. The former bare
+    # try/except around re.compile could only have left `url` unbound.
+    url = re.compile(r'https?://\S+|www\.\S+')
+    return url.sub(r'', text)
+
+def remove_emoji(text):
+    """Delete emoji and other symbol code points from a text.
+
+    Args:
+        text (str): text to clean.
+
+    Returns:
+        str: text without any character that falls into one of the ranges
+            below. The last range (U+24C2 to U+1F251) is very wide and also
+            covers many non-emoji characters.
+    """
+    ranges = (
+        '\U0001F600-\U0001F64F',  # emoticons
+        '\U0001F300-\U0001F5FF',  # symbols & pictographs
+        '\U0001F680-\U0001F6FF',  # transport & map symbols
+        '\U0001F1E0-\U0001F1FF',  # flags (iOS)
+        '\U00002702-\U000027B0',
+        '\U000024C2-\U0001F251',
+    )
+    emoji_pattern = re.compile('[' + ''.join(ranges) + ']+', flags=re.UNICODE)
+    return emoji_pattern.sub(r'', text)
+
+def remove_html(text):
+    """Remove HTML tags and character entities (named, decimal, hex).
+
+    Args:
+        text (str): text to clean.
+
+    Returns:
+        str: text without tags such as '<b>' and entities such as '&amp;'.
+    """
+    tag_or_entity = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
+    return re.compile(tag_or_entity).sub('', text)
+
+def remove_punct(text):
+    """Drop every ASCII punctuation character (string.punctuation).
+
+    Args:
+        text (str): text to clean.
+
+    Returns:
+        str: text without punctuation characters.
+    """
+    # mapping a code point to None deletes it during translate()
+    drop = dict.fromkeys(map(ord, string.punctuation))
+    return text.translate(drop)
+
+def remove_nonascii(text):
+    """Keep only characters in the 7-bit ASCII range.
+
+    Args:
+        text (str): text to clean.
+
+    Returns:
+        str: text without any character above U+007F.
+    """
+    ascii_only = re.compile(r'[^\x00-\x7F]+')
+    return ascii_only.sub('', text)
+
+def remove_spec(text):
+    """Translate the entities '&amp;', '&lt;' and '&gt;' to plain text.
+
+    Args:
+        text (str): text to clean.
+
+    Returns:
+        str: text with '&amp' / '&amp;' turned into 'and', '&lt;' into '<'
+            and '&gt;' into '>'.
+    """
+    substitutions = (
+        (r'&amp;?', r'and'),
+        (r'&lt;', r'<'),
+        (r'&gt;', r'>'),
+    )
+    for pattern, replacement in substitutions:
+        text = re.sub(pattern, replacement, text)
+    return text
+
+def remove_spaces(text): # also new line chars and to lower case
+    """Flatten a text to one lower-case line with single spaces.
+
+    Args:
+        text (str): text to clean.
+
+    Returns:
+        str: text with '&lt;' replaced by '<', line breaks replaced by
+            spaces, lower-cased, trimmed, and runs of whitespace collapsed.
+    """
+    unescaped = re.sub(r'&lt;', r'<', text)
+    one_line = " ".join(unescaped.splitlines())  # remove newline characters
+    return re.sub(r'\s{2,}', ' ', one_line.lower().strip())
+
+def remove_retw(text):
+    """Remove retweet markers ('RT @name') and any remaining @mentions.
+
+    Args:
+        text (str): text to clean.
+
+    Returns:
+        str: text without retweet prefixes and mentions; surrounding
+            whitespace is left in place.
+    """
+    # order matters: the retweet marker has to go before the bare mentions
+    for pattern in (r'(RT|rt)[ ]*@[ ]*[\S]+', r'@[\S]+'):
+        text = re.sub(pattern, '', text)
+    return text
+
+def preprocess_text(text):
+    """Run the complete cleaning chain on a single tweet.
+
+    Args:
+        text (str): raw tweet text.
+
+    Returns:
+        str: text after all cleaning steps were applied in the order below.
+
+    NOTE(review): remove_punct runs before remove_spec and remove_retw and
+    strips '&', ';', '<', '>' and '@', so those two later steps look like
+    no-ops in this order - confirm whether the order is intended.
+    """
+    steps = (
+        remove_URL,
+        remove_emoji,
+        remove_html,
+        remove_punct,
+        remove_nonascii,
+        remove_spec,
+        remove_spaces,
+        remove_retw,
+    )
+    for step in steps:
+        text = step(text)
+    return text
+
+def preprocess_text_series(series):
+    """Run the complete cleaning chain on every element of a series.
+
+    Args:
+        series: object offering ``apply`` (e.g. a pandas Series of tweet texts).
+
+    Returns:
+        The series after each cleaning step was applied element-wise, in
+        the order below.
+
+    NOTE(review): remove_punct runs before remove_spec and remove_retw and
+    strips '&', ';', '<', '>' and '@', so those two later steps look like
+    no-ops in this order - confirm whether the order is intended.
+    """
+    steps = (
+        remove_URL,
+        remove_emoji,
+        remove_html,
+        remove_punct,
+        remove_nonascii,
+        remove_spec,
+        remove_spaces,
+        remove_retw,
+    )
+    for step in steps:
+        series = series.apply(step)
+    return series
+
+# Check all functions:
+# Smoke test of the whole cleaning chain. It sits at module level, so it is
+# executed every time this module is imported; the result is kept in
+# processed_text and only printed when the last line is uncommented.
+input_text = """
+    Check out this amazing website: https://www.example.com! 😃
+    <html>This is an HTML tag.</html>
+    RT @user123: Just received a package from @companyXYZ. It's awesome! 📦
+    This is a test text with lots of punctuations!!! Can't wait to see more...
+"""
+processed_text = preprocess_text(input_text)
+# print(processed_text)
--- a/funs/Scrape.py
+++ b/funs/Scrape.py
@ -63,4 +63,55 @@ def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix,
    # save short csv
    tweet_df.to_csv(csv_path, encoding='utf-8')
    # sleep 1 second to not get blocked because of excessive requests
-    time.sleep(0.5)
+    time.sleep(0.5)
+
+def getHandles(di):
+    """grabs accounts from senators-raw.csv
+
+    Args:
+        di (str): path to the directory containing senators-raw.csv
+            (including the trailing path separator)
+
+    Returns:
+        list: list containing str of senator account handles; all main
+            handles first, followed by the non-empty alternative handles
+    """
+    # parse the file once instead of once per column
+    senators = pd.read_csv(f"{di}senators-raw.csv")
+    accounts = senators["twitter_handle"].tolist()
+    alt_accounts = senators["alt_handle"].tolist()
+    alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields
+    accounts.extend(alt_accounts)
+    return accounts
+
+def printHandles(accounts):
+    """returns string with all accounts in a readable way.
+
+    Args:
+        accounts (list): list of str with handles
+
+    Returns:
+        str: containing text that can be written to txtfile; five centred
+            handles per line followed by the total number of accounts
+    """
+    txt = ["Accounts to be scraped:\n"]
+    for i, acc in enumerate(accounts): # print 5 accounts per line
+        txt.append(f"{acc:^17}") # twitter handle max length = 15 chars
+        if i % 5 == 4:
+            txt.append(" \n")
+    # len() instead of the last loop index: the index is one short of the
+    # real total and does not exist at all for an empty list
+    txt.append(f"\n{len(accounts)} accounts in total.")
+    return ''.join(txt)
+
+def scrapeUsers(handle, userDFColumns, maxTweets=1):
+    """fetches profile fields of an account from its tweets.
+
+    Args:
+        handle (str): twitter handle, queried as 'from:<handle>'
+        userDFColumns (list): attribute names to read from tweet.user
+            (dotted paths are followed)
+        maxTweets (int, optional): number of tweets to inspect. Defaults to 1.
+
+    Returns:
+        list: one value per entry of userDFColumns, taken from the last
+            inspected tweet; empty list if no tweet was inspected
+    """
+    # local import keeps the block self-contained; attrgetter replaces eval()
+    from operator import attrgetter
+
+    currentTime = datetime.now()
+    userList = []
+    # str() first: datetime treats a format spec as a strftime pattern, so
+    # '<30' applied to the datetime itself would be printed literally
+    print(f'{str(currentTime):<30} Fetching: {handle:>15}')
+    query = f'from:{handle}'
+
+    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
+        # '>=' so that exactly maxTweets tweets are inspected
+        if i >= maxTweets:
+            break
+        # Get user data; every tweet overwrites the values of the previous one
+        userList = [attrgetter(col)(tweet.user) for col in userDFColumns]
+
+    # Create dataframe using userList and userDFColumns
+    #df = pd.DataFrame(userList, columns=userDFColumns)
+    return userList
--- a/models/CovClass/2023-08-15_01-55-11/statsTopicClassification-2023-08-15_01-53-12.csv
+++ b/models/CovClass/2023-08-15_01-55-11/statsTopicClassification-2023-08-15_01-53-12.csv
@ -0,0 +1,7 @@
+epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
+1,0.39025546515679493,0.40877932761593355,0.9103260869565217,0:10:21,0:00:40
+2,0.3057803610952067,0.3502063500978377,0.9103260869565217,0:10:53,0:00:43
+3,0.17910970049364833,0.27903796154904464,0.9375,0:10:30,0:00:38
+4,0.09279396105943587,0.41342766528301267,0.904891304347826,0:11:03,0:00:43
+5,0.06132459050129317,0.4468563502887264,0.9239130434782609,0:12:07,0:00:44
+6,0.04195396880810895,0.4350045176675928,0.9266304347826086,0:11:21,0:00:40
--- a/models/CovClass/2023-08-15_02-14-21/2023-08-15_02-14-21.csv
+++ b/models/CovClass/2023-08-15_02-14-21/2023-08-15_02-14-21.csv
@ -0,0 +1,7 @@
+epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
+1,0.6699380816093513,0.6216431430407933,0.6964285714285714,0:01:03,0:00:02
+2,0.6649796058024678,0.621175297669002,0.6964285714285714,0:01:03,0:00:01
+3,0.642247314964022,0.6377243144171578,0.6964285714285714,0:01:05,0:00:02
+4,0.6300328698541436,0.6038827853543418,0.6964285714285714,0:01:04,0:00:02
+5,0.544977219509227,0.6619421115943364,0.625,0:01:02,0:00:02
+6,0.3951783587357828,0.48477122613361906,0.7857142857142857,0:01:05,0:00:01
--- a/models/CovClass/2023-08-15_05-56-50/2023-08-15_05-56-50.csv
+++ b/models/CovClass/2023-08-15_05-56-50/2023-08-15_05-56-50.csv
@ -0,0 +1,7 @@
+epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
+1,0.5610552686641376,0.4569096086310089,0.9116022099447514,0:37:20,0:00:31
+2,0.43647773836513126,0.5441495520680196,0.9005524861878453,0:36:14,0:00:30
+3,0.288773139899344,0.43471020716692715,0.9392265193370166,0:36:10,0:00:29
+4,0.19330878817686287,0.4555162174395349,0.9281767955801105,0:36:17,0:00:30
+5,0.09109889855869348,0.5060150003684702,0.9281767955801105,0:36:13,0:00:30
+6,0.05734757932275739,0.6043995772428771,0.9226519337016574,0:36:11,0:00:31
--- a/models/FakeClass/2023-08-15_12-03-05/2023-08-15_12-03-05.csv
+++ b/models/FakeClass/2023-08-15_12-03-05/2023-08-15_12-03-05.csv
@ -0,0 +1,7 @@
+epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
+1,0.21681843259712502,0.0005426188472483773,1.0,0:01:13,0:00:02
+2,0.00016121647037353423,0.0002873415878639207,1.0,0:01:12,0:00:02
+3,6.752021149355535e-05,0.00024319994372490328,1.0,0:01:12,0:00:02
+4,4.7950222591787355e-05,0.00022139604243420763,1.0,0:01:13,0:00:02
+5,3.99839740138679e-05,0.00021302999493855168,1.0,0:01:11,0:00:02
+6,3.5356899656214995e-05,0.00020912183117616223,1.0,0:01:13,0:00:02
--- a/preTestClassification.py
+++ b/preTestClassification.py
@ -0,0 +1,135 @@
+import pandas as pd
+from datetime import datetime
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
+from datasets import load_dataset
+from transformers.pipelines.pt_utils import KeyDataset
+
+#%%
+# prepare
+# install xformers (pip install xformers) for better performance
+###################
+# Setup directories
+# WD Michael
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# datafile input directory
+di = "data/IN/"
+
+# Tweet-datafile output directory
+ud = "data/OUT/"
+
+# Name of file that all senator data will be written to
+senCSV = "ALL-SENATORS-TWEETS.csv"
+
+# Name of new datafile generated
+senCSVc = "Tweets-Stub.csv"
+
+# Name of pretest files
+preTestIDsFake = "pretest-tweets_fake.txt"
+preTestIDsNot = "pretest-tweets_not_fake.txt"
+
+# Name of pretest datafile
+senCSVPretest = "Pretest.csv"
+senCSVPretestPrep = "Pretest-Prep.csv"
+senCSVPretestResult = "Pretest-Results.csv"
+
+# don't change this one
+senCSVPath = wd + ud + senCSV
+senCSVcPath = wd + ud + senCSVc
+senCSVcPretestPath = wd + ud + senCSVPretest
+senCSVcPretestPrepPath = wd + ud + senCSVPretestPrep
+senCSVcPretestResultPath = wd + ud + senCSVPretestResult
+preTestIDsFakePath = wd + di + preTestIDsFake
+preTestIDsNotPath = wd + di + preTestIDsNot
+
+import sys
+funs = wd+"funs"
+sys.path.insert(1, funs)
+import CleanTweets
+
+# List of IDs to select
+# Read the IDs from a file (one ID per line, surrounding whitespace removed)
+with open(preTestIDsFakePath, "r") as file:
+    preTestIDsFakeL = [line.strip() for line in file.readlines()]
+with open(preTestIDsNotPath, "r") as file:
+    preTestIDsNotL = [line.strip() for line in file.readlines()]
+
+# Select rows based on the IDs
+df = pd.read_csv(senCSVPath, dtype=(object))
+#%%
+# Create pretest dataframe
+dfPreTest = df[df['id'].isin(preTestIDsFakeL)].copy()
+dfPreTest['fake'] = True
+dfPreTest = pd.concat([dfPreTest, df[df['id'].isin(preTestIDsNotL)]], ignore_index=True)
+dfPreTest['fake'] = dfPreTest['fake'].fillna(False)
+
+#%%
+# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
+# HowTo:
+# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
+# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
+pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
+model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
+tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
+
+# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
+
+dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(CleanTweets.preprocess_text)
+
+#%%
+timeStart = datetime.now() # start counting execution time
+
+max_length = 128
+dfPreTest['input_ids'] = dfPreTest['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
+#train.rename(columns={'target': 'labels'}, inplace=True)
+#train.head()
+
+# %%
+dfPreTest.to_csv(senCSVcPretestPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
+
+
+#%%
+dataset = load_dataset("csv", data_files=senCSVcPretestPrepPath)
+
+# %%
+results = pipe(KeyDataset(dataset, "text"))
+# %%
+#from tqdm.auto import tqdm
+#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
+#    print(out)
+
+#%% 
+output_labels = []
+output_score = []
+for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
+    output_labels.append(out['label'])
+    output_score.append(out['score'])
+    # [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
+    # Exactly the same output as before, but the content are passed
+    # as batches to the model
+# %%
+dfPreTest['output_label'] = output_labels
+dfPreTest['output_score'] = output_score
+
+timeEnd = datetime.now()
+timeTotal = timeEnd - timeStart
+# divide by the number of tweets that were actually classified instead of a
+# hard-coded 96; max() guards against an empty pretest frame
+timePerTweet = timeTotal / max(len(dfPreTest), 1)
+
+print(f"Total classification execution time: {timeTotal} seconds")
+print(f"Time per tweet classification: {timePerTweet}")
+print(f"Estimated time for full classification of tweets: {timePerTweet*50183}")
+
+# %%
+dfPreTest.to_csv(senCSVcPretestResultPath, encoding='utf-8')
+
+# %%
--- a/profiler.py
+++ b/profiler.py
@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug  8 14:49:02 2023
+
+@author: michael
+"""
+
+import pandas as pd
+import pandas_profiling as pp
+import numpy
+  
+###################
+# Setup directories
+# WD Michael
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# datafile input directory
+di = "data/IN/"
+
+# Tweet-datafile output directory
+ud = "data/OUT/"
+
+# Name of file that all senator data will be written to
+senCSV = "ALL-SENATORS-TWEETS.csv"
+
+# Name of the senator dataset file (read from the input directory)
+senDataset = "senators-raw.csv"
+
+# Name of new datafile generated
+senCSVc = "SenatorsTweets-Final"
+senCSVcCov = "SenatorsTweets-OnlyCov"
+
+# don't change this one
+senCSVPath = wd + ud + senCSV
+senCSVcPath = wd + ud + senCSVc + ".csv"
+senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
+senSAVcPath = wd + ud + senCSV + ".sav"
+senDTAcPath = wd + ud + senCSV + ".dta"
+senDatasetPath = wd + di + senDataset
+  
+# forming dataframe and printing
+df = pd.read_csv(senCSVPath, dtype=(object))
+  
+# forming ProfileReport and save
+# as output.html file
+# The report paths are anchored at wd + ud like the input paths above, so the
+# script no longer depends on the directory it is started from.
+profileAll = pp.ProfileReport(df, minimal=True)
+profileAll.to_file(wd + ud + "profiles/AllTweets.html")
+
+# same report for the covid-only tweet file
+df = pd.read_csv(senCSVcCovPath, dtype=(object))
+
+profileAll = pp.ProfileReport(df, minimal=True)
+profileAll.to_file(wd + ud + "profiles/CovTweets.html")
--- a/repairmystupidity.py
+++ b/repairmystupidity.py
@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Aug 14 20:47:22 2023
+
+@author: michael
+"""
+import pandas as pd
+
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+
+# datafile input directory
+di = "data/IN/"
+
+# Tweet-datafile output directory
+ud = "data/OUT/"
+
+falsch = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct.csv"
+richtig = wd + ud + "SenatorsTweets-Training.csv"
+correct = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct2.csv"
+
+# Name of new datafile generated
+senCSVprep = "SenatorsTweets-Training_WORKING-COPY-prepared"
+
+# don't change this one
+# Load both files; the path variables are rebound to the loaded dataframes.
+# The working copy is ';'-separated, the training file uses the default ','.
+falsch = pd.read_csv(falsch, dtype=(object), sep=";")
+richtig = pd.read_csv(richtig, dtype=(object))
+
+# Left-join rawContent and date from the training file onto the working copy
+# via 'tid', then drop the working copy's own versions (suffix '_x') and
+# strip the '_y' suffix from the joined-in columns.
+df = pd.merge(falsch,richtig[['tid','rawContent', 'date']],on='tid', how='left')
+df.drop(columns=['rawContent_x', 'date_x'], inplace=True)
+# NOTE(review): the join key 'tid' should come out of the merge unsuffixed,
+# so the 'tid_y' entry presumably never matches - confirm.
+df.rename(columns={'tid_y':'tid', 'rawContent_y':'rawContent', 'date_y':'date'}, inplace=True)
+# fix the column order; the unnamed seventh column is kept as 'comment'
+df = df[['tid','date','topicCovid','fake','rawContent','Unnamed: 6']]
+df.rename(columns={'Unnamed: 6':'comment'}, inplace=True)
+
+# write the repaired working copy, ';'-separated like its input
+df.to_csv(correct, encoding='utf-8', sep=";")
--- a/trainFake.py
+++ b/trainFake.py
@ -0,0 +1,613 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Aug 12 12:25:18 2023
+
+@author: michael
+"""
+#from datasets import load_dataset
+#from transformers import Trainer
+#from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+import torch 
+import numpy as np
+from sklearn.model_selection import train_test_split # pip install scikit-learn
+
+import pandas as pd
+
+## Uses snippets from this guide:
+# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
+
+###################
+# Setup directories
+# WD Michael
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+import sys
+funs = wd+"funs"
+sys.path.insert(1, funs)
+import CleanTweets
+
+# datafile input directory
+di = "data/IN/"
+
+# Tweet-datafile output directory
+ud = "data/OUT/"
+
+# Training CSV dataset
+twtCSV = "SenatorsTweets-Training_WORKING-COPY-correct2"
+twtCSVtrainCovClass = "SenatorsTweets-train-CovClassification"
+twtCSVtrainFakeClass = "SenatorsTweets-train-FakeClassification"
+statsTrainingTopicClass = "statsTopicClassification-"
+
+# don't change this one
+twtCSVPath = wd + ud + twtCSV + ".csv"
+twtCSVtrainCovClassPath = wd + ud + twtCSVtrainCovClass + ".csv"
+twtCSVtrainFakeClassPath = wd + ud + twtCSVtrainFakeClass + ".csv"
+
+statsTrainingTopicClassPath = wd + ud + statsTrainingTopicClass
+
+twtCSVtrainCovClassPathTrain = wd + ud + twtCSVtrainCovClass + "TRAIN.csv"
+twtCSVtrainFakeClassPathTrain = wd + ud + twtCSVtrainFakeClass + "TRAIN.csv"
+twtTSVtrainCovClassPathTrain = wd + ud + "cov-train.tsv"
+twtTSVtrainFakeClassPathTrain = wd + ud + "fake-train.tsv"
+
+twtTSVtrainCovClassPathEval = wd + ud + "cov-eval.tsv" 
+twtTSVtrainFakeClassPathEval = wd + ud + "fake-eval.tsv"
+
+seed = 12355
+
+# Model paths
+modCovClassPath = wd + "models/CovClass/"
+modFakeClassPath = wd + "models/FakeClass/"
+
+model_name = 'digitalepidemiologylab/covid-twitter-bert-v2' # accuracy 69
+#model_name = 'justinqbui/bertweet-covid19-base-uncased-pretraining-covid-vaccine-tweets' #48
+#model_name = "cardiffnlp/tweet-topic-latest-multi"
+model_name = "bvrau/covid-twitter-bert-v2-struth"
+#model_name = "cardiffnlp/roberta-base-tweet-topic-single-all"
+model_fake_name = 'bvrau/covid-twitter-bert-v2-struth' 
+
+# More models for fake detection:
+# https://huggingface.co/justinqbui/bertweet-covid-vaccine-tweets-finetuned
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+max_length = 64 # max token sentence length
+
+#%%
+# Create training and testing dataset
+dfTest = pd.read_csv(twtCSVPath, dtype=(object), delimiter=";")
+
+#dfTest = dfTest[:-900] # remove last 800 rows
+#dfTest = dfTest.iloc[:,:-3] # remove last 800 rows
+
+dfTest['text'] = dfTest['rawContent'].apply(CleanTweets.preprocess_roberta)
+
+dfTest.drop(columns=['rawContent'], inplace=True)
+
+# Only keep tweets that are longer than 3 words
+dfTest['tweet_proc_length'] = [len(text.split(' ')) for text in dfTest['text']]
+dfTest['tweet_proc_length'].value_counts()
+dfTest = dfTest[dfTest['tweet_proc_length']>3]
+dfTest = dfTest.drop_duplicates(subset=['text'])
+dfTest = dfTest.drop(columns=['date', 'Unnamed: 0']) 
+
+# Create datasets for each classification
+dfCovClass = dfTest
+dfFakeClass = dfTest
+dfCovClass = dfCovClass.drop(columns=['fake']) # fake column not neeeded in covid topic classification data
+dfFakeClass = dfFakeClass[dfFakeClass['topicCovid']=='True'].drop(columns=['topicCovid']) # topicCovid column not neeeded in covid topic classification data
+
+#type_map = {'Covid tweet': 'covid tweets', 'Noncovid tweet': 'noncovid tweet'}
+dfCovClass.rename(index = str, columns={'topicCovid': 'labels', 'tid': 'id'}, inplace = True)
+dfCovClass.labels = dfCovClass.labels.replace({"True": 'Covid', "False": 'NonCovid'})
+
+#type_map = {'fake news tweet': 'fake news tweet', 'non-fake-news-tweet': 'non-fake-news-tweet'}
+dfFakeClass.rename(index = str, columns={'fake': 'labels', 'tid': 'id'}, inplace = True)
+
+#%%
+# Tokenize tweets
+dfCovClass = dfCovClass[dfCovClass['labels'].notna()]
+dfFakeClass['labels'].replace({'Check': '','check': '', 'FALSE':''}, inplace=True)
+dfFakeClass = dfFakeClass[dfFakeClass['labels'].notna()]
+dfCovClass['input_ids'] = dfCovClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
+dfFakeClass['input_ids'] = dfFakeClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
+
+def encode_labels(label):
+    """Map a textual class label to its integer code.
+
+    Args:
+        label: label value; 'Covid' / 'NonCovid' for the topic data,
+            'False' / 'True' for the fake-news data.
+
+    Returns:
+        int: 1 for 'Covid' and 'False', 0 for every other value
+            (including 'NonCovid', 'True' and unknown labels).
+    """
+    if label in ('Covid', 'False'):
+        return 1
+    return 0
+dfCovClass['labels_encoded'] = dfCovClass['labels'].apply(encode_labels)
+dfFakeClass['labels_encoded'] = dfFakeClass['labels'].apply(encode_labels)
+dfFakeClass = dfFakeClass[dfFakeClass['labels']!=""]
+#dfFakeClass = dfFakeClass[(dfFakeClass['labels']=="Fake") | (dfFakeClass['labels']=="True")]
+
+# get n of classes
+print("# of Non-Covid tweets (coded 0):")
+print(dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
+# 62 non-covid tweets, disproportionate sample for training has to be 124 tweets
+
+print("# of Fake-news tweets (coded 1):")
+print(dfFakeClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
+
+# create disproportionate sample - 50/50 of both
+#dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique()
+#dfCovClass = dfCovClass.groupby('labels_encoded', group_keys=False).apply(lambda x: x.sample(164, random_state=seed))
+# after a lot of tests, it seems that a sample in which non-fake news tweets are overrepresented leads to better results.
+# because of this, performance limitations and time constraints, group 1 (covid topic) will be overrepresented (twice as many), which still doesn't reflect the real proportions ~10/1
+
+'''dfCovClassa = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(1).sample(frac=1, replace=True).reset_index()
+dfCovClassb = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(0).sample(frac=1, replace=True).reset_index()
+dfCovClassab= pd.concat([dfCovClassa,dfCovClassb]) 
+dfCovClassab.reset_index(inplace=True)
+dfCovClass_train, dfCovClass_test = train_test_split(dfCovClassab, test_size=0.1, random_state=seed, stratify=dfCovClassab['labels_encoded'])
+'''
+
+# create training and validation samples
+dfFakeClass_train, dfFakeClass_test = train_test_split(dfFakeClass, test_size=0.1, random_state=seed, stratify=dfFakeClass['labels_encoded'])
+
+# reset index and drop unnecessary columns
+dfFakeClass_train.reset_index(drop=True, inplace=True)
+dfFakeClass_train.drop(inplace=True, columns=['tweet_proc_length'])
+dfFakeClass_train.groupby('labels_encoded', group_keys=False)['id'].nunique()
+
+dfFakeClass_test.reset_index(drop=True, inplace=True)
+dfFakeClass_test.drop(inplace=True, columns=['tweet_proc_length'])
+dfFakeClass_test.groupby('labels_encoded', group_keys=False)['id'].nunique()
+
+# save dfs as csvs and tsvs, for training and validation
+# covid classification datafiles
+# rows 0-41 = noncovid, 42-81 covid, therefore:
+#dfCovClass = dfCovClass.drop(columns=['tweet_proc_length'])
+#dfCovClass.reset_index(inplace=True, drop=True)
+#dfCovClass.loc[np.r_[0:31, 42:71], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPathTrain, encoding='utf-8', sep=";") 
+#dfCovClass.loc[np.r_[0:31, 42:72], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathTrain, encoding='utf-8', sep="\t")
+#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPath, encoding='utf-8', sep=";")
+#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathEval, encoding='utf-8', sep="\t")
+
+# fake news classification datafiles
+#dfFakeClass = dfFakeClass.drop(columns=['tweet_proc_length'])
+#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPathTrain, encoding='utf-8', sep=";")
+#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathTrain, encoding='utf-8', sep="\t")
+#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPath, encoding='utf-8', sep=";")
+#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathEval, encoding='utf-8', sep="\t")
+
+#%%
+# Prepare trainer
+#from transformers import TrainingArguments
+
+#training_args = TrainingArguments(
+#     report_to = 'wandb',
+#    output_dir=wd+'results',          # output directory/
+#   overwrite_output_dir = True,
+#    num_train_epochs=6,              # total number of training epochs
+#    per_device_train_batch_size=8,  # batch size per device during training
+#    per_device_eval_batch_size=16,   # batch size for evaluation
+#    learning_rate=2e-5,
+#    warmup_steps=1000,                # number of warmup steps for learning rate scheduler
+#    weight_decay=0.01,               # strength of weight decay
+#    logging_dir='./logs3',            # directory for storing logs
+#    logging_steps=1000,
+#    evaluation_strategy="epoch",
+#    save_strategy="epoch",
+#    load_best_model_at_end=True
+#)
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+from transformers import BertForSequenceClassification, AdamW#, BertConfig
+#from torch.utils.data import TensorDataset, random_split
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
+
+"""
+train_dataset = load_dataset('csv', data_files={'train': twtCSVtrainCovClassPathTrain}, encoding = "utf-8")
+train_dataset = train_dataset['train'] 
+eval_dataset = load_dataset('csv', data_files={'test': twtCSVtrainCovClassPath}, encoding = "utf-8")
+eval_dataset = eval_dataset['test'] 
+"""
+batch_size = 1
+
+from torch.utils.data import Dataset
+
+class PandasDataset(Dataset):
+    """Dataset that tokenizes one DataFrame row per lookup.
+
+    Items are built from the row's ``text`` and ``labels_encoded`` columns;
+    nothing is tokenized until ``__getitem__`` is called.
+    """
+
+    def __init__(self, dataframe, tokenizer, max_length):
+        # Only references are stored here.
+        self.dataframe, self.tokenizer, self.max_length = dataframe, tokenizer, max_length
+
+    def __len__(self):
+        # One item per DataFrame row.
+        return len(self.dataframe)
+
+    def __getitem__(self, index):
+        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for the row at position ``index``."""
+        record = self.dataframe.iloc[index]
+        tweet = record['text']
+        target = record['labels_encoded']
+
+        # Pad and truncate so every item has exactly max_length tokens.
+        tokens = self.tokenizer(
+            tweet,
+            max_length=self.max_length,
+            padding="max_length",
+            truncation=True,
+        )
+
+        item = {field: torch.tensor(tokens[field]) for field in ('input_ids', 'attention_mask')}
+        item['labels'] = torch.tensor(target)  # labels are expected to be encoded already
+        return item
+
+
+train_dataset = PandasDataset(dfFakeClass_train, tokenizer, max_length)
+train_dataloader = DataLoader(
+    train_dataset,
+    sampler=RandomSampler(train_dataset),
+    batch_size=batch_size
+)
+
+eval_dataset = PandasDataset(dfFakeClass_test, tokenizer, max_length)
+validation_dataloader = DataLoader(
+    eval_dataset,
+    sampler=SequentialSampler(eval_dataset),
+    batch_size=batch_size
+)
+
+for idx, batch in enumerate(train_dataloader):
+    print('Batch index: ', idx)
+    print('Batch size: ', batch['input_ids'].size())  # Access 'input_ids' field
+    print('Batch label: ', batch['labels'])           # Access 'labels' field
+    break
+
+model = BertForSequenceClassification.from_pretrained(
+    model_name,
+    num_labels = 2, # The number of output labels--2 for binary classification.
+                    # You can increase this for multi-class tasks.   
+    output_attentions = False, # Whether the model returns attentions weights.
+    output_hidden_states = False, # Whether the model returns all hidden-states.
+)
+
+#trainer = Trainer(
+#    model=model,                         # the instantiated 🤗 Transformers model to be trained
+#    args=training_args,                  # training arguments, defined above
+#    train_dataset=train_dataset,         # training dataset
+#    eval_dataset=eval_dataset             # evaluation dataset
+#)
+
+
+# AdamW is Adam with decoupled weight decay. The PyTorch implementation is
+# used here because the AdamW class shipped with transformers is deprecated.
+# weight_decay is set to 0.0 explicitly: torch.optim.AdamW defaults to 0.01,
+# whereas the transformers class defaulted to 0.0.
+optimizer = torch.optim.AdamW(model.parameters(),
+                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
+                  eps = 1e-8, # args.adam_epsilon  - default is 1e-8.
+                  weight_decay = 0.0
+                )
+
+from transformers import get_linear_schedule_with_warmup
+
+# Number of training epochs. The BERT authors recommend between 2 and 4. 
+# We chose to run for 6
+epochs = 6
+
+# Total number of training steps is [number of batches] x [number of epochs]. 
+# (Note that this is not the same as the number of training samples).
+total_steps = len(train_dataloader) * epochs
+
+# Create the learning rate scheduler.
+scheduler = get_linear_schedule_with_warmup(optimizer, 
+                                            num_warmup_steps = 0, # Default value in run_glue.py
+                                            num_training_steps = total_steps)
+
+# Function to calculate the accuracy of our predictions vs labels
+def flat_accuracy(preds, labels):
+    """Return the share of rows in ``preds`` whose arg-max equals the label.
+
+    The predicted class is the index of the largest value along axis 1 of
+    ``preds``; ``labels`` is flattened before the comparison.
+    """
+    predicted = np.argmax(preds, axis=1).flatten()
+    expected = labels.flatten()
+    hits = np.sum(predicted == expected)
+    return hits / len(expected)
+
+import time
+import datetime
+
+def format_time(elapsed):
+    """Render a duration in seconds as the string form of a ``timedelta``.
+
+    The value is rounded to whole seconds first, so the result looks like
+    ``h:mm:ss``.
+    """
+    whole_seconds = int(round(elapsed))
+    return str(datetime.timedelta(seconds=whole_seconds))
+
+import random
+
+# This training code is based on the `run_glue.py` script here:
+# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
+
+# Set the seed value all over the place to make this reproducible.
+# NOTE(review): the model and the dataloaders are created earlier in the
+# script, before these seeds are set - confirm whether seeding should move up.
+seed_val = 12355
+
+# Training is pinned to the CPU even when a GPU is detected.
+# Set this to False to train on a detected GPU.
+force_cpu = True
+
+if torch.cuda.is_available() and not force_cpu:
+    # Tell PyTorch to use the GPU.
+    device = torch.device("cuda")
+    print('There are %d GPU(s) available.' % torch.cuda.device_count())
+    print('We will use the GPU:', torch.cuda.get_device_name(0))
+elif torch.cuda.is_available():
+    # A GPU exists but is deliberately left unused; say so instead of
+    # announcing a GPU run that never happens.
+    device = torch.device("cpu")
+    print('There are %d GPU(s) available, but force_cpu is set; using the CPU instead.' % torch.cuda.device_count())
+else:
+    print('No GPU available, using the CPU instead.')
+    device = torch.device("cpu")
+
+random.seed(seed_val)
+np.random.seed(seed_val)
+torch.manual_seed(seed_val)
+torch.cuda.manual_seed_all(seed_val)
+
+#%%
+# Start training
+# We'll store a number of quantities such as training and validation loss, 
+# validation accuracy, and timings.
+training_stats = []
+
+# Measure the total training time for the whole run.
+total_t0 = time.time()
+
+# For each epoch...
+for epoch_i in range(0, epochs):
+    # ========================================
+    #               Training
+    # ========================================
+    # Perform one full pass over the training set.
+
+    print("")
+    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+    print('{:>5,} steps per batch will be calculated.'.format(len(train_dataloader)))
+    print('Training...')
+
+    # Measure how long the training epoch takes.
+    t0 = time.time()
+    model.to(device)
+    # Reset the total loss for this epoch.
+    total_train_loss = 0
+    # `train` only switches the *mode* (dropout and batchnorm layers behave
+    # differently during training vs. test); it does not perform the training.
+    model.train()
+
+    # For each batch of training data...
+    for step, batch in enumerate(train_dataloader):
+
+        # Progress update every 10 batches.
+        if step % 10 == 0 and not step == 0:
+            elapsed = format_time(time.time() - t0)
+            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
+
+        # The dataloader yields a dict of tensors keyed by 'input_ids',
+        # 'attention_mask' and 'labels'; move each one to the training device.
+        b_input_ids = batch['input_ids'].to(device)
+        b_input_mask = batch['attention_mask'].to(device)
+        b_labels = batch['labels'].to(device)
+
+        # Clear previously accumulated gradients before the backward pass;
+        # PyTorch does not do this automatically.
+        model.zero_grad()
+
+        # Forward pass. Because labels are provided, the first element of the
+        # output is the loss.
+        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
+        loss = output[0]
+
+        # Accumulate the training loss so the epoch average can be reported;
+        # `.item()` extracts the Python number from the single-value tensor.
+        total_train_loss += loss.item()
+
+        # Backward pass to calculate the gradients.
+        loss.backward()
+
+        # Clip the norm of the gradients to 1.0 to help prevent the
+        # "exploding gradients" problem.
+        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+
+        # Update the parameters, then the learning rate.
+        optimizer.step()
+        scheduler.step()
+
+    # Calculate the average loss over all of the batches.
+    avg_train_loss = total_train_loss / len(train_dataloader)
+
+    # Measure how long this epoch took.
+    training_time = format_time(time.time() - t0)
+
+    print("")
+    print("  Average training loss: {0:.2f}".format(avg_train_loss))
+    print("  Training epcoh took: {:}".format(training_time))
+
+    # ========================================
+    #               Validation
+    # ========================================
+    # After each training epoch, measure performance on the validation set.
+
+    print("")
+    print("Running Validation...")
+
+    t0 = time.time()
+
+    # Evaluation mode: the dropout layers behave differently than in training.
+    model.eval()
+
+    # Tracking variables
+    total_eval_accuracy = 0
+    total_eval_loss = 0
+
+    # Evaluate data for one epoch
+    for batch in validation_dataloader:
+        b_input_ids = batch['input_ids'].to(device)
+        b_input_mask = batch['attention_mask'].to(device)
+        b_labels = batch['labels'].to(device)
+
+        # No compute graph is needed here, since nothing is back-propagated.
+        with torch.no_grad():
+            # The second output element holds the logits, i.e. the model
+            # outputs prior to applying an activation such as softmax.
+            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
+            loss = output[0]
+            logits = output[1]
+
+        # Accumulate the validation loss.
+        total_eval_loss += loss.item()
+
+        # Move logits and labels to CPU
+        logits = logits.detach().cpu().numpy()
+        label_ids = b_labels.to('cpu').numpy()
+
+        # Accumulate the per-batch accuracy; it is averaged over batches below.
+        total_eval_accuracy += flat_accuracy(logits, label_ids)
+
+    # Report the final accuracy for this validation run.
+    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
+    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
+
+    # Calculate the average loss over all of the batches.
+    avg_val_loss = total_eval_loss / len(validation_dataloader)
+
+    # Measure how long the validation run took.
+    validation_time = format_time(time.time() - t0)
+
+    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
+    print("  Validation took: {:}".format(validation_time))
+
+    # Record all statistics from this epoch.
+    training_stats.append(
+        {
+            'epoch': epoch_i + 1,
+            'Training Loss': avg_train_loss,
+            'Valid. Loss': avg_val_loss,
+            'Valid. Accur.': avg_val_accuracy,
+            'Training Time': training_time,
+            'Validation Time': validation_time
+        }
+    )
+
+print("")
+print("Training complete!")
+
+print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
+
+params = list(model.named_parameters())
+
+print('The BERT model has {:} different named parameters.\n'.format(len(params)))
+
+print('==== Embedding Layer ====\n')
+
+for p in params[0:5]:
+    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+print('\n==== First Transformer ====\n')
+
+for p in params[5:21]:
+    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+print('\n==== Output Layer ====\n')
+
+for p in params[-4:]:
+    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+
+import os
+
+# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+from datetime import datetime as dt
+
+fTimeFormat = "%Y-%m-%d_%H-%M-%S"
+now = dt.now().strftime(fTimeFormat)
+
+output_dir = modFakeClassPath + now + "/"
+
+# Create output directory if needed; exist_ok avoids the separate existence
+# check and does not fail when the directory is already there.
+os.makedirs(output_dir, exist_ok=True)
+
+print("Saving model to %s" % output_dir)
+
+# Save a trained model, configuration and tokenizer using `save_pretrained()`.
+# They can then be reloaded using `from_pretrained()`
+model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+model_to_save.save_pretrained(output_dir)
+tokenizer.save_pretrained(output_dir)
+
+# Good practice: save your training arguments together with the trained model
+# torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+
+import pandas as pd
+
+# Display floats with two decimal places.
+pd.set_option('display.precision', 2)
+
+# Create a DataFrame from our training statistics.
+df_stats = pd.DataFrame(data=training_stats)
+
+# Use the 'epoch' as the row index.
+df_stats = df_stats.set_index('epoch')
+
+# A hack to force the column headers to wrap.
+#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])
+
+
+# Display the table.
+df_stats
+df_stats.to_csv(output_dir + now + ".csv")
--- a/trainTopic.py
+++ b/trainTopic.py
@ -0,0 +1,607 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Aug 12 12:25:18 2023
+
+@author: michael
+"""
+#from datasets import load_dataset
+#from transformers import Trainer
+#from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+import torch 
+import numpy as np
+from sklearn.model_selection import train_test_split # pip install scikit-learn
+
+import pandas as pd
+
+## Uses snippets from this guide:
+# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
+
+###################
+# Setup directories
+# WD Michael
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+import sys
+funs = wd+"funs"
+sys.path.insert(1, funs)
+import CleanTweets
+
+# datafile input directory
+di = "data/IN/"
+
+# Tweet-datafile output directory
+ud = "data/OUT/"
+
+# Training CSV dataset
+twtCSV = "SenatorsTweets-Training_WORKING-COPY-correct2"
+twtCSVtrainCovClass = "SenatorsTweets-train-CovClassification"
+twtCSVtrainFakeClass = "SenatorsTweets-train-FakeClassification"
+statsTrainingTopicClass = "statsTopicClassification-"
+
+# don't change this one
+twtCSVPath = wd + ud + twtCSV + ".csv"
+twtCSVtrainCovClassPath = wd + ud + twtCSVtrainCovClass + ".csv"
+twtCSVtrainFakeClassPath = wd + ud + twtCSVtrainFakeClass + ".csv"
+
+statsTrainingTopicClassPath = wd + ud + statsTrainingTopicClass
+
+twtCSVtrainCovClassPathTrain = wd + ud + twtCSVtrainCovClass + "TRAIN.csv"
+twtCSVtrainFakeClassPathTrain = wd + ud + twtCSVtrainFakeClass + "TRAIN.csv"
+twtTSVtrainCovClassPathTrain = wd + ud + "cov-train.tsv"
+twtTSVtrainFakeClassPathTrain = wd + ud + "fake-train.tsv"
+
+twtTSVtrainCovClassPathEval = wd + ud + "cov-eval.tsv" 
+twtTSVtrainFakeClassPathEval = wd + ud + "fake-eval.tsv"
+
+seed = 12355
+
+# Model paths
+modCovClassPath = wd + "models/CovClass/"
+modFakeClassPath = wd + "models/FakeClass/"
+
+model_name = "bvrau/covid-twitter-bert-v2-struth"
+model_fake_name = 'bvrau/covid-twitter-bert-v2-struth' 
+
+# More models for fake detection:
+# https://huggingface.co/justinqbui/bertweet-covid-vaccine-tweets-finetuned
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+max_length = 64 # max token sentence length
+
+#%%
+# Create training and testing dataset
+dfTest = pd.read_csv(twtCSVPath, dtype=(object), delimiter=";")
+
+#dfTest = dfTest[:-900] # remove last 900 rows
+#dfTest = dfTest.iloc[:,:-3] # remove last 3 columns
+
+dfTest['text'] = dfTest['rawContent'].apply(CleanTweets.preprocess_roberta)
+
+dfTest.drop(columns=['rawContent'], inplace=True)
+
+# Only keep tweets that are longer than 3 words
+dfTest['tweet_proc_length'] = [len(text.split(' ')) for text in dfTest['text']]
+dfTest['tweet_proc_length'].value_counts()
+dfTest = dfTest[dfTest['tweet_proc_length']>3]
+dfTest = dfTest.drop_duplicates(subset=['text'])
+dfTest = dfTest.drop(columns=['date', 'Unnamed: 0']) 
+
+# Create datasets for each classification
+# Both frames start from the same cleaned tweet table; drop() and the boolean
+# filter below each return a new frame.
+dfCovClass = dfTest
+dfFakeClass = dfTest
+dfCovClass = dfCovClass.drop(columns=['fake']) # fake column not needed in covid topic classification data
+dfFakeClass = dfFakeClass[dfFakeClass['topicCovid']=='True'].drop(columns=['topicCovid']) # keep only covid-topic tweets; topicCovid column not needed in fake news classification data
+
+#type_map = {'Covid tweet': 'covid tweets', 'Noncovid tweet': 'noncovid tweet'}
+# Use the topic flag as the label column and 'tid' as 'id'; index labels become strings.
+dfCovClass.rename(index = str, columns={'topicCovid': 'labels', 'tid': 'id'}, inplace = True)
+dfCovClass.labels = dfCovClass.labels.replace({"True": 'Covid', "False": 'NonCovid'})
+
+#type_map = {'fake news tweet': 'fake news tweet', 'non-fake-news-tweet': 'non-fake-news-tweet'}
+# Use the fake flag as the label column and 'tid' as 'id'; index labels become strings.
+dfFakeClass.rename(index = str, columns={'fake': 'labels', 'tid': 'id'}, inplace = True)
+# A raw "True" in the fake column becomes the label 'Fake'; a raw "False" becomes 'True'.
+dfFakeClass.labels = dfFakeClass.labels.replace({"True": 'Fake', "False": 'True'})
+
+#%%
+# Tokenize tweets
+# Keep only rows that carry a label; .copy() makes the filtered frames
+# independent so the column assignments below do not write into a slice.
+dfCovClass = dfCovClass[dfCovClass['labels'].notna()].copy()
+dfFakeClass = dfFakeClass[dfFakeClass['labels'].notna()].copy()
+# truncation=True caps every sequence at max_length, matching what
+# PandasDataset does when it tokenizes the same texts for training.
+dfCovClass['input_ids'] = dfCovClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length", truncation=True)['input_ids'])
+dfFakeClass['input_ids'] = dfFakeClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length", truncation=True)['input_ids'])
+
+def encode_labels(label):
+    """Map a textual class label to its integer code.
+
+    'Covid' and 'Fake' are coded 1; every other value (including
+    'NonCovid', 'True' and unrecognised labels) is coded 0.
+    """
+    coded_as_one = ('Covid', 'Fake')
+    return 1 if label in coded_as_one else 0
+dfCovClass['labels_encoded'] = dfCovClass['labels'].apply(encode_labels)
+dfFakeClass['labels_encoded'] = dfFakeClass['labels'].apply(encode_labels)
+
+# get n of classes
+print("# of Non-Covid tweets (coded 0):")
+print(dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
+# 62 non-covid tweets, disproportionate sample for training has to be 124 tweets
+
+print("# of Fake-news tweets (coded 1):")
+print(dfFakeClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
+
+# create disproportionate sample - 50/50 of both
+#dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique()
+#dfCovClass = dfCovClass.groupby('labels_encoded', group_keys=False).apply(lambda x: x.sample(164, random_state=seed))
+# After a lot of tests, it seems that a sample in which non-fake news tweets are overrepresented leads to better results.
+# Because of this, and because of performance limitations and time constraints, group 1 (covid topic) will be overrepresented (twice as many), which still doesn't reflect the real proportions of ~10/1.
+
+'''dfCovClassa = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(1).sample(frac=1, replace=True).reset_index()
+dfCovClassb = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(0).sample(frac=1, replace=True).reset_index()
+dfCovClassab= pd.concat([dfCovClassa,dfCovClassb]) 
+dfCovClassab.reset_index(inplace=True)
+dfCovClass_train, dfCovClass_test = train_test_split(dfCovClassab, test_size=0.1, random_state=seed, stratify=dfCovClassab['labels_encoded'])
+'''
+
+# create training and validation samples
+dfCovClass_train, dfCovClass_test = train_test_split(dfCovClass, test_size=0.1, random_state=seed, stratify=dfCovClass['labels_encoded'])
+
+# reset index and drop unnecessary columns
+dfCovClass_train.reset_index(drop=True, inplace=True)
+dfCovClass_train.drop(inplace=True, columns=['tweet_proc_length'])
+dfCovClass_train.groupby('labels_encoded', group_keys=False)['id'].nunique()
+
+dfCovClass_test.reset_index(drop=True, inplace=True)
+dfCovClass_test.drop(inplace=True, columns=['tweet_proc_length'])
+dfCovClass_test.groupby('labels_encoded', group_keys=False)['id'].nunique()
+
+# save dfs as csvs and tsvs, for training and validation
+# covid classification datafiles
+# rows 0-41 = noncovid, 42-81 covid, therefore:
+#dfCovClass = dfCovClass.drop(columns=['tweet_proc_length'])
+#dfCovClass.reset_index(inplace=True, drop=True)
+#dfCovClass.loc[np.r_[0:31, 42:71], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPathTrain, encoding='utf-8', sep=";") 
+#dfCovClass.loc[np.r_[0:31, 42:72], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathTrain, encoding='utf-8', sep="\t")
+#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPath, encoding='utf-8', sep=";")
+#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathEval, encoding='utf-8', sep="\t")
+
+# fake news classification datafiles
+#dfFakeClass = dfFakeClass.drop(columns=['tweet_proc_length'])
+#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPathTrain, encoding='utf-8', sep=";")
+#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathTrain, encoding='utf-8', sep="\t")
+#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPath, encoding='utf-8', sep=";")
+#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathEval, encoding='utf-8', sep="\t")
+
+#%%
+# Prepare trainer
+#from transformers import TrainingArguments
+
+#training_args = TrainingArguments(
+#     report_to = 'wandb',
+#    output_dir=wd+'results',          # output directory/
+#   overwrite_output_dir = True,
+#    num_train_epochs=6,              # total number of training epochs
+#    per_device_train_batch_size=8,  # batch size per device during training
+#    per_device_eval_batch_size=16,   # batch size for evaluation
+#    learning_rate=2e-5,
+#    warmup_steps=1000,                # number of warmup steps for learning rate scheduler
+#    weight_decay=0.01,               # strength of weight decay
+#    logging_dir='./logs3',            # directory for storing logs
+#    logging_steps=1000,
+#    evaluation_strategy="epoch",
+#    save_strategy="epoch",
+#    load_best_model_at_end=True
+#)
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+from transformers import BertForSequenceClassification, AdamW#, BertConfig
+#from torch.utils.data import TensorDataset, random_split
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
+
+"""
+train_dataset = load_dataset('csv', data_files={'train': twtCSVtrainCovClassPathTrain}, encoding = "utf-8")
+train_dataset = train_dataset['train'] 
+eval_dataset = load_dataset('csv', data_files={'test': twtCSVtrainCovClassPath}, encoding = "utf-8")
+eval_dataset = eval_dataset['test'] 
+"""
+batch_size = 1
+
+from torch.utils.data import Dataset
+
+class PandasDataset(Dataset):
+    """Dataset that tokenizes one DataFrame row per lookup.
+
+    Items are built from the row's ``text`` and ``labels_encoded`` columns;
+    nothing is tokenized until ``__getitem__`` is called.
+    """
+
+    def __init__(self, dataframe, tokenizer, max_length):
+        # Only references are stored here.
+        self.dataframe, self.tokenizer, self.max_length = dataframe, tokenizer, max_length
+
+    def __len__(self):
+        # One item per DataFrame row.
+        return len(self.dataframe)
+
+    def __getitem__(self, index):
+        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for the row at position ``index``."""
+        record = self.dataframe.iloc[index]
+        tweet = record['text']
+        target = record['labels_encoded']
+
+        # Pad and truncate so every item has exactly max_length tokens.
+        tokens = self.tokenizer(
+            tweet,
+            max_length=self.max_length,
+            padding="max_length",
+            truncation=True,
+        )
+
+        item = {field: torch.tensor(tokens[field]) for field in ('input_ids', 'attention_mask')}
+        item['labels'] = torch.tensor(target)  # labels are expected to be encoded already
+        return item
+
+
+train_dataset = PandasDataset(dfCovClass_train, tokenizer, max_length)
+train_dataloader = DataLoader(
+    train_dataset,
+    sampler=RandomSampler(train_dataset),
+    batch_size=batch_size
+)
+
+eval_dataset = PandasDataset(dfCovClass_test, tokenizer, max_length)
+validation_dataloader = DataLoader(
+    eval_dataset,
+    sampler=SequentialSampler(eval_dataset),
+    batch_size=batch_size
+)
+
+for idx, batch in enumerate(train_dataloader):
+    print('Batch index: ', idx)
+    print('Batch size: ', batch['input_ids'].size())  # Access 'input_ids' field
+    print('Batch label: ', batch['labels'])           # Access 'labels' field
+    break
+
+model = BertForSequenceClassification.from_pretrained(
+    model_name,
+    num_labels = 2, # The number of output labels--2 for binary classification.
+                    # You can increase this for multi-class tasks.   
+    output_attentions = False, # Whether the model returns attentions weights.
+    output_hidden_states = False, # Whether the model returns all hidden-states.
+)
+
+#trainer = Trainer(
+#    model=model,                         # the instantiated 🤗 Transformers model to be trained
+#    args=training_args,                  # training arguments, defined above
+#    train_dataset=train_dataset,         # training dataset
+#    eval_dataset=eval_dataset             # evaluation dataset
+#)
+
+
+# AdamW is Adam with decoupled weight decay. The PyTorch implementation is
+# used here because the AdamW class shipped with transformers is deprecated.
+# weight_decay is set to 0.0 explicitly: torch.optim.AdamW defaults to 0.01,
+# whereas the transformers class defaulted to 0.0.
+optimizer = torch.optim.AdamW(model.parameters(),
+                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
+                  eps = 1e-8, # args.adam_epsilon  - default is 1e-8.
+                  weight_decay = 0.0
+                )
+
+from transformers import get_linear_schedule_with_warmup
+
+# Number of training epochs. The BERT authors recommend between 2 and 4. 
+# We chose to run for 6
+epochs = 6
+
+# Total number of training steps is [number of batches] x [number of epochs]. 
+# (Note that this is not the same as the number of training samples).
+total_steps = len(train_dataloader) * epochs
+
+# Create the learning rate scheduler.
+scheduler = get_linear_schedule_with_warmup(optimizer, 
+                                            num_warmup_steps = 0, # Default value in run_glue.py
+                                            num_training_steps = total_steps)
+
+# Function to calculate the accuracy of our predictions vs labels
+def flat_accuracy(preds, labels):
+    """Return the share of rows in ``preds`` whose arg-max equals the label.
+
+    The predicted class is the index of the largest value along axis 1 of
+    ``preds``; ``labels`` is flattened before the comparison.
+    """
+    predicted = np.argmax(preds, axis=1).flatten()
+    expected = labels.flatten()
+    hits = np.sum(predicted == expected)
+    return hits / len(expected)
+
+import time
+import datetime
+
+def format_time(elapsed):
+    """Render a duration in seconds as the string form of a ``timedelta``.
+
+    The value is rounded to whole seconds first, so the result looks like
+    ``h:mm:ss``.
+    """
+    whole_seconds = int(round(elapsed))
+    return str(datetime.timedelta(seconds=whole_seconds))
+
+import random
+
+# This training code is based on the `run_glue.py` script here:
+# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
+
+# Seed used for every random number generator seeded at the end of this
+# section, to make runs repeatable.
+seed_val = 12355
+
+# Report which hardware is available. If there's a GPU available...
+if torch.cuda.is_available():    
+
+    # Tell PyTorch to use the GPU.
+    device = torch.device("cuda")
+
+    print('There are %d GPU(s) available.' % torch.cuda.device_count())
+
+    print('We will use the GPU:', torch.cuda.get_device_name(0))
+    #model.cuda()
+# If not...
+else:
+    print('No GPU available, using the CPU instead.')
+    device = torch.device("cpu")
+
+# NOTE(review): this assignment unconditionally overrides the device chosen
+# above, so everything that uses `device` runs on the CPU even when the GPU
+# branch just printed "We will use the GPU". Confirm whether this is an
+# intentional override (e.g. a memory workaround) or a debugging leftover.
+device = torch.device("cpu")
+
+# Seed Python's, NumPy's and PyTorch's (CPU and all CUDA devices) generators.
+# NOTE(review): `model` already exists at this point (it is used further up
+# to build the optimizer), so randomness consumed before these calls is not
+# covered by the seed.
+random.seed(seed_val)
+np.random.seed(seed_val)
+torch.manual_seed(seed_val)
+torch.cuda.manual_seed_all(seed_val)
+
+#%%
+# Start training
+# Fine-tune `model` for `epochs` passes over `train_dataloader`, evaluating on
+# `validation_dataloader` after every pass. One dict per epoch (training and
+# validation loss, validation accuracy, and timings) is appended to
+# `training_stats`.
+training_stats = []
+
+# Measure the total training time for the whole run.
+total_t0 = time.time()
+
+# Moving the model to the target device does not depend on the epoch, so do
+# it once up front instead of at the start of every epoch.
+model.to(device)
+
+# For each epoch...
+for epoch_i in range(epochs):
+    # ========================================
+    #               Training
+    # ========================================
+
+    # Perform one full pass over the training set.
+
+    print("")
+    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+    # len(train_dataloader) is the number of batches in one epoch.
+    print('{:>5,} batches per epoch will be processed.'.format(len(train_dataloader)))
+    print('Training...')
+
+    # Measure how long the training epoch takes.
+    t0 = time.time()
+    # Reset the total loss for this epoch.
+    total_train_loss = 0
+    # Put the model into training mode. Don't be misled--the call to
+    # `train` just changes the *mode*, it doesn't *perform* the training.
+    # `dropout` and `batchnorm` layers behave differently during training
+    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
+    model.train()
+
+    # For each batch of training data...
+    for step, batch in enumerate(train_dataloader):
+
+        # Progress update every 10 batches (skipping the very first one).
+        if step % 10 == 0 and step != 0:
+            # Elapsed time since the start of this epoch, formatted by
+            # format_time().
+            elapsed = format_time(time.time() - t0)
+
+            # Report progress.
+            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
+
+        # Unpack this training batch from our dataloader.
+        #
+        # `batch` is indexed by name and provides the entries
+        #   'input_ids', 'attention_mask' and 'labels'.
+        # Each one is copied to `device` using the `to` method.
+        b_input_ids = batch['input_ids'].to(device)
+        b_input_mask = batch['attention_mask'].to(device)
+        b_labels = batch['labels'].to(device)
+
+        # Always clear any previously calculated gradients before performing a
+        # backward pass. PyTorch doesn't do this automatically because
+        # accumulating the gradients is "convenient while training RNNs".
+        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
+        model.zero_grad()
+
+        # Perform a forward pass (evaluate the model on this training batch).
+        # The documentation for this `model` function is here:
+        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+        # It returns different numbers of values depending on what arguments
+        # are given and what flags are set. For our usage here, it returns
+        # the loss (because we provided labels) and the "logits"--the model
+        # outputs prior to activation.
+        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
+        loss = output[0]
+        logits = output[1]
+
+        # Accumulate the training loss over all of the batches so that we can
+        # calculate the average loss at the end. `loss` is a Tensor containing a
+        # single value; the `.item()` function just returns the Python value
+        # from the tensor.
+        total_train_loss += loss.item()
+
+        # Perform a backward pass to calculate the gradients.
+        loss.backward()
+
+        # Clip the norm of the gradients to 1.0.
+        # This is to help prevent the "exploding gradients" problem.
+        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+
+        # Update parameters and take a step using the computed gradient.
+        # The optimizer dictates the "update rule"--how the parameters are
+        # modified based on their gradients, the learning rate, etc.
+        optimizer.step()
+
+        # Update the learning rate.
+        scheduler.step()
+
+    # Calculate the average loss over all of the batches.
+    avg_train_loss = total_train_loss / len(train_dataloader)
+
+    # Measure how long this epoch took.
+    training_time = format_time(time.time() - t0)
+
+    print("")
+    print("  Average training loss: {0:.2f}".format(avg_train_loss))
+    print("  Training epoch took: {:}".format(training_time))
+
+    # ========================================
+    #               Validation
+    # ========================================
+    # After the completion of each training epoch, measure our performance on
+    # our validation set.
+
+    print("")
+    print("Running Validation...")
+
+    t0 = time.time()
+
+    # Put the model in evaluation mode--the dropout layers behave differently
+    # during evaluation.
+    model.eval()
+
+    # Tracking variables
+    total_eval_accuracy = 0
+    total_eval_loss = 0
+
+    # Evaluate data for one epoch
+    for batch in validation_dataloader:
+
+        # Unpack this validation batch from our dataloader; same named
+        # entries as in training, each copied to `device`.
+        b_input_ids = batch['input_ids'].to(device)
+        b_input_mask = batch['attention_mask'].to(device)
+        b_labels = batch['labels'].to(device)
+
+        # Tell pytorch not to bother with constructing the compute graph during
+        # the forward pass, since this is only needed for backprop (training).
+        with torch.no_grad():
+
+            # Forward pass, calculate logit predictions.
+            # token_type_ids is the same as the "segment ids", which
+            # differentiates sentence 1 and 2 in 2-sentence tasks.
+            # The documentation for this `model` function is here:
+            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+            # Get the "logits" output by the model. The "logits" are the output
+            # values prior to applying an activation function like the softmax.
+            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
+            loss = output[0]
+            logits = output[1]
+
+        # Accumulate the validation loss.
+        total_eval_loss += loss.item()
+
+        # Move logits and labels to CPU
+        logits = logits.detach().cpu().numpy()
+        label_ids = b_labels.to('cpu').numpy()
+
+        # Calculate the accuracy for this batch of validation sentences, and
+        # accumulate it over all batches.
+        total_eval_accuracy += flat_accuracy(logits, label_ids)
+
+    # Report the final accuracy for this validation run. This is the mean of
+    # the per-batch accuracies, so every batch carries the same weight.
+    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
+    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
+
+    # Calculate the average loss over all of the batches.
+    avg_val_loss = total_eval_loss / len(validation_dataloader)
+
+    # Measure how long the validation run took.
+    validation_time = format_time(time.time() - t0)
+
+    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
+    print("  Validation took: {:}".format(validation_time))
+
+    # Record all statistics from this epoch.
+    training_stats.append(
+        {
+            'epoch': epoch_i + 1,
+            'Training Loss': avg_train_loss,
+            'Valid. Loss': avg_val_loss,
+            'Valid. Accur.': avg_val_accuracy,
+            'Training Time': training_time,
+            'Validation Time': validation_time
+        }
+    )
+
+print("")
+print("Training complete!")
+
+print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
+
+# Print a short overview of the model's named parameters and their shapes.
+params = list(model.named_parameters())
+
+print('The BERT model has {:} different named parameters.\n'.format(len(params)))
+
+# Each entry pairs a heading with the slice of `params` listed under it.
+# NOTE(review): the slice boundaries assume the parameter ordering of a BERT
+# model -- confirm if a different architecture is loaded.
+param_sections = (
+    ('==== Embedding Layer ====\n', params[0:5]),
+    ('\n==== First Transformer ====\n', params[5:21]),
+    ('\n==== Output Layer ====\n', params[-4:]),
+)
+
+for heading, section in param_sections:
+    print(heading)
+    for p in section:
+        # p is a (name, parameter) pair; show the name and the tensor shape.
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+
+
+import os
+
+# Save the fine-tuned model and tokenizer into a fresh, timestamped directory.
+# Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
+from datetime import datetime as dt
+
+# Timestamp format used for the run directory (and reused for file names).
+fTimeFormat = "%Y-%m-%d_%H-%M-%S"
+now = dt.now().strftime(fTimeFormat)
+
+# One sub-directory per run, named after the time the run was saved.
+output_dir = modCovClassPath + now + "/"
+
+# Create the output directory if needed. `exist_ok=True` avoids the
+# check-then-create race of testing os.path.exists() first.
+os.makedirs(output_dir, exist_ok=True)
+
+print("Saving model to %s" % output_dir)
+
+# Save a trained model, configuration and tokenizer using `save_pretrained()`.
+# They can then be reloaded using `from_pretrained()`
+# If `model` is a wrapper exposing the real model as `.module` (as used for
+# distributed/parallel training), save the wrapped model instead.
+model_to_save = model.module if hasattr(model, 'module') else model
+model_to_save.save_pretrained(output_dir)
+tokenizer.save_pretrained(output_dir)
+
+# Good practice: save your training arguments together with the trained model
+# torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+
+import pandas as pd
+
+# Turn the per-epoch statistics collected during training into a table and
+# store it as CSV inside the run's output directory.
+
+# Display floats with two decimal places.
+pd.set_option('display.precision', 2)
+
+# Create a DataFrame from our training statistics (one row per epoch).
+df_stats = pd.DataFrame(data=training_stats)
+
+# Use the 'epoch' as the row index.
+df_stats = df_stats.set_index('epoch')
+
+# A hack to force the column headers to wrap.
+#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])
+
+
+# Display the table. A bare expression only shows output when the cell is
+# run interactively; it has no visible effect when run as a plain script.
+df_stats
+# Write the table to "<output_dir><now>.csv".
+df_stats.to_csv(output_dir + now + ".csv")
Author	SHA1	Message	Date
Michael Beck	89b4755c65	adds link to full package to readme	2023-08-31 01:23:38 +02:00
Michael Beck	01e58b1b99	adds html files to gitignore	2023-08-31 01:21:31 +02:00
Michael Beck	d0fcefedf4	data/OUT/profiles/CovTweets.html gelöscht	2023-08-31 01:20:39 +02:00
Michael Beck	71cf907249	data/OUT/profiles/AllTweets.html gelöscht	2023-08-31 01:20:31 +02:00
Michael Beck	a9018fedee	REALLY corrects the filetree	2023-08-30 21:54:13 +02:00
Michael Beck	d94a93295f	corrects filetree	2023-08-30 21:53:05 +02:00
Michael Beck	80b63b39df	adds readme	2023-08-30 21:45:38 +02:00
Michael Beck	d8136909c8	corrects import of own functions that didn't work anymore because of a newer python version.	2023-08-30 21:45:27 +02:00
Michael Beck	1c6d9d5415	cleans and renames files	2023-08-30 21:18:55 +02:00
Michael Beck	4e08cde317	finishes classification scripts	2023-08-16 10:06:16 +02:00
Michael Beck	2535683cdc	finishes classification scripts	2023-08-15 14:51:28 +02:00
Michael Beck	8f744a08be	adds final counter keywords	2023-08-15 14:30:40 +02:00
Michael Beck	df5fd51a5f	repairs stupid	2023-08-15 14:30:13 +02:00
Michael Beck	3d4f559d2d	adds model training stats	2023-08-15 14:29:42 +02:00
Michael Beck	2e067b6a64	adds both classification scripts. Corrects inclusion of CleanTweets functions.	2023-08-15 14:23:56 +02:00
Michael Beck	7a16526a97	adds dataset profiles	2023-08-15 14:20:13 +02:00
Michael Beck	b89b5969ec	adds typerror controls	2023-08-15 14:19:33 +02:00
Michael Beck	7c6b618272	adds both training scripts and evaluation files of topic classification	2023-08-15 14:19:08 +02:00
Michael Beck	90aa58239c	adds generation of model-training dataset	2023-08-14 15:37:30 +02:00
Michael Beck	1beff96ae9	adds model training code	2023-08-14 15:37:05 +02:00
Michael Beck	881d3d6d6d	adds tweet-text-cleaning functions	2023-08-14 15:36:46 +02:00
Michael Beck	5a63c478e9	adds dataset profiler	2023-08-08 15:32:12 +02:00
Michael Beck	ed61d52182	adds files to gitignore	2023-08-08 00:07:42 +02:00
Michael Beck	a26d150060	renames pretest classification file	2023-08-08 00:06:18 +02:00
Michael Beck	d791e4a293	adds classification file. adds removal of empty tweets after transormation for classification preparation	2023-08-08 00:04:14 +02:00
Michael Beck	d57b7a31b7	adds more counter keywords	2023-08-08 00:03:30 +02:00
Michael Beck	13d80124d3	adds lines with counterKeywords to remove non-covid tweets	2023-08-07 23:45:11 +02:00
Michael Beck	3de6d8f3ec	adds tweetLen column, converts keywords to lowercase and removes certain keywords	2023-08-07 23:07:29 +02:00
Michael Beck	899a99ba72	adds CleanTweets functions, creates Graphs	2023-07-07 18:18:51 +02:00
Michael Beck	817ec48478	corrects a lot of mistakes. adds keywords adds analyze.py adds pretest adds pretest ids	2023-07-07 00:16:44 +02:00
Michael Beck	c64904a64d	adds cleanTweets.py	2023-06-26 23:51:32 +02:00
Michael Beck	82830f13e2	„README.md“ ändern	2023-06-26 13:12:16 +02:00
Michael Beck	8c8a191952	„README.md“ hinzufügen	2023-06-26 13:12:04 +02:00
Michael Beck	71e10a62d3	adds senator data scraper	2023-06-23 23:53:31 +02:00
Michael Beck	90d5501ec8	adds comment	2023-06-23 23:53:01 +02:00