Compare commits
35 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
89b4755c65 | ||
![]() |
01e58b1b99 | ||
![]() |
d0fcefedf4 | ||
![]() |
71cf907249 | ||
![]() |
a9018fedee | ||
![]() |
d94a93295f | ||
![]() |
80b63b39df | ||
![]() |
d8136909c8 | ||
![]() |
1c6d9d5415 | ||
![]() |
4e08cde317 | ||
![]() |
2535683cdc | ||
![]() |
8f744a08be | ||
![]() |
df5fd51a5f | ||
![]() |
3d4f559d2d | ||
![]() |
2e067b6a64 | ||
![]() |
7a16526a97 | ||
![]() |
b89b5969ec | ||
![]() |
7c6b618272 | ||
![]() |
90aa58239c | ||
![]() |
1beff96ae9 | ||
![]() |
881d3d6d6d | ||
![]() |
5a63c478e9 | ||
![]() |
ed61d52182 | ||
![]() |
a26d150060 | ||
![]() |
d791e4a293 | ||
![]() |
d57b7a31b7 | ||
![]() |
13d80124d3 | ||
![]() |
3de6d8f3ec | ||
![]() |
899a99ba72 | ||
![]() |
817ec48478 | ||
![]() |
c64904a64d | ||
![]() |
82830f13e2 | ||
![]() |
8c8a191952 | ||
![]() |
71e10a62d3 | ||
![]() |
90d5501ec8 |
2
.gitignore
vendored
2
.gitignore
vendored
@ -2,6 +2,8 @@
|
||||
**/*lock*
|
||||
**/*-slice*.csv
|
||||
**/*.zip
|
||||
**/*.html
|
||||
**/*.htm
|
||||
/ALL-SENATORS-LONG.csv
|
||||
/ALL-SENATORS.csv
|
||||
/collect2.py
|
||||
|
1
.vscode/.gitignore
vendored
Normal file
1
.vscode/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
/settings.json
|
123
ClassificationFake.py
Normal file
123
ClassificationFake.py
Normal file
@ -0,0 +1,123 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
||||
from datasets import load_dataset
|
||||
from transformers.pipelines.pt_utils import KeyDataset
|
||||
|
||||
|
||||
#%%
|
||||
# prepare & define paths
|
||||
# install xformers (pip install xformers) for better performance
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Name of file that all senator data will be written to
|
||||
senCSV = "Tweets-Classified-Topic-Results.csv"
|
||||
|
||||
# Name of Classify datafile
|
||||
senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"
|
||||
senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"
|
||||
|
||||
# don't change this one
|
||||
senCSVPath = wd + ud + senCSV
|
||||
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
|
||||
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
|
||||
|
||||
import sys
|
||||
funs = wd+"funs"
|
||||
sys.path.insert(1, funs)
|
||||
import CleanTweets
|
||||
|
||||
|
||||
#%%
|
||||
# get dataframe
|
||||
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
|
||||
def encode_labels(label):
    """Invert a stored topic label.

    The topic results file stores its labels as the strings 'True' and
    'False'. ClassificationTopic.py notes that the labels it writes are
    still wrong and are corrected in this script; this function performs
    that correction by swapping the two values.

    Parameters
    ----------
    label : object
        One cell of the 'output_label_topicCov' column.

    Returns
    -------
    str or int
        'False' for 'True', 'True' for 'False', and the integer 0 for any
        other value (e.g. NaN or an unexpected label).
    """
    if label == 'True':
        return 'False'
    if label == 'False':
        return 'True'
    # Unknown values are mapped to integer 0 (not a string), as before.
    return 0
|
||||
# Flip the stored topic labels (see encode_labels).
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)

# Write the corrected labels back to the file they were read from. Use
# senCSVPath instead of a second hard-coded copy of the path so that
# switching `wd` (e.g. to the server directory) keeps input and output in sync.
# NOTE(review): this overwrites the input file, so every further run of this
# script flips the labels again, and the index written here comes back as an
# extra unnamed column on the next read. Run once per ClassificationTopic.py
# run - TODO confirm intended workflow.
dfClassify.to_csv(senCSVPath, encoding='utf-8')

# Keep only the tweets labelled as covid-related. `.copy()` makes the result
# an independent frame, so the column assignments below do not act on a
# slice of the original frame.
dfClassify = dfClassify[dfClassify['output_label_topicCov'] == 'True'].copy()

# Initialise the fake column.
dfClassify['fake'] = False
|
||||
|
||||
|
||||
#%%
|
||||
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
|
||||
# HowTo:
|
||||
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
|
||||
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
|
||||
pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
|
||||
model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
|
||||
tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
|
||||
|
||||
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
|
||||
|
||||
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
|
||||
|
||||
|
||||
#%%
|
||||
# remove empty rows
|
||||
# Turn empty strings into NaN and drop those rows. The result is assigned
# back explicitly: calling replace(..., inplace=True) on a column obtained
# via attribute access is a chained in-place mutation that is not guaranteed
# to reach the parent frame.
dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
dfClassify = dfClassify.dropna(subset=['cleanContent'])
|
||||
|
||||
#%%
|
||||
timeStart = datetime.now() # start counting execution time
|
||||
|
||||
max_length = 128
|
||||
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
|
||||
#train.rename(columns={'target': 'labels'}, inplace=True)
|
||||
#train.head()
|
||||
|
||||
# %%
|
||||
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
|
||||
|
||||
#%%
|
||||
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
|
||||
|
||||
# %%from datetime import datetime
|
||||
|
||||
#from tqdm.auto import tqdm
|
||||
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
|
||||
# print(out)
|
||||
|
||||
#%%
|
||||
output_labels = []
|
||||
output_score = []
|
||||
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
|
||||
output_labels.append(out['label'])
|
||||
output_score.append(out['score'])
|
||||
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
|
||||
# Exactly the same output as before, but the content are passed
|
||||
# as batches to the model
|
||||
# %%
|
||||
dfClassify['output_label_fake'] = output_labels
|
||||
dfClassify['output_score_fake'] = output_score
|
||||
|
||||
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart  # timedelta

# Average over the number of tweets that were actually classified instead of
# a hard-coded 96; max(..., 1) avoids a division by zero on an empty run.
nClassified = len(output_labels)
timePerTweet = timeTotal / max(nClassified, 1)

# timeTotal is a timedelta and prints as h:mm:ss, not as a number of seconds.
print(f"Total classification execution time: {timeTotal} (h:mm:ss)")
print(f"Time per tweet classification: {timePerTweet}")
|
||||
|
||||
# %%
|
||||
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
|
||||
|
||||
# %%
|
123
ClassificationTopic.py
Normal file
123
ClassificationTopic.py
Normal file
@ -0,0 +1,123 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
||||
from datasets import load_dataset
|
||||
from transformers.pipelines.pt_utils import KeyDataset
|
||||
|
||||
|
||||
#%%
|
||||
# prepare & define paths
|
||||
# install xformers (pip install xformers) for better performance
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Name of file that all senator data will be written to
|
||||
senCSV = "SenatorsTweets-OnlyCov.csv"
|
||||
|
||||
# Name of Classify datafile
|
||||
senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv"
|
||||
senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv"
|
||||
|
||||
# don't change this one
|
||||
senCSVPath = wd + ud + senCSV
|
||||
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
|
||||
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
|
||||
|
||||
import sys
|
||||
funs = wd+"funs"
|
||||
sys.path.insert(1, funs)
|
||||
import CleanTweets
|
||||
|
||||
|
||||
#%%
|
||||
# get dataframe
|
||||
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
|
||||
|
||||
# dataframe from csv
|
||||
dfClassify['fake'] = False
|
||||
|
||||
|
||||
#%%
|
||||
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
|
||||
# HowTo:
|
||||
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
|
||||
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
|
||||
pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
|
||||
model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
|
||||
tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
|
||||
|
||||
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
|
||||
|
||||
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
|
||||
|
||||
#%%
|
||||
# remove empty rows
|
||||
# Turn empty strings into NaN and drop those rows. The result is assigned
# back explicitly: calling replace(..., inplace=True) on a column obtained
# via attribute access is a chained in-place mutation that is not guaranteed
# to reach the parent frame.
dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
dfClassify = dfClassify.dropna(subset=['cleanContent'])
|
||||
|
||||
#%%
|
||||
timeStart = datetime.now() # start counting execution time
|
||||
|
||||
max_length = 128
|
||||
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
|
||||
#train.rename(columns={'target': 'labels'}, inplace=True)
|
||||
#train.head()
|
||||
|
||||
# %%
|
||||
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
|
||||
|
||||
#%%
|
||||
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
|
||||
|
||||
# %%from datetime import datetime
|
||||
|
||||
#from tqdm.auto import tqdm
|
||||
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
|
||||
# print(out)
|
||||
|
||||
#%%
|
||||
output_labels = []
|
||||
output_score = []
|
||||
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
|
||||
output_labels.append(out['label'])
|
||||
output_score.append(out['score'])
|
||||
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
|
||||
# Exactly the same output as before, but the content are passed
|
||||
# as batches to the model
|
||||
# %%
|
||||
dfClassify['output_label_topicCov'] = output_labels
|
||||
dfClassify['output_score_topicCov'] = output_score
|
||||
|
||||
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart  # timedelta

# Average over the number of tweets that were actually classified instead of
# a hard-coded 96; max(..., 1) avoids a division by zero on an empty run.
nClassified = len(output_labels)
timePerTweet = timeTotal / max(nClassified, 1)

# timeTotal is a timedelta and prints as h:mm:ss, not as a number of seconds.
print(f"Total classification execution time: {timeTotal} (h:mm:ss)")
print(f"Time per tweet classification: {timePerTweet}")
|
||||
|
||||
# %%
|
||||
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
|
||||
|
||||
# %%
|
||||
## corrections
|
||||
def encode_labels(label):
    """Translate the pipeline's 'real'/'fake' labels into 'True'/'False'.

    NOTE(review): the comment that follows this correction step says the
    result is still wrong and is corrected in ClassificationFake.py, which
    swaps 'True' and 'False' again. The mapping is therefore kept exactly as
    it is so that both scripts stay in step.

    Parameters
    ----------
    label : object
        One cell of the 'output_label_topicCov' column.

    Returns
    -------
    str or int
        'True' for 'real', 'False' for 'fake', and the integer 0 for any
        other value.
    """
    if label == 'real':
        return 'True'
    if label == 'fake':
        return 'False'
    # Unknown values are mapped to integer 0 (not a string), as before.
    return 0
|
||||
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
|
||||
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
|
||||
#still wrong, will be corrected in ClassificationFake.py
|
||||
|
131
README.md
Normal file
131
README.md
Normal file
@ -0,0 +1,131 @@
|
||||
# Requirements
|
||||
|
||||
- python 3.10+
|
||||
- snscrape 0.6.2.20230321+ (see git repo in this folder)
|
||||
- transformers 4.31.0
|
||||
- numpy 1.23.5
|
||||
- pandas 2.0.3
|
||||
- scikit-learn 1.3.0
|
||||
- torch 2.0.1
|
||||
|
||||
# About
|
||||
|
||||
This collection of scripts scrapes the tweets of US senators posted between 2020-01-01T00:00:00Z and 2023-01-03T00:00:00Z, scrapes account data of the senators, prepares the tweets for the training of an NLP model, and trains two models: (1) one that classifies a tweet's topic as covid or non-covid and (2) one that classifies a tweet as either a "fake news" tweet or a "non-fake news" tweet.
|
||||
Training only works with a prepared dataset in which the tweets are pre-classified.
|
||||
More info in the comments of the scripts.
|
||||
Due to time constraints, most of the code is procedurally coded and ugly but effective.
|
||||
|
||||
# How to
|
||||
|
||||
Tested on Ubuntu 22.04.
|
||||
If needed, the virtual environment can be exported and sent to you.
|
||||
|
||||
All files in the folder data/IN have to exist in order to execute the scripts.
|
||||
Execute in the following order:
|
||||
|
||||
01 collect.py (see more for further info on scraping)
|
||||
02 collectSenData.py
|
||||
03 cleanTweets.py
|
||||
04 preTestClassification.py
|
||||
05 trainTopic.py
|
||||
06 trainFake.py
|
||||
07 ClassificationTopic.py
|
||||
08 ClassificationFake.py (reads Tweets-Classified-Topic-Results.csv, which is written by ClassificationTopic.py)
|
||||
|
||||
# Files & Folders
|
||||
|
||||
Datafiles are not included in the repository but can be found in the full package that can be downloaded from [here](https://ncloud.mischbeck.de/s/T4QcMDSfYSkadYC) (password protected).
|
||||
|
||||
```
|
||||
├── data
|
||||
│ ├── IN
|
||||
│ │ ├── counterKeywordsFinal.txt
|
||||
│ │ ├── counterKeywords.txt
|
||||
│ │ ├── keywords-raw.txt
|
||||
│ │ ├── keywords.txt
|
||||
│ │ ├── own_keywords.txt
|
||||
│ │ ├── pretest-tweets_fake.txt contains tweet ids for pretest
|
||||
│ │ ├── pretest-tweets_not_fake.txt contains tweet ids for pretest
|
||||
│ │ └── senators-raw.csv senator datafile
|
||||
│ ├── OUT
|
||||
│ │ ├── ALL-SENATORS-TWEETS.csv
|
||||
│ │ ├── graphs
|
||||
│ │ │ ├── Timeline.png
|
||||
│ │ │ ├── Wordcloud-All.png
|
||||
│ │ │ └── Wordcloud-Cov.png
|
||||
│ │ ├── Pretest-Prep.csv
|
||||
│ │ ├── Pretest-Results.csv
|
||||
│ │ ├── Pretest-SENATORS-TWEETS.csv
|
||||
│ │ ├── profiles dataset profiles
|
||||
│ │ │ ├── AllTweets.html
|
||||
│ │ │ └── CovTweets.html
|
||||
│ │ ├── SenatorsTweets-Final.csv
|
||||
│ │ ├── SenatorsTweets-OnlyCov.csv
|
||||
│ │ ├── SenatorsTweets-train-CovClassification.csv
|
||||
│ │ ├── SenatorsTweets-train-CovClassificationTRAIN.csv
|
||||
│ │ ├── SenatorsTweets-train-CovClassification.tsv
|
||||
│ │ ├── SenatorsTweets-train-FakeClassification.csv
|
||||
│ │ ├── SenatorsTweets-train-FakeClassificationTRAIN.csv
|
||||
│ │ ├── SenatorsTweets-train-FakeClassification.tsv
|
||||
│ │ ├── SenatorsTweets-Training.csv
|
||||
│ │ ├── SenatorsTweets-Training_WORKING-COPY.csv
|
||||
│ │ ├── topClass-PRETEST-Prep.csv
|
||||
│ │ ├── topClass-PRETEST-Results.csv
|
||||
│ │ ├── Tweets-All-slices.zip
|
||||
│ │ ├── Tweets-Classified-Fake-Prep.csv
|
||||
│ │ ├── Tweets-Classified-Fake-Results.csv
|
||||
│ │ ├── Tweets-Classified-Prep.csv
|
||||
│ │ ├── Tweets-Classified-Topic-Prep.csv
|
||||
│ │ ├── Tweets-Classified-Topic-Results.csv
|
||||
│ │ └── Tweets-Stub.csv
|
||||
├── funs
|
||||
│ ├── CleanTweets.py multiple functions to clean tweet contents for NLP-processing
|
||||
│ ├── ClearDupes.py function for deletion of duplicate keywords
|
||||
│ ├── __init__.py
|
||||
│ ├── Scrape.py scraper functions to be used for multiprocessing
|
||||
│ └── TimeSlice.py time slice script to slice the time span in 24 slices, speeds up scraping through multiprocessing
|
||||
├── log logs of the scraping process
|
||||
│ ├── log_2023-06-23_21-06-10_err.log
|
||||
│ ├── log_2023-06-23_21-06-10.log
|
||||
│ └── log_2023-06-23_21-06-10_missing.log
|
||||
├── models
|
||||
│ ├── CovClass Covid tweet classification model
|
||||
│ │ └── 2023-08-15_05-56-50
|
||||
│ │ ├── 2023-08-15_05-56-50.csv training output
|
||||
│ │ ├── config.json
|
||||
│ │ ├── pytorch_model.bin
|
||||
│ │ ├── special_tokens_map.json
|
||||
│ │ ├── tokenizer_config.json
|
||||
│ │ ├── tokenizer.json
|
||||
│ │ └── vocab.txt
|
||||
│ └── FakeClass Fake tweet classification model
|
||||
│ └── 2023-08-15_14-35-43
|
||||
│ ├── 2023-08-15_14-35-43.csv training output
|
||||
│ ├── config.json
|
||||
│ ├── pytorch_model.bin
|
||||
│ ├── special_tokens_map.json
|
||||
│ ├── tokenizer_config.json
|
||||
│ ├── tokenizer.json
|
||||
│ └── vocab.txt
|
||||
├── snscrape contains snscrape 0.6.2.20230321+ git repo
|
||||
├── ClassificationFake.py classifies tweets as fake or non-fake, saves:
|
||||
│ Tweets-Classified-Fake-Prep.csv - prepared classification input (id and cleaned tweet text)
|
||||
│ Tweets-Classified-Fake-Results.csv - covid tweets from Tweets-Classified-Topic-Results.csv with fake classification results
|
||||
├── ClassificationTopic.py classifies tweet topic, saves:
|
||||
│ Tweets-Classified-Topic-Prep.csv - prepared classification input (id and cleaned tweet text)
|
||||
│ Tweets-Classified-Topic-Results.csv - SenatorsTweets-OnlyCov.csv with cov classification results
|
||||
├── cleanTweets.py Curates keywordlists
|
||||
│ Merges senator and tweet datasets
|
||||
│ Creates multiple datasets:
|
||||
│ SenatorsTweets-Final.csv - all tweets with keyword columns
|
||||
│ SenatorsTweets-OnlyCov.csv - only covid tweets, filtered by keywordlist
|
||||
│ SenatorsTweets-Training.csv - training dataset, containing ~1800 randomly selected tweets from SenatorsTweets-OnlyCov.csv
|
||||
├── collect.py scrapes tweets, saves to ALL-SENATORS-TWEETS.csv
|
||||
├── collectSenData.py scrapes senator account data, saves to ALL-SENATORS.csv
|
||||
├── createGraphs.py creates wordcloud & timeline graphs
|
||||
├── preTestClassification.py pretest script that uses bvrau/covid-twitter-bert-v2-struth to analyze 100 preclassified tweets
|
||||
├── profiler.py creates dataset profiles
|
||||
├── README.md readme
|
||||
├── trainFake.py training script for the fake tweet classification model
|
||||
└── trainTopic.py training script for the tweet topic classification model
|
||||
```
|
233
cleanTweets.py
Normal file
233
cleanTweets.py
Normal file
@ -0,0 +1,233 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Mon Jun 26 20:36:43 2023
|
||||
|
||||
@author: michael
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
# import pyreadstat
|
||||
import numpy as np
|
||||
import sys
|
||||
|
||||
|
||||
# Seed for training dataset generation
|
||||
seed = 86431891
|
||||
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Name of file that all senator data will be written to
|
||||
senCSV = "ALL-SENATORS-TWEETS.csv"
|
||||
|
||||
# Name of the senator dataset that is read from the input directory
|
||||
senDataset = "senators-raw.csv"
|
||||
|
||||
# Name of new datafile generated
|
||||
senCSVc = "SenatorsTweets-Final"
|
||||
senCSVcCov = "SenatorsTweets-OnlyCov"
|
||||
senCSVcTrain = "SenatorsTweets-Training"
|
||||
|
||||
# don't change this one
|
||||
senCSVPath = wd + ud + senCSV
|
||||
senCSVcPath = wd + ud + senCSVc + ".csv"
|
||||
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
|
||||
senCSVcTrainPath = wd + ud + senCSVcTrain + ".csv"
|
||||
senSAVcPath = wd + ud + senCSV + ".sav"
|
||||
senDTAcPath = wd + ud + senCSV + ".dta"
|
||||
senDatasetPath = wd + di + senDataset
|
||||
|
||||
df = pd.read_csv(senCSVPath, dtype=(object))
|
||||
|
||||
## Import own functions
|
||||
funs = wd+"funs"
|
||||
sys.path.insert(1, funs)
|
||||
from ClearDupes import deDupe
|
||||
|
||||
mixed_columns = df.columns[df.nunique() != len(df)]
|
||||
print(mixed_columns)
|
||||
|
||||
df = df.drop(columns=['user.url', 'cashtags', 'coordinates', 'hashtags', 'Unnamed: 0', 'user.verified', 'lang', 'renderedContent', 'retweetedTweet', 'sourceLabel', 'sourceUrl', 'source'], index=1)
|
||||
del df[df.columns[0]] # remove first col
|
||||
|
||||
df['user.created'] = pd.to_datetime(df['user.created'])
|
||||
df['date'] = pd.to_datetime(df['date'])
|
||||
|
||||
#%%
|
||||
# sort and generate id
|
||||
df = df.sort_values(by='date').reset_index() # sort df by date before generating id
|
||||
df["tid"] = df.index + 1 # create id column
|
||||
|
||||
#%%
|
||||
# move id column to front
|
||||
cols = list(df.columns.values) # Make a list of all of the columns in the df
|
||||
cols.pop(cols.index('tid')) # Remove id from list
|
||||
#cols.pop(cols.index('user')) # Remove id from list
|
||||
df = df[['tid']+cols] # Create new dataframe with ordered colums
|
||||
|
||||
#%%
|
||||
###################
|
||||
# Keywords
|
||||
# read additional keywords from a file and write to list.
|
||||
keywords = []
|
||||
# Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
|
||||
deDupe(f"{di}keywords-raw.txt", f"{di}keywords.txt")
|
||||
# Read the keywords from a file
|
||||
with open(f"{di}own_keywords.txt", "r") as file:
|
||||
lines = file.readlines()
|
||||
for line in lines:
|
||||
keyword = line.strip() # Remove the newline character
|
||||
keywords.append(keyword)
|
||||
# write all keywords to file
|
||||
with open(f"{di}keywords-raw.txt", "r") as file:
|
||||
lines = file.readlines()
|
||||
for line in lines:
|
||||
keyword = line.strip() # Remove the newline character
|
||||
keywords.append(keyword)
|
||||
|
||||
# delete keywords ppe and china that lead to too many false positives
|
||||
removeWords = {'ppe', 'china'}
|
||||
keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
|
||||
keywords = [item for item in keywords if item not in removeWords ] # removes words
|
||||
|
||||
with open(f"{di}keywords.txt", "w") as file:
|
||||
print("read keyword files")
|
||||
for line in keywords:
|
||||
file.write(f'{line}\n')
|
||||
|
||||
# counter keywords
|
||||
# Read the keywords from a file
|
||||
counterKeywords = []
|
||||
with open(f"{di}counterKeywords.txt", "r") as file:
|
||||
lines = file.readlines()
|
||||
for line in lines:
|
||||
counterKeyword = line.strip() # Remove the newline character
|
||||
counterKeywords.append(counterKeyword)
|
||||
counterKeywords = set([x.lower() for x in counterKeywords]) # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
|
||||
with open(f"{di}counterKeywordsFinal.txt", "w") as file:
|
||||
print("read keyword files")
|
||||
for line in counterKeywords:
|
||||
file.write(f'{line}\n')
|
||||
|
||||
#%%
|
||||
# overwrite keyword column
|
||||
df['keywords'] = np.nan
|
||||
df['keywords'] = (
|
||||
df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
|
||||
)
|
||||
df['counterKeywords'] = np.nan
|
||||
df['counterKeywords'] = (
|
||||
df['rawContent'].str.lower().str.findall('|'.join(counterKeywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
|
||||
)
|
||||
#%%
|
||||
# create boolean contains_keyword column
|
||||
df['contains_keyword'] = True
|
||||
df['contains_counterKeyword'] = True
|
||||
mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none'
|
||||
df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask
|
||||
mask = (df['counterKeywords'].isna()) # select all values in contains_keyword == 'none'
|
||||
df.loc[mask,'contains_counterKeyword'] = False # set keywords = contains_keyword under the condition of mask
|
||||
|
||||
#%%
|
||||
pd.Series(df["user.id"]).is_unique
|
||||
|
||||
#%%
|
||||
# Merge Datasets
|
||||
# get senator data
|
||||
cols = [
|
||||
"name",
|
||||
"id",
|
||||
"state_short",
|
||||
"party",
|
||||
"class",
|
||||
"ideology",
|
||||
"start_serving",
|
||||
"end_serving",
|
||||
"time_in_office",
|
||||
"not_in_office",
|
||||
"last_congress",
|
||||
"vote_share",
|
||||
"next_closest_share",
|
||||
"election_year",
|
||||
"twitter_handle",
|
||||
"alt_handle",
|
||||
"date_of_birth",
|
||||
"female",
|
||||
"ethnicity",
|
||||
"edu_level",
|
||||
"edu_information",
|
||||
"occup_level"]
|
||||
|
||||
dfSenA = pd.read_csv(senDatasetPath, index_col=False, sep = ",", usecols=cols).reset_index()
|
||||
dfSenB = pd.read_csv(senDatasetPath, index_col=False, sep = ",", usecols=cols).reset_index()
|
||||
|
||||
dfSenA['alt'] = False
|
||||
dfSenB['alt'] = True
|
||||
|
||||
dfSenA = dfSenA.rename(columns={'twitter_handle': 'user.username'})
|
||||
dfSenB = dfSenB.rename(columns={'alt_handle': 'user.username'})
|
||||
dfSenB = dfSenB.dropna(axis=0, subset=['user.username'])
|
||||
|
||||
dfSenA['user.username'] = dfSenA['user.username'].apply(str.lower)
|
||||
dfSenB['user.username'] = dfSenB['user.username'].apply(str.lower)
|
||||
df['user.username'] = df['user.username'].apply(str.lower)
|
||||
|
||||
dfSenAll = pd.concat([dfSenA, dfSenB]).reset_index()
|
||||
|
||||
# %%
|
||||
# see if all senators are present in file
|
||||
dfAll = df.merge(dfSenAll, how='left',on='user.username')
|
||||
#check merge
|
||||
unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique()
|
||||
print(unique_usernames)
|
||||
# senatorisakson was dropped, is ok
|
||||
#%%
|
||||
# create covidtweets csv
|
||||
dfCov = dfAll[dfAll['contains_counterKeyword']==False]
|
||||
dfCov = dfCov[dfCov['contains_keyword']==True]
|
||||
dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
|
||||
|
||||
#%%
|
||||
# create column with tweet length
|
||||
|
||||
dfCov['tweetLen'] = dfCov['rawContent'].str.len().copy()
|
||||
|
||||
# reset df index and write to id column
|
||||
dfCov.reset_index(drop=True, inplace=True)
|
||||
|
||||
#%%
|
||||
# Export to csv, sav and dta
|
||||
dfAll.to_csv(senCSVcPath, encoding='utf-8')
|
||||
dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id')
|
||||
# pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb
|
||||
# =============================================================================
|
||||
# dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
|
||||
# dfAllStata = dfAll.rename(columns={'class':'class_'})
|
||||
# dfAllStata.to_stata(senDTAcPath, version=119, convert_strl=['alt'], convert_dates={'date': 'td', 'user_created': 'td'})
|
||||
# print(dfAllStata.columns)
|
||||
# ====================================================df.id.str.len().value_counts()
|
||||
# =========================
|
||||
|
||||
# %%
# Create training dataset
# A leftover draft cell that called the non-existent `pd.dfCov(...)` used to
# precede this one; it raised AttributeError and stopped the script before
# the training file was written, so it has been dropped.
np.random.seed(seed)  # fixed seed keeps the drawn sample reproducible

# np.random.choice(..., replace=False) raises if more rows are requested than
# exist, so cap the sample size at the number of available covid tweets.
nTrain = min(1800, len(dfCov))
dfTrain = dfCov.loc[np.random.choice(dfCov.index, nTrain, replace=False)]

# .copy() so the columns added below are set on an independent frame rather
# than on a slice of dfCov.
dfTrain = dfTrain[['tid', 'date', 'rawContent']].copy()

# Default values for the columns that are later classified by hand.
dfTrain['topicCovid'] = True
dfTrain['fake'] = False
dfTrain.to_csv(senCSVcTrainPath, encoding='utf-8')
|
17
collect.py
17
collect.py
@ -66,7 +66,6 @@ which is the final output.
|
||||
import os
|
||||
import pandas as pd
|
||||
import glob
|
||||
import time
|
||||
import sys
|
||||
from datetime import datetime
|
||||
import concurrent.futures
|
||||
@ -91,7 +90,7 @@ file_alltweets = "ALL-SENATORS-TWEETS.csv"
|
||||
path_to_tweetdfs = wd + td
|
||||
|
||||
# Name of logfile
|
||||
logfile = wd+"log/log_"
|
||||
logfile = f"{wd}log/log_"
|
||||
|
||||
###################
|
||||
# Define Timespan & time-format
|
||||
@ -149,10 +148,12 @@ tweetDFColumns = [
|
||||
################## do NOT change anything below this line ###################
|
||||
#############################################################################
|
||||
|
||||
## Import functions
|
||||
from funs.TimeSlice import *
|
||||
from funs.ClearDupes import deDupe
|
||||
from funs.Scrape import scrapeTweets
|
||||
## Import own functions
|
||||
funs = wd+"funs"
|
||||
sys.path.insert(1, funs)
|
||||
from TimeSlice import get_Tslices
|
||||
from ClearDupes import deDupe
|
||||
from Scrape import scrapeTweets
|
||||
|
||||
###################
|
||||
# Create logfile & log all outputs
|
||||
@ -251,7 +252,7 @@ with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.log", "w"
|
||||
if file not in tweetfiles:
|
||||
fout.write(f'Missing: {file}.\n') # if file is not in tweetfiles, print error message.
|
||||
else:
|
||||
fout.write('all slices scraped.')
|
||||
fout.write(f'{file:<30}:all slices scraped.\n')
|
||||
|
||||
## Merge .csv files.
|
||||
# check if file_alltweets (previously scraped tweets that have been merged
|
||||
@ -272,6 +273,8 @@ if tweetfiles:
|
||||
fout.write(f.read())
|
||||
os.chdir(wd) # go back to wd
|
||||
|
||||
###################
|
||||
# finish logging
|
||||
# Report timing info.
|
||||
timeEndMerge = datetime.now()
|
||||
print("---")
|
||||
|
166
collectSenData.py
Normal file
166
collectSenData.py
Normal file
@ -0,0 +1,166 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Thu Jun 23 21:49:11 2023
|
||||
|
||||
@author: Michael
|
||||
|
||||
collectSenData.py scrapes accounts of senators for the following data: the
|
||||
number of followers, the number of users the twitter account is following,
|
||||
and how long the twitter account has existed.
|
||||
|
||||
# Requirements:
|
||||
- snscrape 0.6.2.20230321+
|
||||
- pandas 2.0+
|
||||
# IMPORTANT:
|
||||
This script uses snscrape Version 0.6.2.20230321.dev50+g0d824ab which is
|
||||
included in 'snscrape/' as a git repository for better reproducibility. Earlier
|
||||
versions of snscrape will most likely fail to scrape all tweets because of
|
||||
certain rate limits or other errors that may occur.
|
||||
Install snscrape from the local git repo to make sure that it matches the version used.
|
||||
If snscrape should be installed from the local repo, run the following lines:
|
||||
|
||||
import subprocess
|
||||
os.chdir('snscrape/')
|
||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
|
||||
os.chdir(wd)
|
||||
|
||||
|
||||
# How to use:
|
||||
"""
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
import glob
|
||||
import time
|
||||
import sys
|
||||
from datetime import datetime
|
||||
import concurrent.futures
|
||||
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Name of file that all senator data will be written to
|
||||
senCSV = "ALL-SENATORS.csv"
|
||||
|
||||
# don't change this one
|
||||
senCSVPath = wd + ud + senCSV
|
||||
|
||||
# Name of logfile
|
||||
logfile = wd+"log/UserLog_"
|
||||
|
||||
###################
|
||||
# Define Timespan & time-format
|
||||
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
|
||||
ts_beg = "2020-01-01T00:00:00Z" # start of scraping
|
||||
ts_end = "2023-01-03T00:00:00Z" # end of straping
|
||||
no_slices = 24 # Number of slices / time periods.
|
||||
|
||||
# file time format
|
||||
fTimeFormat = "%Y-%m-%d_%H-%M-%S"
|
||||
|
||||
# Maximum tweets to be scraped by snscrape. Can be left untouched.
|
||||
maxTweets = 5000
|
||||
|
||||
# Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
|
||||
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
|
||||
# get subparams just like in user where user id can be obtained by user.id
|
||||
userDFColumns = [
|
||||
"id",
|
||||
"username",
|
||||
"followersCount",
|
||||
"friendsCount",
|
||||
"verified",
|
||||
"created"
|
||||
]
|
||||
|
||||
#############################################################################
|
||||
################## do NOT change anything below this line ###################
|
||||
#############################################################################
|
||||
|
||||
from funs.Scrape import scrapeUsers, getHandles, printHandles
|
||||
from funs.TimeSlice import convertTime
|
||||
|
||||
|
||||
###################
|
||||
# Create logfile & log all outputs
|
||||
# there are three logfile types to be found in /log.
|
||||
# should be self explanatory.
|
||||
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".log"
|
||||
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".log"
|
||||
sys.stderr = open(logfileErrors, "w")
|
||||
sys.stdout = open(logfilen, "w")
|
||||
|
||||
|
||||
###################
|
||||
# Senator Accounts
|
||||
# Get accounts & alt-accounts from Senators-Datafile
|
||||
accounts = getHandles(di)
|
||||
|
||||
# Print accounts to be scraped
|
||||
print(printHandles(accounts))
|
||||
|
||||
###################
|
||||
# Scraping
|
||||
# report time:
|
||||
timeStartScrape = datetime.now()
|
||||
print("Starting scraping at:")
|
||||
print(timeStartScrape.strftime(fTimeFormat))
|
||||
print("---")
|
||||
|
||||
# Scrape every account in parallel: one scrapeUsers task per handle is handed
# to a process pool, and the results are gathered in completion order (which
# is not necessarily the order of `accounts`).
listUsers = []
with concurrent.futures.ProcessPoolExecutor() as executor:
    # Schedule all scraping tasks up front and keep their futures.
    tasks = [
        executor.submit(scrapeUsers, handle, userDFColumns)
        for handle in accounts
    ]

    # Block until each task has finished; .result() re-raises any exception
    # that occurred inside the worker process.
    for finished in concurrent.futures.as_completed(tasks):
        listUsers.append(finished.result())

# One row per scraped account, written to the senator CSV.
dfUsers = pd.DataFrame(listUsers, columns=userDFColumns)
dfUsers.to_csv(senCSVPath, encoding='utf-8')
|
||||
|
||||
# report time:
|
||||
timeEndScrape = datetime.now()
|
||||
print("---")
|
||||
print("End of scraping at:")
|
||||
print(timeEndScrape.strftime(fTimeFormat))
|
||||
|
||||
# Report timing info.
|
||||
timeEndMerge = datetime.now()
|
||||
print("---")
|
||||
print("End of scraping at:")
|
||||
print(timeEndMerge.strftime(fTimeFormat))
|
||||
print("---")
|
||||
# calculate times:
|
||||
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # total execution time
|
||||
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) # scraping time
|
||||
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) # merge time
|
||||
print(
|
||||
f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
|
||||
)
|
||||
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
|
||||
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
|
||||
|
||||
print(listUsers)
|
||||
# close connection to logfiles.
|
||||
sys.stdout.close()
|
||||
sys.stderr.close()
|
144
createGraphs.py
Normal file
144
createGraphs.py
Normal file
@ -0,0 +1,144 @@
|
||||
#%%
|
||||
#!/usr/bin/env python3
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from wordcloud import WordCloud
|
||||
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
|
||||
import string
|
||||
#%%
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Mon Jun 26 20:36:43 2023
|
||||
|
||||
@author: michael
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
# import pyreadstat
|
||||
# import numpy as np
|
||||
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Name of file that all senator data will be written to
|
||||
senCSV = "SenatorsTweets-OnlyCov.csv" # SenatorsTweets-Final.csv SenatorsTweets-OnlyCov.csv
|
||||
|
||||
# Name of file that all senator data will be written to
|
||||
senDataset = "senators-raw.csv"
|
||||
|
||||
# Name of new datafile generated
|
||||
senCSVc = "SenatorsTweets-Final.csv"
|
||||
senCSVcCov = "SenatorsTweets-OnlyCov.csv"
|
||||
|
||||
# Outfiles
|
||||
wcAllTweetsF = "graphs/Wordcloud-All.png"
|
||||
wcCovTweetsF = "graphs/Wordcloud-Cov.png"
|
||||
TwCovTimeline = "graphs/Timeline.png"
|
||||
|
||||
# don't change this one
|
||||
senCSVcPath = wd + ud + senCSVc
|
||||
senCSVcCovPath = wd + ud + senCSVcCov
|
||||
wcAllTweetsFPath = wd + ud + wcAllTweetsF
|
||||
wcCovTweetsFPath = wd + ud + wcCovTweetsF
|
||||
TwCovTimelinePath = wd + ud + TwCovTimeline
|
||||
|
||||
#%%
|
||||
# Load the full tweet file and the Covid-only tweet file. dtype=object keeps
# pandas from inferring column types.
df = pd.read_csv(senCSVcPath, dtype=object)
dfCov = pd.read_csv(senCSVcCovPath, dtype=object)

#%%
# Apply the funs.CleanTweets helpers to 'rawContent' of both frames, always in
# the same order, and store the outcome in a new 'cleanContent' column.
cleaningSteps = (remove_URL, remove_emoji, remove_html, remove_punct)
for frame in (df, dfCov):
    cleaned = frame['rawContent']
    for step in cleaningSteps:
        cleaned = cleaned.apply(step)
    frame['cleanContent'] = cleaned

#%%
# Concatenate the cleaned tweets of each frame into a single space-separated,
# case-folded string (the input text for the word clouds).
allContent = df['cleanContent'].astype(str)
str_alltweets = allContent.str.cat(sep=' ').casefold()

covContent = dfCov['cleanContent'].astype(str)
str_covtweets = covContent.str.cat(sep=' ').casefold()

#%%
# Replace stand-alone "u" and "s" tokens by a single space
# (presumably leftovers of "U.S." after punctuation removal - TODO confirm).
for token in (' u ', ' s '):
    str_covtweets = str_covtweets.replace(token, ' ')
    str_alltweets = str_alltweets.replace(token, ' ')
|
||||
|
||||
|
||||
# %%
|
||||
# Both word clouds share the same rendering options.
cloudOptions = dict(background_color="white", width=1000, height=1000, repeat=True)

# word cloud built from all tweets
wcA = WordCloud(**cloudOptions)
wcA.generate(str_alltweets)

#%%
# draw the all-tweets cloud; the figure handle is kept before plt.show() so
# the same figure can still be written to disk afterwards
fig1 = plt.figure(figsize=(20, 20))
plt.axis("off")
plt.imshow(wcA, interpolation="bilinear")
plt.show()
fig1.savefig(wcAllTweetsFPath)

# %%
# word cloud built from the Covid-only tweets
wcC = WordCloud(**cloudOptions)
wcC.generate(str_covtweets)

#%%
# draw the Covid-only cloud
fig2 = plt.figure(figsize=(20, 20))
plt.axis("off")
plt.imshow(wcC, interpolation="bilinear")
plt.show()
fig2.savefig(wcCovTweetsFPath)
|
||||
# %%
|
||||
# with open('test.txt', 'w') as f:
|
||||
# f.write(str_covtweets)
|
||||
# %%
|
||||
# Build one row per tweet with a constant 'count' column, so that grouping by
# day yields the number of tweets posted on that day.
dfT = pd.DataFrame()
dfT['date'] = df['date'].copy()
dfT['count'] = 1

dfCovT = pd.DataFrame()
dfCovT['date'] = dfCov['date'].copy()
dfCovT['count'] = 1

#%%
# Reduce every timestamp to its calendar day, then parse the day back into a
# real datetime. Leaving the days as strings would put both series on a
# categorical axis: days that only occur in the all-tweets series would be
# appended after the last Covid day (out of chronological order) and the date
# locators below could not place month ticks.
dfT['date'] = pd.to_datetime(pd.to_datetime(dfT['date']).dt.strftime('%Y-%m-%d'))
dfCovT['date'] = pd.to_datetime(pd.to_datetime(dfCovT['date']).dt.strftime('%Y-%m-%d'))

#%%
# tweets per day
dfT = dfT.groupby('date').count().reset_index()
dfCovT = dfCovT.groupby('date').count().reset_index()

#%%
import matplotlib.dates as mdates

# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names; use whichever one is installed.
darkgridStyle = 'seaborn-v0_8-darkgrid'
if darkgridStyle not in plt.style.available:
    darkgridStyle = 'seaborn-darkgrid'
plt.style.use(darkgridStyle)

# n of tweets overall (opaque line) vs. Covid-only tweets (faded line)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(dfCovT['date'], dfCovT['count'], marker='', color='tab:blue', linewidth=1, alpha=0.4)
ax.plot(dfT['date'], dfT['count'], marker='', color='tab:blue', linewidth=1, alpha=1)

# major tick every third month, minor tick every month
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.xaxis.set_minor_locator(mdates.MonthLocator())
fig.autofmt_xdate()
fig.savefig(TwCovTimelinePath)
|
||||
|
||||
|
||||
# %%
|
23
data/IN/counterKeywords.txt
Normal file
23
data/IN/counterKeywords.txt
Normal file
@ -0,0 +1,23 @@
|
||||
opioid
|
||||
gun violence
|
||||
gun-violence
|
||||
CHD
|
||||
Coronary heart disease
|
||||
addiction
|
||||
tobacco
|
||||
vaping
|
||||
e-cigarette
|
||||
shooting
|
||||
indigenous women
|
||||
overdose
|
||||
meth
|
||||
cocaine
|
||||
separated children
|
||||
separating children
|
||||
separating families
|
||||
Muslim travel ban
|
||||
flu-season
|
||||
flu season
|
||||
Soleimani
|
||||
Muslim Ban
|
||||
USMCA trade deal
|
23
data/IN/counterKeywordsFinal.txt
Normal file
23
data/IN/counterKeywordsFinal.txt
Normal file
@ -0,0 +1,23 @@
|
||||
meth
|
||||
gun violence
|
||||
flu season
|
||||
vaping
|
||||
chd
|
||||
addiction
|
||||
indigenous women
|
||||
separating children
|
||||
tobacco
|
||||
e-cigarette
|
||||
muslim ban
|
||||
soleimani
|
||||
cocaine
|
||||
separating families
|
||||
muslim travel ban
|
||||
usmca trade deal
|
||||
shooting
|
||||
overdose
|
||||
separated children
|
||||
coronary heart disease
|
||||
gun-violence
|
||||
opioid
|
||||
flu-season
|
190
data/IN/keywords.txt
Normal file
190
data/IN/keywords.txt
Normal file
@ -0,0 +1,190 @@
|
||||
plandemic
|
||||
scamdemic
|
||||
wuhan flu
|
||||
wuhanflu
|
||||
corona
|
||||
coronavirusoutbreak
|
||||
pandemic
|
||||
epidemic
|
||||
vax
|
||||
antivax
|
||||
antivaxxers
|
||||
wearamask
|
||||
masksoff
|
||||
cdc
|
||||
ncov
|
||||
sars-cov-2
|
||||
socialdistancing
|
||||
wear a mask
|
||||
lockdown
|
||||
covd
|
||||
coronavirus
|
||||
koronavirus
|
||||
corona
|
||||
cdc
|
||||
wuhancoronavirus
|
||||
wuhanlockdown
|
||||
ncov
|
||||
wuhan
|
||||
n95
|
||||
kungflu
|
||||
epidemic
|
||||
outbreak
|
||||
sinophobia
|
||||
covid-19
|
||||
corona virus
|
||||
covid
|
||||
covid19
|
||||
sars-cov-2
|
||||
covidー19
|
||||
covd
|
||||
pandemic
|
||||
coronapocalypse
|
||||
canceleverything
|
||||
coronials
|
||||
socialdistancingnow
|
||||
social distancing
|
||||
socialdistancing
|
||||
panicbuy
|
||||
panic buy
|
||||
panicbuying
|
||||
panic buying
|
||||
14dayquarantine
|
||||
duringmy14dayquarantine
|
||||
panic shop
|
||||
panic shopping
|
||||
panicshop
|
||||
inmyquarantinesurvivalkit
|
||||
panic-buy
|
||||
panic-shop
|
||||
coronakindness
|
||||
quarantinelife
|
||||
chinese virus
|
||||
chinesevirus
|
||||
stayhomechallenge
|
||||
stay home challenge
|
||||
sflockdown
|
||||
dontbeaspreader
|
||||
lockdown
|
||||
lock down
|
||||
shelteringinplace
|
||||
sheltering in place
|
||||
staysafestayhome
|
||||
stay safe stay home
|
||||
trumppandemic
|
||||
trump pandemic
|
||||
flattenthecurve
|
||||
flatten the curve
|
||||
china virus
|
||||
chinavirus
|
||||
quarentinelife
|
||||
ppeshortage
|
||||
saferathome
|
||||
stayathome
|
||||
stay at home
|
||||
stay home
|
||||
stayhome
|
||||
getmeppe
|
||||
covidiot
|
||||
epitwitter
|
||||
pandemie
|
||||
wear a mask
|
||||
wearamask
|
||||
kung flu
|
||||
covididiot
|
||||
covid__19
|
||||
omicron
|
||||
variant
|
||||
vaccine
|
||||
travel ban
|
||||
corona
|
||||
corona
|
||||
coronavirus
|
||||
coronavirus
|
||||
covid
|
||||
covid
|
||||
covid19
|
||||
covid19
|
||||
covid-19
|
||||
covid-19
|
||||
sarscov2
|
||||
sarscov2
|
||||
sars cov2
|
||||
sars cov 2
|
||||
covid_19
|
||||
covid_19
|
||||
ncov
|
||||
ncov
|
||||
ncov2019
|
||||
ncov2019
|
||||
2019-ncov
|
||||
2019-ncov
|
||||
pandemic
|
||||
pandemic 2019ncov
|
||||
2019ncov
|
||||
quarantine
|
||||
quarantine
|
||||
flatten the curve
|
||||
flattening the curve
|
||||
flatteningthecurve
|
||||
flattenthecurve
|
||||
hand sanitizer
|
||||
handsanitizer
|
||||
lockdown
|
||||
lockdown
|
||||
social distancing
|
||||
socialdistancing
|
||||
work from home
|
||||
workfromhome
|
||||
working from home
|
||||
workingfromhome
|
||||
n95
|
||||
n95
|
||||
covidiots
|
||||
covidiots
|
||||
herd immunity
|
||||
herdimmunity
|
||||
pneumonia
|
||||
pneumonia
|
||||
chinese virus
|
||||
chinesevirus
|
||||
wuhan virus
|
||||
wuhanvirus
|
||||
kung flu
|
||||
kungflu
|
||||
wearamask
|
||||
wearamask
|
||||
wear a mask
|
||||
vaccine
|
||||
vaccines
|
||||
vaccine
|
||||
vaccines
|
||||
corona vaccine
|
||||
corona vaccines
|
||||
coronavaccine
|
||||
coronavaccines
|
||||
face shield
|
||||
faceshield
|
||||
face shields
|
||||
faceshields
|
||||
health worker
|
||||
healthworker
|
||||
health workers
|
||||
healthworkers
|
||||
stayhomestaysafe
|
||||
coronaupdate
|
||||
frontlineheroes
|
||||
coronawarriors
|
||||
homeschool
|
||||
homeschooling
|
||||
hometasking
|
||||
masks4all
|
||||
wfh
|
||||
wash ur hands
|
||||
wash your hands
|
||||
washurhands
|
||||
washyourhands
|
||||
stayathome
|
||||
stayhome
|
||||
selfisolating
|
||||
self isolating
|
20
data/IN/own_keywords.txt
Normal file
20
data/IN/own_keywords.txt
Normal file
@ -0,0 +1,20 @@
|
||||
plandemic
|
||||
scamdemic
|
||||
wuhan flu
|
||||
wuhanflu
|
||||
corona
|
||||
coronavirusoutbreak
|
||||
pandemic
|
||||
epidemic
|
||||
vax
|
||||
antivax
|
||||
antivaxxers
|
||||
wearamask
|
||||
masksoff
|
||||
cdc
|
||||
ncov
|
||||
sars-cov-2
|
||||
socialdistancing
|
||||
wear a mask
|
||||
lockdown
|
||||
covd
|
50
data/IN/pretest-tweets_fake.txt
Normal file
50
data/IN/pretest-tweets_fake.txt
Normal file
@ -0,0 +1,50 @@
|
||||
1486474031419297799
|
||||
1504880316506263552
|
||||
1264663210197745665
|
||||
1479500294887256069
|
||||
1320058585590734852
|
||||
1539003407096336388
|
||||
1481704942574395392
|
||||
1572014646374154240
|
||||
1524764580806811649
|
||||
1592940763515858944
|
||||
1554529221594292224
|
||||
1479488991347023876
|
||||
1481715928492609541
|
||||
1476722414100914179
|
||||
1478478958740086790
|
||||
1459285859358982148
|
||||
1475620600228028432
|
||||
1479459200229117955
|
||||
1448386057339297797
|
||||
1468993886316077063
|
||||
1448369102318362625
|
||||
1444354461799956482
|
||||
1431340411193331715
|
||||
1583474056011010048
|
||||
1450479481278406658
|
||||
1396992539010469894
|
||||
1396992534623174658
|
||||
1417920232333656076
|
||||
1439553348122861568
|
||||
1598398871990079489
|
||||
1502768541979881479
|
||||
1337604370981134336
|
||||
1417797808707473410
|
||||
1601693432292192256
|
||||
1598145048989704192
|
||||
1599906362380591110
|
||||
1325851780496961538
|
||||
1468908159330885632
|
||||
1468332389923311616
|
||||
1339703372505624577
|
||||
1468633243654451200
|
||||
1488290848907444240
|
||||
1491146722625880064
|
||||
1481766558313730053
|
||||
1503078235373985795
|
||||
1485398845718773762
|
||||
1371501907483754497
|
||||
1494398809245376513
|
||||
1436328255959801865
|
||||
1482862501461209089
|
50
data/IN/pretest-tweets_not_fake.txt
Normal file
50
data/IN/pretest-tweets_not_fake.txt
Normal file
@ -0,0 +1,50 @@
|
||||
1258402212327436288
|
||||
1489758168750174209
|
||||
1303698927766646785
|
||||
1257681474670809090
|
||||
1340109389672411136
|
||||
1303698924444803072
|
||||
1303698926902665218
|
||||
1337595387796983809
|
||||
1344441446515019777
|
||||
1385680800218324992
|
||||
1590129838261956608
|
||||
1303698928609697796
|
||||
1348715183502454793
|
||||
1340418291274289153
|
||||
1421228572732280835
|
||||
1456349962942533637
|
||||
1603457599877308416
|
||||
1278354646885687296
|
||||
1340418294579421188
|
||||
1365866032792039425
|
||||
1472722005657112578
|
||||
1381021635772350464
|
||||
1337598897217220609
|
||||
1354797645261398016
|
||||
1266806429282963456
|
||||
1429847265242460161
|
||||
1234272677633953792
|
||||
1301581247932772352
|
||||
1424832183148204043
|
||||
1339255967809212416
|
||||
1284831896988454912
|
||||
1463528081214394377
|
||||
1453679912938885122
|
||||
1583474059148337152
|
||||
1519791965113622528
|
||||
1470775155110682628
|
||||
1464615554103357450
|
||||
1337595385565638657
|
||||
1436055743418019840
|
||||
1572208051830104069
|
||||
1433765113891328002
|
||||
1482774656075534336
|
||||
1310288545886736384
|
||||
1353845938566156289
|
||||
1396992537202659329
|
||||
1455712525362810883
|
||||
1340384267327647747
|
||||
1338588364459618305
|
||||
1376696928692412419
|
||||
1340386565399429123
|
@ -1,112 +1,111 @@
|
||||
name,id,state,state_short,party,class,ideology,start_serving,end_serving,time_in_office,not_in_office,last_congress,vote_share,next_closest_share,election_year,twitter_url,twitter_handle,alt_account,alt_handle,date_of_birth,female, ethnicity,edu_level,edu_information,occup_level,website_url,bioguide_link,Comments_1,Comments_2
|
||||
"Alexander, Andrew L., Jr.",1,Tennessee,TN,0,2,0.681815808318192,01/07/2003,01/03/2021,18.0027397260274,1,116,61.9,31.8,2014,https://twitter.com/SenAlexander,SenAlexander,https://twitter.com/LamarAlexander ,LamarAlexander ,07/03/1940,0,White,8,J.D.; New York Univeristy; 1965,2,N/A,https://bioguide.congress.gov/search/bio/A000360,,
|
||||
"Enzi, Mike",2,Wyoming,WY,0,2,0.719285383539398,01/03/1997,01/03/2021,24,1,116,72.3,17.6,2014,https://twitter.com/senatorenzi?lang=zh-Hant ,SenatorEnzi,N/A,N/A,02/01/1944,0,White,7,M.B.A.; Retail Marketing; Denver University; 1968,4,N/A,https://bioguide.congress.gov/search/bio/E000285,,
|
||||
name,id,state,state_short,party,class,ideology,start_serving,end_serving,time_in_office,not_in_office,last_congress,vote_share,next_closest_share,election_year,twitter_url,twitter_handle,alt_account,alt_handle,date_of_birth,female,ethnicity,edu_level,edu_information,occup_level,website_url,bioguide_link,Comments_1,Comments_2
|
||||
"Alexander, Andrew L., Jr.",1,Tennessee,TN,0,2,0.681815808318192,01/07/2003,01/03/2021,18.0027397260274,1,116,61.9,31.8,2014,https://twitter.com/SenAlexander,SenAlexander,https://twitter.com/LamarAlexander,LamarAlexander,07/03/1940,0,White,8,J.D.; New York Univeristy; 1965,2,N/A,https://bioguide.congress.gov/search/bio/A000360,,
|
||||
"Enzi, Mike",2,Wyoming,WY,0,2,0.719285383539398,01/03/1997,01/03/2021,24,1,116,72.3,17.6,2014,https://twitter.com/senatorenzi,senatorenzi,N/A,N/A,02/01/1944,0,White,7,M.B.A.; Retail Marketing; Denver University; 1968,4,N/A,https://bioguide.congress.gov/search/bio/E000285,,
|
||||
"Gardner, Cory",3,Colorado,CO,0,2,0.719285383539398,01/06/2015,01/03/2021,5.9972602739726,1,116,48.5,46,2014,https://twitter.com/CoryGardner,CoryGardner,https://twitter.com/corygardner,corygardner,08/22/1974,0,White,8,"J.D.; University of Colorado, Boulder; 2001",2,N/A,https://bioguide.congress.gov/search/bio/G000562,,
|
||||
"Harris, Kamala",4,California ,CA,1,3,0.0213759569468058,01/03/2017,01/18/2021,4.04383561643836,1,116,62.4,37.6,2016,https://twitter.com/VP,VP,https://twitter.com/KamalaHarris,KamalaHarris,10/20/1964,1,African-American; Asian-American,8,J.D.; University of California; 1989,2,N/A,https://bioguide.congress.gov/search/bio/H001075,(became VP on jan 20 2021),
|
||||
"Isakson, John",5,Georgia,GA,0,3,*,01/03/2005,12/31/2019,14,1,116,55,40.8,2016,https://twitter.com/SenatorIsakson ,SenatorIsakson,N/A,N/A,12/28/1944,0,White,6,"University of Georgia, Athens; 1966",1,N/A,https://bioguide.congress.gov/search/bio/I000055,(died in 2019),
|
||||
"Jones, Gordon Douglas",6,Alabama,AL,1,2,0.632885678298333,01/03/2018,01/03/2021,3.0027397260274,1,116,49.9,48.4,2017,https://twitter.com/DougJones,DougJones,N/A,N/A,05/04/1954,0,White,8,"J.D.; Samford University, Cumberland School of Law; 1979",2,N/A,https://bioguide.congress.gov/search/bio/J000300/,special election to replace Jeff Sessions,
|
||||
"Loeffler, Kelly",7,Georgia,GA,0,2,0.904293903291947,01/06/2020,01/20/2021,1.04109589041096,1,116,N/A,N/A,*,https://twitter.com/KLoeffler,KLoeffler,https://twitter.com/senatorloeffler ,senatorloeffler ,11/27/1970,1,White,7,M.B.A.; Internationla Finance and Marketing; DePaul University Chicago; 1999,1,N/A,https://bioguide.congress.gov/search/bio/L000594,Appointed in 2019 after the resignation of Johnny Isakson but lost the 2020 election,
|
||||
"McSally, Martha",8,Arizona,AZ,0,2,*,01/03/2015,01/03/2019,1,1,116,N/A,N/A,*,https://twitter.com/MarthaMcSallyAZ,MarthaMcSallyAZ,https://twitter.com/marthamcsally,marthamcsally,03/22/1966,1,White,7,M.P.P.; John F. Kennedy School of Government,3,N/A,https://bioguide.congress.gov/search/bio/M001197,(left office Dec 2 2020),appointed in 2018 after death of John McCain but lot 2020 election
|
||||
"Perdue, David",9,Georgia,GA,0,2,0.914979462126755,01/06/2015,01/03/2021,5.9972602739726,1,116,53,45.1,2014,https://twitter.com/DavidPerdueGA,DavidPerdueGA,https://twitter.com/sendavidperdue,sendavidperdue,12/10/1949,0,White,7,M.S.; Georgia Institute of Technology; 1976,1,N/A,https://bioguide.congress.gov/search/bio/P000612,,
|
||||
"Roberts, Charles Patrick",10,Kansas,KS,0,2,0.822995787870405,01/07/1997,01/03/2021,24.0054794520548,1,116,53.3,42.5,2014,https://twitter.com/SenPatRoberts,SenPatRoberts,https://twitter.com/PatRoberts,PatRoberts,04/20/1936,0,White,6,"B.A.; Kansas State university, Manhattan; 1958",7,N/A,https://bioguide.congress.gov/search/bio/R000307,,
|
||||
"Udall, Tom",11,New Mexico,NM,1,2,0.259828450248573,01/06/2009,01/03/2021,12,1,116,55.4,44.6,2014,https://twitter.com/SenatorTomUdall,SenatorTomUdall,https://twitter.com/tomudall,tomudall,05/18/1948,0,White,8,"J.D.; University of New Mexico School of Law, Albuquerque, N.M.; 1977",2,N/A,https://bioguide.congress.gov/search/bio/U000039,,
|
||||
"Baldwin, Tammy",12,Wisconsin,WI,1,1,0.176999238019796,01/03/2013,12/31/2022,9.9972602739726,0,117,55.4,44.6,2018,https://twitter.com/SenatorBaldwin,SenatorBaldwin,https://twitter.com/tammybaldwin,tammybaldwin,02/11/1962,1,White,8,"J.D.; University of Wisconsin, Madison; 1989",2,https://www.baldwin.senate.gov/,https://bioguide.congress.gov/search/bio/B001230,,
|
||||
"Barrasso, John",13,Wyoming,WY,0,1,0.817902617377421,06/22/2007,12/31/2022,15.5369863013699,0,117,67.1,30.1,2018,https://twitter.com/SenJohnBarrasso,SenJohnBarrasso,https://twitter.com/barrassoforwyo,barrassoforwyo,07/21/1952,0,White,7,M.D.; Georgetown University School of Medicine; 1978,6,https://www.barrasso.senate.gov/,https://bioguide.congress.gov/search/bio/B001261,,
|
||||
"Bennet, Michael F.",14,Colorado,CO,1,3,0.248044568735702,01/21/2009,12/31/2022,13.9506849315069,0,117,49.1,45.4,2016,https://twitter.com/SenatorBennet,SenatorBennet,https://twitter.com/michaelbennet,michaelbennet,11/28/1964,0,White,8,J.D.; Yale Law School; 1993,2,https://www.bennet.senate.gov/,https://bioguide.congress.gov/search/bio/B001267,,
|
||||
"Blackburn, Marsha",15,Tennessee,TN,0,1,0.93228239890635,01/03/2019,12/31/2022,3.99452054794521,0,117,54.7,43.9,2018,https://twitter.com/MarshaBlackburn,MarshaBlackburn,N/A,N/A,06/06/1952,1,White,6,"B.S.; Home Economics; Mississippi State University, Starkville; 1973",1,https://www.blackburn.senate.gov/,https://bioguide.congress.gov/search/bio/B001243,,
|
||||
"Blumenthal, Richard",16,Connecticut,CT,1,3,0.0310655954121906,01/03/2010,12/31/2022,13,0,117,62.9,34.9,2016,https://twitter.com/SenBlumenthal,SenBlumenthal,N/A,N/A,02/13/1946,0,White,8,J.D.; Yale University; 1973,2,https://www.blumenthal.senate.gov/,https://bioguide.congress.gov/search/bio/B001277,,
|
||||
"Blunt, Roy",17,Missouri,MO,0,3,0.584409139223541,01/03/2011,12/31/2022,12,1,117,49.4,46.2,2016,https://twitter.com/RoyBlunt,RoyBlunt,N/A,N/A,01/10/1950,0,White,7,"M.A.; Missouri State University ,Springfield; 1972",5,N/A,https://bioguide.congress.gov/search/bio/B000575,,
|
||||
"Booker, Cory A.",18,New Jersey,NJ,1,2,0.0455802980872292,10/31/2013,12/31/2022,12,0,117,57.2,40.9,2020,https://twitter.com/senbooker,senbooker,https://twitter.com/CoryBooker,CoryBooker,04/27/1969,0,African-American; Asian-American,8,J.D.; Yale Law School; 1997,2,https://www.booker.senate.gov/,https://bioguide.congress.gov/search/bio/B001288,,
|
||||
"Boozman, John",19,Arkansas,AR,0,3,0.768699282926499,01/05/2011,12/31/2022,11.9945205479452,0,117,59.8,36.2,2016,https://twitter.com/JohnBoozman,JohnBoozman,N/A,N/A,12/10/1950,0,White,6,Southern College of Optometry; 1977,6,https://www.boozman.senate.gov/,https://bioguide.congress.gov/search/bio/B001236,,
|
||||
"Braun, Michael",20,Indiana,IN,0,1,0.98106874319906,01/03/2019,12/31/2022,3.99452054794521,0,117,50.9,45,2018,https://twitter.com/SenatorBraun,SenatorBraun,N/A,N/A,03/24/1954,0,White,7,M.B.A.; Harvard Business School; 1978,1,https://www.braun.senate.gov/,https://bioguide.congress.gov/search/bio/B001310,,
|
||||
"Brown, Sherrod",21,Ohio,OH,1,1,0.0923940264109351,01/04/2007,12/31/2022,16,0,117,53.4,46.6,2018,https://twitter.com/SenSherrodBrown,SenSherrodBrown,https://twitter.com/SherrodBrown,SherrodBrown,11/09/1952,0,White,7,M.a.; Education; Ohio State University; 1981,5,https://www.brown.senate.gov/,https://bioguide.congress.gov/search/bio/B000944,,
|
||||
"Burr, Richard",22,North Carolina,NC,0,3,0.605472891780936,01/03/2001,12/31/2022,22.0054794520548,1,117,51.1,45.3,2016,https://twitter.com/SenatorBurr,SenatorBurr,N/A,N/A,11/30/1955,0,White,6,B.A.; Communications; Wake Forest University; 1978,1,N/A,https://bioguide.congress.gov/search/bio/B001135,,
|
||||
"Cantwell, Maria",23,Washington,WA,1,1,0.216591445478212,01/03/2001,12/31/2022,22.0054794520548,0,117,58.4,41.6,2018,https://twitter.com/SenatorCantwell,SenatorCantwell,N/A,N/A,10/13/1958,1,White,6,B.A.; Public Administration; Miami University of Ohio; 1980,1,https://www.cantwell.senate.gov/,https://bioguide.congress.gov/search/bio/C000127,,
|
||||
"Capito, Shelley Moore",24,West Virginia,WV,0,2,0.61478303011512,01/06/2015,12/31/2022,7.98904109589041,0,117,70.3,27,2020,https://twitter.com/SenCapito,SenCapito,N/A,N/A,11/26/1953,1,White,7,M. Ed.; University of Virginia; 1976,5,https://www.capito.senate.gov/,https://bioguide.congress.gov/search/bio/C001047,,
|
||||
"Cardin, Benjamin L.",25,Maryland,MD,1,1,0.1994990268606,01/04/2007,12/31/2022,16,0,117,64.9,30.3,2018,https://twitter.com/SenatorCardin,SenatorCardin,N/A,N/A,10/05/1943,0,White,8,J.D.; University of Maryland; 1967,2,https://www.cardin.senate.gov/,https://bioguide.congress.gov/search/bio/C000141,,
|
||||
"Carper, Thomas R.",26,Delaware,DE,1,1,0.309479384969288,01/03/2001,12/31/2022,22.0054794520548,0,117,60,37.8,2018,https://twitter.com/SenatorCarper,SenatorCarper,N/A,N/A,01/23/1947,0,White,7,M.B.A.; University of Delaware; 1975,3,https://www.carper.senate.gov/,https://bioguide.congress.gov/search/bio/C000174,,
|
||||
"Casey, Robert P., Jr.",27,Pennsylvania,PA,1,1,0.171897216341815,01/04/2007,12/31/2022,16,0,117,55.7,42.6,2018,https://twitter.com/SenBobCasey,SenBobCasey,https://twitter.com/Bob_Casey,Bob_Casey,04/13/1960,0,White,8,J.D.; Catholic University of America; 1988,2,https://www.casey.senate.gov/,https://bioguide.congress.gov/search/bio/C001070,,
|
||||
"Cassidy, Bill",28,Louisiana,LA,0,2,0.682348710788942,01/06/2015,12/31/2022,7.98904109589041,0,117,59.3,19,2020,https://twitter.com/SenBillCassidy,SenBillCassidy,https://twitter.com/BillCassidy,BillCassidy,09/28/1957,0,White,7,M.D.; Louisiana State University; 1979,6,https://www.cassidy.senate.gov/,https://bioguide.congress.gov/search/bio/C001075,,
|
||||
"Collins, Susan M.",29,Maine,ME,0,2,0.448622425849401,01/07/1997,12/31/2022,25.9972602739726,0,117,51,42.4,2020,https://twitter.com/SenatorCollins,SenatorCollins,N/A,N/A,12/07/1952,1,White,6,Bachelor in Government; St. Lawrence University; 1975,0,https://www.collins.senate.gov/,https://bioguide.congress.gov/search/bio/C001035,,
|
||||
"Coons, Christopher A.",30,Delaware,DE,1,2,0.338422715351401,11/15/2010,12/31/2022,12.1342465753425,0,117,59.4,37.9,2020,https://twitter.com/ChrisCoons,ChrisCoons,N/A,N/A,09/09/1963,0,White,8,J.D.; Yale Law School; 1992,2,https://www.coons.senate.gov/,https://bioguide.congress.gov/search/bio/C001088,,
|
||||
"Cornyn, John",31,Texas,TX,0,2,0.772226738391321,11/30/2002,12/31/2022,20.0986301369863,0,117,53.5,43.9,2020,https://twitter.com/JohnCornyn,JohnCornyn,N/A,N/A,02/02/1952,0,White,8,J.D.; St. Mary<72>s School of Law; 1977,2,https://www.cornyn.senate.gov/,https://bioguide.congress.gov/search/bio/C001056,,
|
||||
"Cortez Masto, Catherine",32,Nevada,NV,1,3,0.236574567369409,01/03/2017,12/31/2022,5.99452054794521,0,117,47.1,44.7,2016,https://twitter.com/SenCortezMasto,SenCortezMasto,https://twitter.com/CortezMasto,CortezMasto,03/29/1964,1,Hispanic; White,8,J.D.; Gonzaga University School of Law; 1990,2,https://www.cortezmasto.senate.gov/,https://bioguide.congress.gov/search/bio/C001113,,
|
||||
"Cotton, Tom",33,Arkansas,AR,0,2,0.876390364042756,01/06/2015,12/31/2022,7.98904109589041,0,117,66.5,33.5,2020,https://twitter.com/SenTomCotton,SenTomCotton,https://twitter.com/TomCottonAR,TomCottonAR,05/13/1977,0,White,8,J.D.; Harvard University; 2002,2,https://www.cotton.senate.gov/,https://bioguide.congress.gov/search/bio/C001095,,
|
||||
"Cramer, Kevin",34,North Dakota,ND,0,1,0.910896298032277,01/03/2019,12/31/2022,3.99452054794521,0,117,55.5,44.5,2018,https://twitter.com/SenKevinCramer,SenKevinCramer,https://twitter.com/kevincramer,kevincramer,01/21/1961,0,White,7,M.A.; Management; University o fMary; 2003,0,https://www.cramer.senate.gov/,https://bioguide.congress.gov/search/bio/C001096,,
|
||||
"Crapo, Michael",35,Idaho,ID,0,3,0.823331951918519,01/06/1999,12/31/2022,24,0,117,66.1,27.8,2016,https://twitter.com/MikeCrapo,MikeCrapo,N/A,N/A,05/20/1951,0,White,8,J.D.; Harvard University; 1977,2,https://www.crapo.senate.gov/,https://bioguide.congress.gov/search/bio/C000880,,
|
||||
"Cruz, Ted",36,Texas,TX,0,1,0.944056385174951,01/03/2013,12/31/2022,9.9972602739726,0,117,50.9,48.3,2018,https://twitter.com/SenTedCruz,SenTedCruz,https://twitter.com/tedcruz,tedcruz,12/22/1970,0,Hispanic; White,8,J.D.; Harvard University; 1995,2,https://www.cruz.senate.gov/,https://bioguide.congress.gov/search/bio/C001098,,
|
||||
"Daines, Steve",37,Montana,MT,0,2,0.859322244752884,01/06/2015,12/31/2022,7.98904109589041,0,117,55,45,2020,https://twitter.com/SteveDaines,SteveDaines,N/A,N/A,08/20/1962,0,White,6,B.S.; Chemical Engineering; Montana State University; 1984,1,https://www.daines.senate.gov/,https://bioguide.congress.gov/search/bio/D000618,,
|
||||
"Duckworth, Tammy",38,Illinois,IL,1,3,0.0944404184553066,01/03/2017,12/31/2022,5.99452054794521,0,117,54.4,40.2,2016,https://twitter.com/SenDuckworth,SenDuckworth,https://twitter.com/tammyduckworth,tammyduckworth,03/12/1968,1,Asian; White,8,PhD in human services; Capella University School of Public Service Leadership; 2015,3,https://www.duckworth.senate.gov/,https://bioguide.congress.gov/search/bio/D000622,,
|
||||
"Durbin, Richard J.",39,Illinois,IL,1,2,0.0855733771029607,01/07/1997,12/31/2022,25.9972602739726,0,117,54.9,38.9,2020,https://twitter.com/SenatorDurbin,SenatorDurbin,https://twitter.com/DickDurbin,DickDurbin,11/21/1944,0,White,8,J.D.; Georgetown University; 1969,2,https://www.durbin.senate.gov/,https://bioguide.congress.gov/search/bio/D000563,,
|
||||
"Ernst, Joni",40,Iowa,IA,0,2,0.826265400967212,01/06/2015,12/31/2022,7.98904109589041,0,117,51.8,45.2,2020,https://twitter.com/SenJoniErnst,SenJoniErnst,https://twitter.com/joniernst,joniernst,07/01/1970,1,White,7,M.P.A.; Columbus State University; 1995,3,https://www.ernst.senate.gov/,https://bioguide.congress.gov/search/bio/E000295,,
|
||||
"Feinstein, Dianne",41,California,CA,1,1,0.150865658191444,11/10/1992,12/31/2022,30.158904109589,0,117,54.2,45.8,2018,https://twitter.com/SenFeinstein,SenFeinstein,https://twitter.com/DianneFeinstein,DianneFeinstein,06/22/1933,1,White,6,B.A.; History; Stanford University; 1955,0,https://www.feinstein.senate.gov/public/,https://bioguide.congress.gov/search/bio/F000062,,
|
||||
"Fischer, Debra",42,Nebraska,NE,0,1,0.688576408222131,01/03/2013,12/31/2022,9.9972602739726,0,117,57.7,38.6,2018,https://twitter.com/SenatorFischer,SenatorFischer,N/A,N/A,03/01/1951,1,White,6,B.S.; Education; University of Nebraska; 1988,0,https://www.fischer.senate.gov/,https://bioguide.congress.gov/search/bio/F000463,,
|
||||
"Gillibrand, Kirsten E.",43,New York,NY,1,1,0.12072202063417,01/27/2009,12/31/2022,13.9342465753425,0,117,67,33,2018,https://twitter.com/SenGillibrand,SenGillibrand,https://twitter.com/gillibrandny,gillibrandny,12/09/1966,1,White,8,J.D.; University of California; 1991,2,https://www.gillibrand.senate.gov/,https://bioguide.congress.gov/search/bio/G000555,,
|
||||
"Graham, Lindsey",44,South Carolina,SC,0,2,0.619070797359753,01/07/2003,12/31/2022,19.9945205479452,0,117,54.5,44.2,2020,https://twitter.com/LindseyGrahamSC,LindseyGrahamSC,https://twitter.com/grahamblog,grahamblog,07/09/1955,0,White,8,J.D.; University of South Carolina; 1981,2,https://www.lgraham.senate.gov/,https://bioguide.congress.gov/search/bio/G000359 ,,
|
||||
"Grassley, Chuck",45,Iowa,IA,0,3,0.670073592619545,01/05/1981,12/31/2022,42.013698630137,0,117,60.2,35.7,2016,https://twitter.com/ChuckGrassley,ChuckGrassley,N/A,N/A,09/17/1933,0,White,7,M.A.; Political Science; University of Northern Iowa; 1956,0,https://www.grassley.senate.gov/,https://bioguide.congress.gov/search/bio/G000386,,
|
||||
"Hagerty, Bill",46,Tennessee,TN,0,2,0.857410027434407,01/03/2021,12/31/2022,1.99178082191781,0,117,62.2,35.2,2020,https://twitter.com/SenatorHagerty,SenatorHagerty,https://twitter.com/billhagertytn,billhagertytn,08/14/1959,0,White,8,J.D.; Vanderbilt Law School; 1984,0,https://www.hagerty.senate.gov/,https://bioguide.congress.gov/search/bio/H000601,,
|
||||
"Hassan, Margaret Wood",47,New Hampshire,NH,1,3,0.43611907238278,01/03/2017,12/31/2022,5.99452054794521,0,117,48,47.9,2016,https://twitter.com/SenatorHassan,SenatorHassan,https://twitter.com/Maggie_Hassan,Maggie_Hassan,02/27/1958,1,White,8,J.D.; Northeastern University School of law; 1985,11,https://www.hassan.senate.gov/,https://bioguide.congress.gov/search/bio/H001076,,
|
||||
"Hawley, Josh",48,Missouri,MO,0,1,0.864366195602263,01/03/2019,12/31/2022,3.99452054794521,0,117,51.4,45.6,2018,https://twitter.com/HawleyMO,HawleyMO,N/A,N/A,12/31/1979,0,White,8,J.D.; Yale Law School; 2006,2,https://www.hawley.senate.gov/,https://bioguide.congress.gov/search/bio/H001089,,
|
||||
"Heinrich, Martin",49,New Mexico,NM,1,1,0.2007037353465,01/03/2013,12/31/2022,9.9972602739726,0,117,54.1,30.5,2018,https://twitter.com/MartinHeinrich,MartinHeinrich,N/A,N/A,10/17/1971,0,White,6,B.S.; Mechanical Engineering; University of Missouri; 1995,12,https://www.heinrich.senate.gov/,https://bioguide.congress.gov/search/bio/H001046,,
|
||||
"Hickenlooper, John W.",50,Colorado,CO,1,2,0.335030323955882,01/03/2021,12/31/2022,1.99178082191781,0,117,53.5,44.2,2020,https://twitter.com/SenatorHick,SenatorHick,https://twitter.com/hickenlooper,hickenlooper,02/07/1952,0,White,7,M.A.; Geology; Wesleyan University; 1980,0,https://www.hickenlooper.senate.gov/,https://bioguide.congress.gov/search/bio/H000273,,
|
||||
"Hirono, Mazie K.",51,Hawaii,HI,1,1,0.0715447123166643,01/03/2013,12/31/2022,9.9972602739726,0,117,71.2,28.8,2018,https://twitter.com/maziehirono,maziehirono,https://twitter.com/mazieforhawaii,mazieforhawaii,11/03/1947,1,Asian,8,J.D.; Georgetown University; 1978,0,https://www.hirono.senate.gov/,https://bioguide.congress.gov/search/bio/H001042,,
|
||||
"Hoeven, John",52,North Dakota,ND,0,3,0.815683863264003,01/05/2011,12/31/2022,11.9945205479452,0,117,78.6,17,2016,https://twitter.com/SenJohnHoeven,SenJohnHoeven,N/A,N/A,03/13/1957,0,White,7,M.B.A.; Northwestern University; 1981,12,https://www.hoeven.senate.gov/,https://bioguide.congress.gov/search/bio/H001061,,
|
||||
"Hyde-Smith, Cindy",53,Mississippi,MS,0,2,0.868059764299163,04/09/2018,12/31/2022,4.73150684931507,0,117,54.1,44.1,2020,https://twitter.com/SenHydeSmith,SenHydeSmith,https://twitter.com/cindyhydesmith,cindyhydesmith,05/10/1959,1,White,6,"B.A.; Criminal justice, political science; University of Southern Mississippi; 1981",0,https://www.hydesmith.senate.gov/,https://bioguide.congress.gov/search/bio/H001079 ,,
|
||||
"Inhofe, James",54,Oklahoma,OK,0,2,0.880238318204784,11/17/1994,12/31/2022,28.1397260273973,1,117,62.9,32.8,2020,https://twitter.com/JimInhofe,JimInhofe,N/A,N/A,11/17/1934,0,White,6,B.A.; Economics; University of Tulsa; 1973,0,N/A,https://bioguide.congress.gov/search/bio/I000024 ,,
|
||||
"Johnson, Ron",55,Wisconsin,WI,0,3,0.743401705863958,01/05/2011,12/31/2022,11.9945205479452,0,117,50.2,46.8,2016,https://twitter.com/SenRonJohnson,SenRonJohnson,https://twitter.com/ronjohnsonwi,ronjohnsonwi,04/08/1955,0,White,6,B.S.; Business and Accounting; University of Minnesota; 1977,4,https://www.ronjohnson.senate.gov/,https://bioguide.congress.gov/search/bio/J000293,,
|
||||
"Kaine, Tim",56,Virginia,VA,1,1,0.203600708089391,01/03/2013,12/31/2022,9.9972602739726,0,117,57.1,41.1,2018,https://twitter.com/timkaine,timkaine,N/A,N/A,02/26/1958,0,White,8,J.D.; Harvard University; 1983,11,https://www.kaine.senate.gov/,https://bioguide.congress.gov/search/bio/K000384,,
|
||||
"Kelly, Mark",57,Arizona,AZ,1,3,0.399793347847799,12/02/2020,12/31/2022,2.07945205479452,0,117,51.2,48.8,2020,https://twitter.com/SenMarkKelly,SenMarkKelly,https://twitter.com/CaptMarkKelly,CaptMarkKelly,02/21/1964,0,White,7,M.S.; Aeronautical Engineering; U.S. Naval Postgraduate School,3,https://www.kelly.senate.gov/,https://bioguide.congress.gov/search/bio/K000377,,
|
||||
"Kennedy, John Neely",58,Louisiana,LA,0,3,0.785684351248518,01/03/2017,12/31/2022,5.99452054794521,0,117,60.7,39.3,2016,https://twitter.com/SenJohnKennedy,SenJohnKennedy,https://twitter.com/JohnKennedyLA,JohnKennedyLA,11/21/1951,0,White,8,J.D.; University of Virginia School of LAw; 1977,11,https://www.kennedy.senate.gov/,https://bioguide.congress.gov/search/bio/K000393,,
|
||||
"King, Angus S., Jr.",59,Maine,ME,2,1,0.346033257048853,01/03/2013,12/31/2022,9.9972602739726,0,117,54.3,35.2,2018,https://twitter.com/SenAngusKing,SenAngusKing,N/A,N/A,03/31/1944,0,White,8,J.D.; University of Virginia; 1969,2,https://www.king.senate.gov/,https://bioguide.congress.gov/search/bio/K000383 ,,
|
||||
"Klobuchar, Amy",60,Minnesota,MN,1,1,0.130504324943533,01/04/2007,12/31/2022,16,0,117,60.3,36.2,2018,https://twitter.com/SenAmyKlobuchar,SenAmyKlobuchar,https://twitter.com/amyklobuchar,amyklobuchar,05/25/1960,1,White,8,"J.D.; University of Chicago, 1985",2,https://www.klobuchar.senate.gov/,https://bioguide.congress.gov/search/bio/K000367 ,,
|
||||
"Lankford, James",61,Oklahoma,OK,0,3,0.89992933687588,01/03/2015,12/31/2022,7.9972602739726,0,117,67.7,24.6,2016,https://twitter.com/SenatorLankford,SenatorLankford,https://twitter.com/jameslankford,jameslankford,03/04/1968,0,White,7,M.Div.; Southwestern Theological Baptist Seminary; 1994,5,https://www.lankford.senate.gov/,https://bioguide.congress.gov/search/bio/L000575,,
|
||||
"Leahy, Patrick",62,Vermont,VT,1,3,0.144121081911654,01/14/1975,12/31/2022,47.9945205479452,1,117,61.3,33,2016,https://twitter.com/SenatorLeahy,SenatorLeahy,N/A,N/A,03/31/1940,0,White,8,J.D.; Georgetown University; 1964,2,N/A,https://bioguide.congress.gov/search/bio/L000174,,
|
||||
"Lee, Mike",63,Utah,UT,0,3,0.753748787807473,01/05/2011,12/31/2022,11.9945205479452,0,117,68,27.4,2016,https://twitter.com/SenMikeLee,SenMikeLee,https://twitter.com/BasedMikeLee,BasedMikeLee,06/04/1971,0,White,8,J.D.; Brigham Young university; 1997,2,https://www.lee.senate.gov/,https://bioguide.congress.gov/search/bio/L000577,,
|
||||
"Luj<EFBFBD>n, Ben Ray",64,New Mexico,NM,1,2,0.174860888138848,01/03/2021,12/31/2022,1.99178082191781,0,117,51.7,45.6,2020,https://twitter.com/SenatorLujan,SenatorLujan,https://twitter.com/benraylujan,benraylujan,06/07/1972,0,Hispanic,6,B.B.A.; New Mexico Highlands University; 2007,0,https://www.lujan.senate.gov/,https://bioguide.congress.gov/search/bio/L000570 ,,
|
||||
"Lummis, Cynthia M.",65,Wyoming,WY,0,2,0.893292958108508,01/03/2021,12/31/2022,1.99178082191781,0,117,73.1,26.9,2020,https://twitter.com/SenLummis,SenLummis,https://twitter.com/CynthiaMLummis,CynthiaMLummis,09/10/1954,1,White,8,"J.D.; University of Wyoming College of Law, Laramie, Wyo.; 1985",11,https://www.lummis.senate.gov/,https://bioguide.congress.gov/search/bio/L000571 ,,
|
||||
"Manchin, Joe, III",66,West Virginia,WV,1,1,0.446686774398077,11/15/2010,12/31/2022,12.1342465753425,0,117,49.6,46.3,2018,https://twitter.com/Sen_JoeManchin,Sen_JoeManchin,https://twitter.com/JoeManchinWV,JoeManchinWV,08/24/1947,0,White,6,B.A.; Business Administration; West Virginia University; 1970,12,https://www.manchin.senate.gov/,https://bioguide.congress.gov/search/bio/M001183 ,,
|
||||
"Markey, Edward J.",67,Massachusetts,MA,1,2,0.0139659683705929,07/16/2013,12/31/2022,9.46575342465753,0,117,66.2,33,2020,https://twitter.com/SenMarkey,SenMarkey,https://twitter.com/edmarkey,edmarkey,07/11/1946,0,White,8,J.D.; Boston College Law School; 1972,11,https://www.markey.senate.gov/,https://bioguide.congress.gov/search/bio/M000133,,
|
||||
"Marshall, Roger",68,Kansas,KS,0,2,0.882124792228652,01/03/2021,12/31/2022,1.99178082191781,0,117,53.2,41.8,2020,https://twitter.com/SenatorMarshall,SenatorMarshall,https://twitter.com/RogerMarshallMD,RogerMarshallMD,08/09/1960,0,White,7,M.D.; University of Kansas School of Medicine; 1987,6,https://www.marshall.senate.gov/,https://bioguide.congress.gov/search/bio/M001198,,
|
||||
"McConnell, Mitch",69,Kentucky,KY,0,2,0.599687533584357,01/03/1985,12/31/2022,38.0164383561644,0,117,57.8,38.2,2020,https://twitter.com/LeaderMcConnell,LeaderMcConnell,N/A,N/A,02/20/1942,0,White,8,J.D.; Kentucky Law School; 1967,11,https://www.mcconnell.senate.gov/,https://bioguide.congress.gov/search/bio/M000355,,
|
||||
"Menendez, Robert",70,New Jersey,NJ,1,1,0.191515157461704,01/18/2006,12/31/2022,16.9616438356164,0,117,54,42.8,2018,https://twitter.com/SenatorMenendez,SenatorMenendez,N/A,N/A,01/01/1954,0,Hispanic,8,J.D.; Rutgers university of Law; 1979,11,https://www.menendez.senate.gov/,https://bioguide.congress.gov/search/bio/M000639,,
|
||||
"Merkley, Jeff",71,Oregon,OR,1,2,0.0355414098997263,01/06/2009,12/31/2022,13.9917808219178,0,117,56.9,39.3,2020,https://twitter.com/SenJeffMerkley,SenJeffMerkley,https://twitter.com/jeffmerkley,jeffmerkley,10/24/1956,0,White,7,M.P.A.; Princeton University; 1982,0,https://www.merkley.senate.gov/,https://bioguide.congress.gov/search/bio/M001176,,
|
||||
"Moran, Jerry",72,Kansas,KS,0,3,0.716270292467902,01/05/2011,12/31/2022,11.9945205479452,0,117,62.4,32.1,2016,https://twitter.com/JerryMoran,JerryMoran,N/A,N/A,05/29/1954,0,White,8,J.D.; Kansas University School of Law; 1981,11,https://www.moran.senate.gov/public/,https://bioguide.congress.gov/search/bio/M000934 ,,
|
||||
"Murkowski, Lisa",73,Alaska,AK,0,3,0.473296745648617,12/20/2002,12/31/2022,20.0438356164384,0,117,44.3,29.5,2016,https://twitter.com/lisamurkowski,lisamurkowski,https://twitter.com/lisaforsenate,lisaforsenate,05/22/1957,1,White,8,J.D.; Willamette College of Law; 1985,2,https://www.murkowski.senate.gov/,https://bioguide.congress.gov/search/bio/M001153,,
|
||||
"Murphy, Christopher",74,Connecticut,CT,1,1,0.152635018959264,01/03/2013,12/31/2022,9.9972602739726,0,117,59.5,39.4,2018,https://twitter.com/ChrisMurphyCT,ChrisMurphyCT,N/A,N/A,08/03/1973,0,White,8,J.D.; University of Connecticut; 2002,11,https://www.murphy.senate.gov/,https://bioguide.congress.gov/search/bio/M001169,,
|
||||
"Murray, Patty",75,Washington,WA,1,3,0.142703588817088,01/05/1993,12/31/2022,30.0054794520548,0,117,59.1,40.9,2016,https://twitter.com/PattyMurray,PattyMurray,https://twitter.com/murraycampaign,murraycampaign,10/11/1950,1,White,6,B.A.; Physical Education; Washington State University; 1972,5,https://www.murray.senate.gov/,https://bioguide.congress.gov/search/bio/M001111,,
|
||||
"Ossoff, Jon",76,Georgia,GA,1,2,0.303405364928085,01/20/2021,12/31/2022,1.94520547945205,0,117,50.6,49.4,2020,https://twitter.com/SenOssoff,SenOssoff,https://twitter.com/ossoff,ossoff,02/16/1987,0,White,7,M.S.; International Politicla Economy; London School of Economics; 2013,7,https://www.ossoff.senate.gov/,https://bioguide.congress.gov/search/bio/O000174,,
|
||||
"Padilla, Alex",77,California,CA,1,3,0.0200324383981554,01/20/2021,12/31/2022,1.94520547945205,0,117,N/A,N/A,*,https://twitter.com/SenAlexPadilla,SenAlexPadilla,https://twitter.com/AlexPadilla4CA,AlexPadilla4CA,03/22/1973,0,Hispanic,6,B.S.; Mechanical Engineering; MIT; 1994,9,https://www.padilla.senate.gov/,https://bioguide.congress.gov/search/bio/P000145,appointed in 2020 to replace Kamala Harris ,
|
||||
"Paul, Rand",78,Kentucky,KY,0,3,0.684883322748808,01/05/2011,12/31/2022,11.9945205479452,0,117,57.3,42.7,2016,https://twitter.com/senrandpaul,senrandpaul,https://twitter.com/RandPaul,RandPaul,01/07/1963,0,White,7,M.D.; Duke University; 1988,6,https://www.paul.senate.gov/,https://bioguide.congress.gov/search/bio/P000603,,
|
||||
"Peters, Gary C.",79,Michigan,MI,1,2,0.355796587683312,01/06/2015,12/31/2022,7.98904109589041,0,117,49.9,48.2,2020,https://twitter.com/SenGaryPeters,SenGaryPeters,https://twitter.com/garypeters,garypeters,12/01/1958,0,White,8,J.D.; Wayne State University; 1989,2,https://www.peters.senate.gov/,https://bioguide.congress.gov/search/bio/P000595,,
|
||||
"Portman, Robert",80,Ohio,OH,0,3,0.548120690430407,01/05/2011,12/31/2022,11.9945205479452,1,117,58.3,36.9,2016,https://twitter.com/senrobportman,senrobportman,N/A,N/A,12/19/1955,0,White,8,J.D.; University of Michigan; 1985,2,N/A,https://bioguide.congress.gov/search/bio/P000449,,
|
||||
"Reed, John F.",81,Rhode Island,RI,1,2,0.145861826443275,01/07/1997,12/31/2022,25.9972602739726,0,117,66.6,33.4,2020,https://twitter.com/SenJackReed,SenJackReed,N/A,N/A,11/12/1949,0,White,8,J.D.; Harvard University; 1982,2,https://www.reed.senate.gov/,https://bioguide.congress.gov/search/bio/R000122,,
|
||||
"Risch, James E.",82,Idaho,ID,0,2,0.82910906209038,01/06/2009,12/31/2022,13.9917808219178,0,117,62.6,33.2,2020,https://twitter.com/SenatorRisch,SenatorRisch,N/A,N/A,05/03/1943,0,White,8,J.D.; University of Idaho; 1968,2,https://www.risch.senate.gov/,https://bioguide.congress.gov/search/bio/R000584,,
|
||||
"Romney, Mitt",83,Utah,UT,0,1,0.596688837978771,01/03/2019,12/31/2022,3.99452054794521,0,117,62.6,30.9,2018,https://twitter.com/SenatorRomney,SenatorRomney,https://twitter.com/mittromney,mittromney,03/12/1947,0,White,7,M.B.A.; Harvard Business School; 1975,1,https://www.romney.senate.gov/,https://bioguide.congress.gov/search/bio/R000615,,
|
||||
"Rosen, Jacky",84,Nevada,NV,1,1,0.308548351377894,01/03/2019,12/31/2022,3.99452054794521,0,117,50.4,45.4,2018,https://twitter.com/SenJackyRosen,SenJackyRosen,https://twitter.com/RosenforNevada,RosenforNevada,08/02/1957,1,White,6,B.A.; Psychology; University of Minnesota; 1979,1,https://www.rosen.senate.gov/,https://bioguide.congress.gov/search/bio/R000608,,
|
||||
"Rounds, Mike",85,South Dakota,SD,0,2,0.784008560585577,01/06/2015,12/31/2022,7.98904109589041,0,117,65.7,34.3,2020,https://twitter.com/SenatorRounds,SenatorRounds,N/A,N/A,10/24/1954,0,White,6,B.S.; Political Science; South Dakota State University; 1977,1,https://www.rounds.senate.gov/,https://bioguide.congress.gov/search/bio/R000605,,
|
||||
"Rubio, Marco",86,Florida,FL,0,3,0.831181764071725,01/05/2011,12/31/2022,11.9945205479452,0,117,52,44.3,2016,https://twitter.com/senmarcorubio,senmarcorubio,https://twitter.com/marcorubio,marcorubio,05/28/1971,0,Hispanic,8,J.D.; University of Miami; 1996,2,https://www.rubio.senate.gov/,https://bioguide.congress.gov/search/bio/R000595,,
|
||||
"Sanders, Bernard",87,Vermont,VT,2,1,0,01/04/2007,12/31/2022,16,0,117,67.4,27.5,2018,https://twitter.com/SenSanders,SenSanders,https://twitter.com/BernieSanders,BernieSanders,09/08/1941,0,White,6,B.A.; Political Science; University of Chicago; 1964,0,https://www.sanders.senate.gov/,https://bioguide.congress.gov/search/bio/S000033,,
|
||||
"Sasse, Benjamin",88,Nebraska,NE,0,2,0.684229649213868,01/06/2015,12/31/2022,7.98904109589041,1,117,62.7,24.4,2020,https://twitter.com/sensasse,sensasse,https://twitter.com/BenSasse,BenSasse,02/22/1972,0,White,8,PhD in History; Yale University; 2004,5,N/A,https://bioguide.congress.gov/search/bio/S001197,,
|
||||
"Schatz, Brian",89,Hawaii ,HI,1,3,0.213250458593456,12/27/2012,12/31/2022,10.0164383561644,0,117,73.6,22.2,2016,https://twitter.com/brianschatz,brianschatz,https://twitter.com/SenBrianSchatz,SenBrianSchatz,10/20/1972,0,White,6,B.A.; Philosophy; Pomona College; 1994,5,https://www.schatz.senate.gov/,https://bioguide.congress.gov/search/bio/S001194,,
|
||||
"Schumer, Charles E.",90,New York,NY,1,3,0.239789022209428,01/06/1999,12/31/2022,24,0,117,70.4,27.4,2016,https://twitter.com/SenSchumer,SenSchumer,https://twitter.com/chuckschumer,chuckschumer,11/23/1950,0,White,8,J.D.; Harvard University; 1974,2,https://www.schumer.senate.gov/,https://bioguide.congress.gov/search/bio/S000148 ,,
|
||||
"Scott, Rick",91,Florida,FL,0,1,1,01/08/2019,12/31/2022,3.98082191780822,0,117,50.1,49.9,2018,https://twitter.com/SenRickScott,SenRickScott,https://twitter.com/scottforflorida,scottforflorida,12/01/1952,0,White,8,J.D.; Southern Methodist University; 1978,2,https://www.rickscott.senate.gov/,https://bioguide.congress.gov/search/bio/S001217,,
|
||||
"Scott, Tim",92,South Carolina,SC,0,3,0.781356077518849,01/03/2013,12/31/2022,9.9972602739726,0,117,60.6,37,2016,https://twitter.com/SenatorTimScott,SenatorTimScott,https://twitter.com/votetimscott,votetimscott,09/19/1965,0,African-American,6,B.S.; Political Science; Charleston Southern University; 1988 ,1,https://www.scott.senate.gov/,https://bioguide.congress.gov/search/bio/S001184,,
|
||||
"Shaheen, Jeanne",93,New Hampshire,NH,1,2,0.2925665319541,01/06/2009,12/31/2022,13.9917808219178,0,117,56.6,41,2020,https://twitter.com/SenatorShaheen,SenatorShaheen,https://twitter.com/JeanneShaheen,JeanneShaheen,01/28/1947,1,White,7,M.S.S.; University of Mississippi; 1973,5,https://www.shaheen.senate.gov/,https://bioguide.congress.gov/search/bio/S001181,,
|
||||
"Shelby, Richard",94,Alabama,AL,0,3,0.577739000839365,01/06/1987,12/31/2022,36.0082191780822,1,117,64.2,35.8,2016,https://twitter.com/SenShelby,SenShelby,N/A,N/A,05/06/1934,0,White,6,LL.B.; University of Alabama; 1963,2,N/A,https://bioguide.congress.gov/search/bio/S000320,,
|
||||
"Sinema, Kyrsten",95,Arizona,AZ,2,1,0.500967034663567,01/03/2019,12/31/2022,3.99452054794521,0,117,50,47.6,2018,https://twitter.com/SenatorSinema,SenatorSinema,https://twitter.com/kyrstensinema,kyrstensinema,07/12/1976,1,White,8,PhD in Justice Studies; Arizona State University; 2012,2,https://www.sinema.senate.gov/,https://bioguide.congress.gov/search/bio/S001191,,
|
||||
"Smith, Tina",96,Minnesota,MN,1,2,0.0756533259297989,01/03/2018,12/31/2022,4.99452054794521,0,117,48.8,43.5,2020,https://twitter.com/SenTinaSmith,SenTinaSmith,https://twitter.com/TinaSmithMN,TinaSmithMN,03/04/1958,1,White,7,M.B.A. Dartmouth College; 1984,1,https://www.smith.senate.gov/,https://bioguide.congress.gov/search/bio/S001203,,
|
||||
"Stabenow, Debbie",97,Michigan,MI,1,1,0.221949395648287,01/03/2001,12/31/2022,22.0054794520548,0,117,52.3,45.8,2018,https://twitter.com/SenStabenow,SenStabenow,https://twitter.com/stabenow,stabenow,04/29/1950,1,White,7,M.S.W.; Michigan State University; 1975,5,https://www.stabenow.senate.gov/,https://bioguide.congress.gov/search/bio/S000770,,
|
||||
"Sullivan, Dan",98,Alaska,AK,0,2,0.652100683642255,01/06/2015,12/31/2022,7.98904109589041,0,117,53.9,41.2,2020,https://twitter.com/SenDanSullivan,SenDanSullivan,N/A,N/A,11/13/1964,0,White,8,J.D.; Georgetown University; 1993,2,https://www.sullivan.senate.gov/,https://bioguide.congress.gov/search/bio/S001198,,
|
||||
"Tester, Jon",99,Montana,MT,1,1,0.377646486433112,01/04/2007,12/31/2022,16,0,117,50.3,46.8,2018,https://twitter.com/SenatorTester,SenatorTester,https://twitter.com/jontester,jontester,08/21/1956,0,White,6,B.A.; Music; University of Providence; 1978,10,https://www.tester.senate.gov/,https://bioguide.congress.gov/search/bio/T000464 ,,
|
||||
"Thune, John",100,South Dakota,SD,0,3,0.795060855902239,01/04/2005,12/31/2022,18,0,117,71.8,28.2,2016,https://twitter.com/SenJohnThune,SenJohnThune,https://twitter.com/johnthune,johnthune,01/07/1961,0,White,7,M.B.A.; University of South Dakota; 1984,1,https://www.thune.senate.gov/,https://bioguide.congress.gov/search/bio/T000250 ,,
|
||||
"Tillis, Thom",101,North Carolina,NC,0,2,0.819146177750934,01/06/2015,12/31/2022,7.98904109589041,0,117,48.7,46.9,2020,https://twitter.com/SenThomTillis,SenThomTillis,https://twitter.com/ThomTillis,ThomTillis,08/30/1960,0,White,6,B.S.; Technology Management; University of Maryland; 1996,1,https://www.tillis.senate.gov/,https://bioguide.congress.gov/search/bio/T000476 ,,
|
||||
"Toomey, Patrick",102,Pennsylvania,PA,0,3,0.607637714921737,01/05/2011,12/31/2022,11.9945205479452,1,117,48.9,47.2,2016,https://twitter.com/SenToomey,SenToomey,https://twitter.com/pattoomey,pattoomey,11/17/1961,0,White,6,A.B.; Government; Harvard College; 1984,1,N/A,https://bioguide.congress.gov/search/bio/T000461 ,,
|
||||
"Tuberville, Tommy",103,Alabama,AL,0,2,0.808701355452043,01/03/2021,12/31/2022,1.99178082191781,0,117,60.1,39.7,2020,https://twitter.com/SenTuberville,SenTuberville,https://twitter.com/TTuberville,TTuberville,09/18/1954,0,White,6,"B.S., physical education, Southern Arkansas University, 1976",5,https://www.tuberville.senate.gov/,https://bioguide.congress.gov/search/bio/T000278 ,,
|
||||
"Van Hollen, Chris",104,Maryland,MD,1,3,0.117646768842011,01/03/2017,12/31/2022,5.99452054794521,0,117,60.4,36.4,2016,https://twitter.com/ChrisVanHollen,ChrisVanHollen,N/A,N/A,01/10/1959,0,White,8,J.D.; Georgetown university; 1990,2,https://www.vanhollen.senate.gov/,https://bioguide.congress.gov/search/bio/V000128,,
|
||||
"Warner, Mark R.",105,Virginia,VA,1,2,0.33022168507113,01/06/2009,12/31/2022,13.9917808219178,0,117,56,44,2020,https://twitter.com/SenatorWarner,SenatorWarner,https://twitter.com/MarkWarner,MarkWarner,12/15/1954,0,White,8,J.D.; Harvard Law School; 1980,1,https://www.warner.senate.gov/,https://bioguide.congress.gov/search/bio/W000805 ,,
|
||||
"Warnock, Raphael G.",106,Georgia,GA,1,3,0.464158242867696,01/20/2021,12/31/2022,1.94520547945205,0,117,51,49,2020,https://twitter.com/SenatorWarnock,SenatorWarnock,https://twitter.com/ReverendWarnock,ReverendWarnock,07/23/1969,0,African-American,8,PhD in Philosophy; Union Theological Seminary; ,8,https://www.warnock.senate.gov/,https://bioguide.congress.gov/search/bio/W000790,,
|
||||
"Warren, Elizabeth",107,Massachusetts,MA,1,1,0.0583875007437665,01/03/2013,12/31/2022,9.9972602739726,0,117,60.4,36.2,2018,https://twitter.com/SenWarren,SenWarren,https://twitter.com/ewarren,ewarren,06/22/1949,1,White,8,J.D.; Rutgers University; 1976,2,https://www.warren.senate.gov/,https://bioguide.congress.gov/search/bio/W000817 ,,
|
||||
"Whitehouse, Sheldon",108,Rhode Island,RI,1,1,0.124737669119195,01/04/2007,12/31/2022,16,0,117,61.6,38.4,2018,https://twitter.com/SenWhitehouse,SenWhitehouse,N/A,N/A,10/20/1955,0,White,8,J.D.; University of Virginia; 1982,2,https://www.whitehouse.senate.gov/,https://bioguide.congress.gov/search/bio/W000802,,
|
||||
"Wicker, Roger F.",109,Mississippi,MS,0,1,0.763788502839721,12/31/2007,12/31/2022,15.0109589041096,0,117,58.5,39.5,2018,https://twitter.com/SenatorWicker,SenatorWicker,https://twitter.com/RogerWicker,RogerWicker,07/05/1951,0,White,8,J.D.; University of Mississippi; 1975,2,https://www.wicker.senate.gov/,https://bioguide.congress.gov/search/bio/W000437,,
|
||||
"Wyden, Ron",110,Oregon,OR,1,3,0.0591413132623803,02/05/1996,12/31/2022,26.9205479452055,0,117,56.7,33.6,2016,https://twitter.com/RonWyden,RonWyden,N/A,N/A,05/03/1949,0,White,8,J.D.; University of Oregon; 1974,2,https://www.wyden.senate.gov/,https://bioguide.congress.gov/search/bio/W000779,,
|
||||
"Young, Todd",111,Indiana,IN,0,3,0.677696674158218,01/05/2011,12/31/2022,11.9945205479452,1,117,52.1,42.4,2016,https://twitter.com/SenToddYoung,SenToddYoung,https://twitter.com/ToddYoungIN,ToddYoungIN,08/24/1972,0,White,8,J.D.; Robert H. McKinney; 2006,2,https://www.young.senate.gov/,https://bioguide.congress.gov/search/bio/Y000064,,
|
||||
"Jones, Gordon Douglas",5,Alabama,AL,1,2,0.632885678298333,01/03/2018,01/03/2021,3.0027397260274,1,116,49.9,48.4,2017,https://twitter.com/DougJones,DougJones,N/A,N/A,05/04/1954,0,White,8,"J.D.; Samford University, Cumberland School of Law; 1979",2,N/A,https://bioguide.congress.gov/search/bio/J000300/,special election to replace Jeff Sessions,
|
||||
"Loeffler, Kelly",6,Georgia,GA,0,2,0.904293903291947,01/06/2020,01/20/2021,1.04109589041096,1,116,N/A,N/A,*,https://twitter.com/KLoeffler,KLoeffler,https://twitter.com/senatorloeffler,senatorloeffler,11/27/1970,1,White,7,M.B.A.; Internationla Finance and Marketing; DePaul University Chicago; 1999,1,N/A,https://bioguide.congress.gov/search/bio/L000594,Appointed in 2019 after the resignation of Johnny Isakson but lost the 2020 election,
|
||||
"McSally, Martha",7,Arizona,AZ,0,2,*,01/03/2015,01/03/2019,1,1,116,N/A,N/A,*,https://twitter.com/MarthaMcSallyAZ,MarthaMcSallyAZ,https://twitter.com/marthamcsally,marthamcsally,03/22/1966,1,White,7,M.P.P.; John F. Kennedy School of Government,3,N/A,https://bioguide.congress.gov/search/bio/M001197,(left office Dec 2 2020),appointed in 2018 after death of John McCain but lot 2020 election
|
||||
"Perdue, David",8,Georgia,GA,0,2,0.914979462126755,01/06/2015,01/03/2021,5.9972602739726,1,116,53,45.1,2014,https://twitter.com/DavidPerdueGA,DavidPerdueGA,https://twitter.com/sendavidperdue,sendavidperdue,12/10/1949,0,White,7,M.S.; Georgia Institute of Technology; 1976,1,N/A,https://bioguide.congress.gov/search/bio/P000612,,
|
||||
"Roberts, Charles Patrick",9,Kansas,KS,0,2,0.822995787870405,01/07/1997,01/03/2021,24.0054794520548,1,116,53.3,42.5,2014,https://twitter.com/SenPatRoberts,SenPatRoberts,https://twitter.com/PatRoberts,PatRoberts,04/20/1936,0,White,6,"B.A.; Kansas State university, Manhattan; 1958",7,N/A,https://bioguide.congress.gov/search/bio/R000307,,
|
||||
"Udall, Tom",10,New Mexico,NM,1,2,0.259828450248573,01/06/2009,01/03/2021,12,1,116,55.4,44.6,2014,https://twitter.com/SenatorTomUdall,SenatorTomUdall,https://twitter.com/tomudall,tomudall,05/18/1948,0,White,8,"J.D.; University of New Mexico School of Law, Albuquerque, N.M.; 1977",2,N/A,https://bioguide.congress.gov/search/bio/U000039,,
|
||||
"Baldwin, Tammy",11,Wisconsin,WI,1,1,0.176999238019796,01/03/2013,12/31/2022,9.9972602739726,0,117,55.4,44.6,2018,https://twitter.com/SenatorBaldwin,SenatorBaldwin,https://twitter.com/tammybaldwin,tammybaldwin,02/11/1962,1,White,8,"J.D.; University of Wisconsin, Madison; 1989",2,https://www.baldwin.senate.gov/,https://bioguide.congress.gov/search/bio/B001230,,
|
||||
"Barrasso, John",12,Wyoming,WY,0,1,0.817902617377421,06/22/2007,12/31/2022,15.5369863013699,0,117,67.1,30.1,2018,https://twitter.com/SenJohnBarrasso,SenJohnBarrasso,https://twitter.com/barrassoforwyo,barrassoforwyo,07/21/1952,0,White,7,M.D.; Georgetown University School of Medicine; 1978,6,https://www.barrasso.senate.gov/,https://bioguide.congress.gov/search/bio/B001261,,
|
||||
"Bennet, Michael F.",13,Colorado,CO,1,3,0.248044568735702,01/21/2009,12/31/2022,13.9506849315069,0,117,49.1,45.4,2016,https://twitter.com/SenatorBennet,SenatorBennet,https://twitter.com/michaelbennet,michaelbennet,11/28/1964,0,White,8,J.D.; Yale Law School; 1993,2,https://www.bennet.senate.gov/,https://bioguide.congress.gov/search/bio/B001267,,
|
||||
"Blackburn, Marsha",14,Tennessee,TN,0,1,0.93228239890635,01/03/2019,12/31/2022,3.99452054794521,0,117,54.7,43.9,2018,https://twitter.com/MarshaBlackburn,MarshaBlackburn,N/A,N/A,06/06/1952,1,White,6,"B.S.; Home Economics; Mississippi State University, Starkville; 1973",1,https://www.blackburn.senate.gov/,https://bioguide.congress.gov/search/bio/B001243,,
|
||||
"Blumenthal, Richard",15,Connecticut,CT,1,3,0.0310655954121906,01/03/2010,12/31/2022,13,0,117,62.9,34.9,2016,https://twitter.com/SenBlumenthal,SenBlumenthal,N/A,N/A,02/13/1946,0,White,8,J.D.; Yale University; 1973,2,https://www.blumenthal.senate.gov/,https://bioguide.congress.gov/search/bio/B001277,,
|
||||
"Blunt, Roy",16,Missouri,MO,0,3,0.584409139223541,01/03/2011,12/31/2022,12,1,117,49.4,46.2,2016,https://twitter.com/RoyBlunt,RoyBlunt,N/A,N/A,01/10/1950,0,White,7,"M.A.; Missouri State University ,Springfield; 1972",5,N/A,https://bioguide.congress.gov/search/bio/B000575,,
|
||||
"Booker, Cory A.",17,New Jersey,NJ,1,2,0.0455802980872292,10/31/2013,12/31/2022,12,0,117,57.2,40.9,2020,https://twitter.com/senbooker,senbooker,https://twitter.com/CoryBooker,CoryBooker,04/27/1969,0,African-American; Asian-American,8,J.D.; Yale Law School; 1997,2,https://www.booker.senate.gov/,https://bioguide.congress.gov/search/bio/B001288,,
|
||||
"Boozman, John",18,Arkansas,AR,0,3,0.768699282926499,01/05/2011,12/31/2022,11.9945205479452,0,117,59.8,36.2,2016,https://twitter.com/JohnBoozman,JohnBoozman,N/A,N/A,12/10/1950,0,White,6,Southern College of Optometry; 1977,6,https://www.boozman.senate.gov/,https://bioguide.congress.gov/search/bio/B001236,,
|
||||
"Braun, Michael",19,Indiana,IN,0,1,0.98106874319906,01/03/2019,12/31/2022,3.99452054794521,0,117,50.9,45,2018,https://twitter.com/SenatorBraun,SenatorBraun,N/A,N/A,03/24/1954,0,White,7,M.B.A.; Harvard Business School; 1978,1,https://www.braun.senate.gov/,https://bioguide.congress.gov/search/bio/B001310,,
|
||||
"Brown, Sherrod",20,Ohio,OH,1,1,0.0923940264109351,01/04/2007,12/31/2022,16,0,117,53.4,46.6,2018,https://twitter.com/SenSherrodBrown,SenSherrodBrown,https://twitter.com/SherrodBrown,SherrodBrown,11/09/1952,0,White,7,M.a.; Education; Ohio State University; 1981,5,https://www.brown.senate.gov/,https://bioguide.congress.gov/search/bio/B000944,,
|
||||
"Burr, Richard",21,North Carolina,NC,0,3,0.605472891780936,01/03/2001,12/31/2022,22.0054794520548,1,117,51.1,45.3,2016,https://twitter.com/SenatorBurr,SenatorBurr,N/A,N/A,11/30/1955,0,White,6,B.A.; Communications; Wake Forest University; 1978,1,N/A,https://bioguide.congress.gov/search/bio/B001135,,
|
||||
"Cantwell, Maria",22,Washington,WA,1,1,0.216591445478212,01/03/2001,12/31/2022,22.0054794520548,0,117,58.4,41.6,2018,https://twitter.com/SenatorCantwell,SenatorCantwell,N/A,N/A,10/13/1958,1,White,6,B.A.; Public Administration; Miami University of Ohio; 1980,1,https://www.cantwell.senate.gov/,https://bioguide.congress.gov/search/bio/C000127,,
|
||||
"Capito, Shelley Moore",23,West Virginia,WV,0,2,0.61478303011512,01/06/2015,12/31/2022,7.98904109589041,0,117,70.3,27,2020,https://twitter.com/SenCapito,SenCapito,N/A,N/A,11/26/1953,1,White,7,M. Ed.; University of Virginia; 1976,5,https://www.capito.senate.gov/,https://bioguide.congress.gov/search/bio/C001047,,
|
||||
"Cardin, Benjamin L.",24,Maryland,MD,1,1,0.1994990268606,01/04/2007,12/31/2022,16,0,117,64.9,30.3,2018,https://twitter.com/SenatorCardin,SenatorCardin,N/A,N/A,10/05/1943,0,White,8,J.D.; University of Maryland; 1967,2,https://www.cardin.senate.gov/,https://bioguide.congress.gov/search/bio/C000141,,
|
||||
"Carper, Thomas R.",25,Delaware,DE,1,1,0.309479384969288,01/03/2001,12/31/2022,22.0054794520548,0,117,60,37.8,2018,https://twitter.com/SenatorCarper,SenatorCarper,N/A,N/A,01/23/1947,0,White,7,M.B.A.; University of Delaware; 1975,3,https://www.carper.senate.gov/,https://bioguide.congress.gov/search/bio/C000174,,
|
||||
"Casey, Robert P., Jr.",26,Pennsylvania,PA,1,1,0.171897216341815,01/04/2007,12/31/2022,16,0,117,55.7,42.6,2018,https://twitter.com/SenBobCasey,SenBobCasey,https://twitter.com/Bob_Casey,Bob_Casey,04/13/1960,0,White,8,J.D.; Catholic University of America; 1988,2,https://www.casey.senate.gov/,https://bioguide.congress.gov/search/bio/C001070,,
|
||||
"Cassidy, Bill",27,Louisiana,LA,0,2,0.682348710788942,01/06/2015,12/31/2022,7.98904109589041,0,117,59.3,19,2020,https://twitter.com/SenBillCassidy,SenBillCassidy,https://twitter.com/BillCassidy,BillCassidy,09/28/1957,0,White,7,M.D.; Louisiana State University; 1979,6,https://www.cassidy.senate.gov/,https://bioguide.congress.gov/search/bio/C001075,,
|
||||
"Collins, Susan M.",28,Maine,ME,0,2,0.448622425849401,01/07/1997,12/31/2022,25.9972602739726,0,117,51,42.4,2020,https://twitter.com/SenatorCollins,SenatorCollins,N/A,N/A,12/07/1952,1,White,6,Bachelor in Government; St. Lawrence University; 1975,0,https://www.collins.senate.gov/,https://bioguide.congress.gov/search/bio/C001035,,
|
||||
"Coons, Christopher A.",29,Delaware,DE,1,2,0.338422715351401,11/15/2010,12/31/2022,12.1342465753425,0,117,59.4,37.9,2020,https://twitter.com/ChrisCoons,ChrisCoons,N/A,N/A,09/09/1963,0,White,8,J.D.; Yale Law School; 1992,2,https://www.coons.senate.gov/,https://bioguide.congress.gov/search/bio/C001088,,
|
||||
"Cornyn, John",30,Texas,TX,0,2,0.772226738391321,11/30/2002,12/31/2022,20.0986301369863,0,117,53.5,43.9,2020,https://twitter.com/JohnCornyn,JohnCornyn,N/A,N/A,02/02/1952,0,White,8,J.D.; St. Mary’s School of Law; 1977,2,https://www.cornyn.senate.gov/,https://bioguide.congress.gov/search/bio/C001056,,
|
||||
"Cortez Masto, Catherine",31,Nevada,NV,1,3,0.236574567369409,01/03/2017,12/31/2022,5.99452054794521,0,117,47.1,44.7,2016,https://twitter.com/SenCortezMasto,SenCortezMasto,https://twitter.com/CortezMasto,CortezMasto,03/29/1964,1,Hispanic; White,8,J.D.; Gonzaga University School of Law; 1990,2,https://www.cortezmasto.senate.gov/,https://bioguide.congress.gov/search/bio/C001113,,
|
||||
"Cotton, Tom",32,Arkansas,AR,0,2,0.876390364042756,01/06/2015,12/31/2022,7.98904109589041,0,117,66.5,33.5,2020,https://twitter.com/SenTomCotton,SenTomCotton,https://twitter.com/TomCottonAR,TomCottonAR,05/13/1977,0,White,8,J.D.; Harvard University; 2002,2,https://www.cotton.senate.gov/,https://bioguide.congress.gov/search/bio/C001095,,
|
||||
"Cramer, Kevin",33,North Dakota,ND,0,1,0.910896298032277,01/03/2019,12/31/2022,3.99452054794521,0,117,55.5,44.5,2018,https://twitter.com/SenKevinCramer,SenKevinCramer,https://twitter.com/kevincramer,kevincramer,01/21/1961,0,White,7,M.A.; Management; University o fMary; 2003,0,https://www.cramer.senate.gov/,https://bioguide.congress.gov/search/bio/C001096,,
|
||||
"Crapo, Michael",34,Idaho,ID,0,3,0.823331951918519,01/06/1999,12/31/2022,24,0,117,66.1,27.8,2016,https://twitter.com/MikeCrapo,MikeCrapo,N/A,N/A,05/20/1951,0,White,8,J.D.; Harvard University; 1977,2,https://www.crapo.senate.gov/,https://bioguide.congress.gov/search/bio/C000880,,
|
||||
"Cruz, Ted",35,Texas,TX,0,1,0.944056385174951,01/03/2013,12/31/2022,9.9972602739726,0,117,50.9,48.3,2018,https://twitter.com/SenTedCruz,SenTedCruz,https://twitter.com/tedcruz,tedcruz,12/22/1970,0,Hispanic; White,8,J.D.; Harvard University; 1995,2,https://www.cruz.senate.gov/,https://bioguide.congress.gov/search/bio/C001098,,
|
||||
"Daines, Steve",36,Montana,MT,0,2,0.859322244752884,01/06/2015,12/31/2022,7.98904109589041,0,117,55,45,2020,https://twitter.com/SteveDaines,SteveDaines,N/A,N/A,08/20/1962,0,White,6,B.S.; Chemical Engineering; Montana State University; 1984,1,https://www.daines.senate.gov/,https://bioguide.congress.gov/search/bio/D000618,,
|
||||
"Duckworth, Tammy",37,Illinois,IL,1,3,0.0944404184553066,01/03/2017,12/31/2022,5.99452054794521,0,117,54.4,40.2,2016,https://twitter.com/SenDuckworth,SenDuckworth,https://twitter.com/tammyduckworth,tammyduckworth,03/12/1968,1,Asian; White,8,PhD in human services; Capella University School of Public Service Leadership; 2015,3,https://www.duckworth.senate.gov/,https://bioguide.congress.gov/search/bio/D000622,,
|
||||
"Durbin, Richard J.",38,Illinois,IL,1,2,0.0855733771029607,01/07/1997,12/31/2022,25.9972602739726,0,117,54.9,38.9,2020,https://twitter.com/SenatorDurbin,SenatorDurbin,https://twitter.com/DickDurbin,DickDurbin,11/21/1944,0,White,8,J.D.; Georgetown University; 1969,2,https://www.durbin.senate.gov/,https://bioguide.congress.gov/search/bio/D000563,,
|
||||
"Ernst, Joni",39,Iowa,IA,0,2,0.826265400967212,01/06/2015,12/31/2022,7.98904109589041,0,117,51.8,45.2,2020,https://twitter.com/SenJoniErnst,SenJoniErnst,https://twitter.com/joniernst,joniernst,07/01/1970,1,White,7,M.P.A.; Columbus State University; 1995,3,https://www.ernst.senate.gov/,https://bioguide.congress.gov/search/bio/E000295,,
|
||||
"Feinstein, Dianne",40,California,CA,1,1,0.150865658191444,11/10/1992,12/31/2022,30.158904109589,0,117,54.2,45.8,2018,https://twitter.com/SenFeinstein,SenFeinstein,https://twitter.com/DianneFeinstein,DianneFeinstein,06/22/1933,1,White,6,B.A.; History; Stanford University; 1955,0,https://www.feinstein.senate.gov/public/,https://bioguide.congress.gov/search/bio/F000062,,
|
||||
"Fischer, Debra",41,Nebraska,NE,0,1,0.688576408222131,01/03/2013,12/31/2022,9.9972602739726,0,117,57.7,38.6,2018,https://twitter.com/SenatorFischer,SenatorFischer,N/A,N/A,03/01/1951,1,White,6,B.S.; Education; University of Nebraska; 1988,0,https://www.fischer.senate.gov/,https://bioguide.congress.gov/search/bio/F000463,,
|
||||
"Gillibrand, Kirsten E.",42,New York,NY,1,1,0.12072202063417,01/27/2009,12/31/2022,13.9342465753425,0,117,67,33,2018,https://twitter.com/SenGillibrand,SenGillibrand,https://twitter.com/gillibrandny,gillibrandny,12/09/1966,1,White,8,J.D.; University of California; 1991,2,https://www.gillibrand.senate.gov/,https://bioguide.congress.gov/search/bio/G000555,,
|
||||
"Graham, Lindsey",43,South Carolina,SC,0,2,0.619070797359753,01/07/2003,12/31/2022,19.9945205479452,0,117,54.5,44.2,2020,https://twitter.com/LindseyGrahamSC,LindseyGrahamSC,https://twitter.com/grahamblog,grahamblog,07/09/1955,0,White,8,J.D.; University of South Carolina; 1981,2,https://www.lgraham.senate.gov/,https://bioguide.congress.gov/search/bio/G000359 ,,
|
||||
"Grassley, Chuck",44,Iowa,IA,0,3,0.670073592619545,01/05/1981,12/31/2022,42.013698630137,0,117,60.2,35.7,2016,https://twitter.com/ChuckGrassley,ChuckGrassley,N/A,N/A,09/17/1933,0,White,7,M.A.; Political Science; University of Northern Iowa; 1956,0,https://www.grassley.senate.gov/,https://bioguide.congress.gov/search/bio/G000386,,
|
||||
"Hagerty, Bill",45,Tennessee,TN,0,2,0.857410027434407,01/03/2021,12/31/2022,1.99178082191781,0,117,62.2,35.2,2020,https://twitter.com/SenatorHagerty,SenatorHagerty,https://twitter.com/billhagertytn,billhagertytn,08/14/1959,0,White,8,J.D.; Vanderbilt Law School; 1984,0,https://www.hagerty.senate.gov/,https://bioguide.congress.gov/search/bio/H000601,,
|
||||
"Hassan, Margaret Wood",46,New Hampshire,NH,1,3,0.43611907238278,01/03/2017,12/31/2022,5.99452054794521,0,117,48,47.9,2016,https://twitter.com/SenatorHassan,SenatorHassan,https://twitter.com/Maggie_Hassan,Maggie_Hassan,02/27/1958,1,White,8,J.D.; Northeastern University School of law; 1985,11,https://www.hassan.senate.gov/,https://bioguide.congress.gov/search/bio/H001076,,
|
||||
"Hawley, Josh",47,Missouri,MO,0,1,0.864366195602263,01/03/2019,12/31/2022,3.99452054794521,0,117,51.4,45.6,2018,https://twitter.com/HawleyMO,HawleyMO,N/A,N/A,12/31/1979,0,White,8,J.D.; Yale Law School; 2006,2,https://www.hawley.senate.gov/,https://bioguide.congress.gov/search/bio/H001089,,
|
||||
"Heinrich, Martin",48,New Mexico,NM,1,1,0.2007037353465,01/03/2013,12/31/2022,9.9972602739726,0,117,54.1,30.5,2018,https://twitter.com/MartinHeinrich,MartinHeinrich,https://twitter.com/senatorheinrich,senatorheinrich,10/17/1971,0,White,6,B.S.; Mechanical Engineering; University of Missouri; 1995,12,https://www.heinrich.senate.gov/,https://bioguide.congress.gov/search/bio/H001046,,
|
||||
"Hickenlooper, John W.",49,Colorado,CO,1,2,0.335030323955882,01/03/2021,12/31/2022,1.99178082191781,0,117,53.5,44.2,2020,https://twitter.com/SenatorHick,SenatorHick,https://twitter.com/hickenlooper,hickenlooper,02/07/1952,0,White,7,M.A.; Geology; Wesleyan University; 1980,0,https://www.hickenlooper.senate.gov/,https://bioguide.congress.gov/search/bio/H000273,,
|
||||
"Hirono, Mazie K.",50,Hawaii,HI,1,1,0.0715447123166643,01/03/2013,12/31/2022,9.9972602739726,0,117,71.2,28.8,2018,https://twitter.com/maziehirono,maziehirono,https://twitter.com/mazieforhawaii,mazieforhawaii,11/03/1947,1,Asian,8,J.D.; Georgetown University; 1978,0,https://www.hirono.senate.gov/,https://bioguide.congress.gov/search/bio/H001042,,
|
||||
"Hoeven, John",51,North Dakota,ND,0,3,0.815683863264003,01/05/2011,12/31/2022,11.9945205479452,0,117,78.6,17,2016,https://twitter.com/SenJohnHoeven,SenJohnHoeven,N/A,N/A,03/13/1957,0,White,7,M.B.A.; Northwestern University; 1981,12,https://www.hoeven.senate.gov/,https://bioguide.congress.gov/search/bio/H001061,,
|
||||
"Hyde-Smith, Cindy",52,Mississippi,MS,0,2,0.868059764299163,04/09/2018,12/31/2022,4.73150684931507,0,117,54.1,44.1,2020,https://twitter.com/SenHydeSmith,SenHydeSmith,https://twitter.com/cindyhydesmith,cindyhydesmith,05/10/1959,1,White,6,"B.A.; Criminal justice, political science; University of Southern Mississippi; 1981",0,https://www.hydesmith.senate.gov/,https://bioguide.congress.gov/search/bio/H001079 ,,
|
||||
"Inhofe, James",53,Oklahoma,OK,0,2,0.880238318204784,11/17/1994,12/31/2022,28.1397260273973,1,117,62.9,32.8,2020,https://twitter.com/JimInhofe,JimInhofe,N/A,N/A,11/17/1934,0,White,6,B.A.; Economics; University of Tulsa; 1973,0,N/A,https://bioguide.congress.gov/search/bio/I000024 ,,
|
||||
"Johnson, Ron",54,Wisconsin,WI,0,3,0.743401705863958,01/05/2011,12/31/2022,11.9945205479452,0,117,50.2,46.8,2016,https://twitter.com/SenRonJohnson,SenRonJohnson,https://twitter.com/ronjohnsonwi,ronjohnsonwi,04/08/1955,0,White,6,B.S.; Business and Accounting; University of Minnesota; 1977,4,https://www.ronjohnson.senate.gov/,https://bioguide.congress.gov/search/bio/J000293,,
|
||||
"Kaine, Tim",55,Virginia,VA,1,1,0.203600708089391,01/03/2013,12/31/2022,9.9972602739726,0,117,57.1,41.1,2018,https://twitter.com/timkaine,timkaine,N/A,N/A,02/26/1958,0,White,8,J.D.; Harvard University; 1983,11,https://www.kaine.senate.gov/,https://bioguide.congress.gov/search/bio/K000384,,
|
||||
"Kelly, Mark",56,Arizona,AZ,1,3,0.399793347847799,12/02/2020,12/31/2022,2.07945205479452,0,117,51.2,48.8,2020,https://twitter.com/SenMarkKelly,SenMarkKelly,https://twitter.com/CaptMarkKelly,CaptMarkKelly,02/21/1964,0,White,7,M.S.; Aeronautical Engineering; U.S. Naval Postgraduate School,3,https://www.kelly.senate.gov/,https://bioguide.congress.gov/search/bio/K000377,,
|
||||
"Kennedy, John Neely",57,Louisiana,LA,0,3,0.785684351248518,01/03/2017,12/31/2022,5.99452054794521,0,117,60.7,39.3,2016,https://twitter.com/SenJohnKennedy,SenJohnKennedy,https://twitter.com/JohnKennedyLA,JohnKennedyLA,11/21/1951,0,White,8,J.D.; University of Virginia School of LAw; 1977,11,https://www.kennedy.senate.gov/,https://bioguide.congress.gov/search/bio/K000393,,
|
||||
"King, Angus S., Jr.",58,Maine,ME,2,1,0.346033257048853,01/03/2013,12/31/2022,9.9972602739726,0,117,54.3,35.2,2018,https://twitter.com/SenAngusKing,SenAngusKing,N/A,N/A,03/31/1944,0,White,8,J.D.; University of Virginia; 1969,2,https://www.king.senate.gov/,https://bioguide.congress.gov/search/bio/K000383 ,,
|
||||
"Klobuchar, Amy",59,Minnesota,MN,1,1,0.130504324943533,01/04/2007,12/31/2022,16,0,117,60.3,36.2,2018,https://twitter.com/SenAmyKlobuchar,SenAmyKlobuchar,https://twitter.com/amyklobuchar,amyklobuchar,05/25/1960,1,White,8,"J.D.; University of Chicago, 1985",2,https://www.klobuchar.senate.gov/,https://bioguide.congress.gov/search/bio/K000367 ,,
|
||||
"Lankford, James",60,Oklahoma,OK,0,3,0.89992933687588,01/03/2015,12/31/2022,7.9972602739726,0,117,67.7,24.6,2016,https://twitter.com/SenatorLankford,SenatorLankford,https://twitter.com/jameslankford,jameslankford,03/04/1968,0,White,7,M.Div.; Southwestern Theological Baptist Seminary; 1994,5,https://www.lankford.senate.gov/,https://bioguide.congress.gov/search/bio/L000575,,
|
||||
"Leahy, Patrick",61,Vermont,VT,1,3,0.144121081911654,01/14/1975,12/31/2022,47.9945205479452,1,117,61.3,33,2016,https://twitter.com/SenatorLeahy,SenatorLeahy,N/A,N/A,03/31/1940,0,White,8,J.D.; Georgetown University; 1964,2,N/A,https://bioguide.congress.gov/search/bio/L000174,,
|
||||
"Lee, Mike",62,Utah,UT,0,3,0.753748787807473,01/05/2011,12/31/2022,11.9945205479452,0,117,68,27.4,2016,https://twitter.com/SenMikeLee,SenMikeLee,https://twitter.com/BasedMikeLee,BasedMikeLee,06/04/1971,0,White,8,J.D.; Brigham Young university; 1997,2,https://www.lee.senate.gov/,https://bioguide.congress.gov/search/bio/L000577,,
|
||||
"Luján, Ben Ray",63,New Mexico,NM,1,2,0.174860888138848,01/03/2021,12/31/2022,1.99178082191781,0,117,51.7,45.6,2020,https://twitter.com/SenatorLujan,SenatorLujan,https://twitter.com/benraylujan,benraylujan,06/07/1972,0,Hispanic,6,B.B.A.; New Mexico Highlands University; 2007,0,https://www.lujan.senate.gov/,https://bioguide.congress.gov/search/bio/L000570 ,,
|
||||
"Lummis, Cynthia M.",64,Wyoming,WY,0,2,0.893292958108508,01/03/2021,12/31/2022,1.99178082191781,0,117,73.1,26.9,2020,https://twitter.com/SenLummis,SenLummis,https://twitter.com/CynthiaMLummis,CynthiaMLummis,09/10/1954,1,White,8,"J.D.; University of Wyoming College of Law, Laramie, Wyo.; 1985",11,https://www.lummis.senate.gov/,https://bioguide.congress.gov/search/bio/L000571 ,,
|
||||
"Manchin, Joe, III",65,West Virginia,WV,1,1,0.446686774398077,11/15/2010,12/31/2022,12.1342465753425,0,117,49.6,46.3,2018,https://twitter.com/Sen_JoeManchin,Sen_JoeManchin,https://twitter.com/JoeManchinWV,JoeManchinWV,08/24/1947,0,White,6,B.A.; Business Administration; West Virginia University; 1970,12,https://www.manchin.senate.gov/,https://bioguide.congress.gov/search/bio/M001183 ,,
|
||||
"Markey, Edward J.",66,Massachusetts,MA,1,2,0.0139659683705929,07/16/2013,12/31/2022,9.46575342465753,0,117,66.2,33,2020,https://twitter.com/SenMarkey,SenMarkey,https://twitter.com/edmarkey,edmarkey,07/11/1946,0,White,8,J.D.; Boston College Law School; 1972,11,https://www.markey.senate.gov/,https://bioguide.congress.gov/search/bio/M000133,,
|
||||
"Marshall, Roger",67,Kansas,KS,0,2,0.882124792228652,01/03/2021,12/31/2022,1.99178082191781,0,117,53.2,41.8,2020,https://twitter.com/SenatorMarshall,SenatorMarshall,https://twitter.com/RogerMarshallMD,RogerMarshallMD,08/09/1960,0,White,7,M.D.; University of Kansas School of Medicine; 1987,6,https://www.marshall.senate.gov/,https://bioguide.congress.gov/search/bio/M001198,,
|
||||
"McConnell, Mitch",68,Kentucky,KY,0,2,0.599687533584357,01/03/1985,12/31/2022,38.0164383561644,0,117,57.8,38.2,2020,https://twitter.com/LeaderMcConnell,LeaderMcConnell,N/A,N/A,02/20/1942,0,White,8,J.D.; Kentucky Law School; 1967,11,https://www.mcconnell.senate.gov/,https://bioguide.congress.gov/search/bio/M000355,,
|
||||
"Menendez, Robert",69,New Jersey,NJ,1,1,0.191515157461704,01/18/2006,12/31/2022,16.9616438356164,0,117,54,42.8,2018,https://twitter.com/SenatorMenendez,SenatorMenendez,N/A,N/A,01/01/1954,0,Hispanic,8,J.D.; Rutgers university of Law; 1979,11,https://www.menendez.senate.gov/,https://bioguide.congress.gov/search/bio/M000639,,
|
||||
"Merkley, Jeff",70,Oregon,OR,1,2,0.0355414098997263,01/06/2009,12/31/2022,13.9917808219178,0,117,56.9,39.3,2020,https://twitter.com/SenJeffMerkley,SenJeffMerkley,https://twitter.com/jeffmerkley,jeffmerkley,10/24/1956,0,White,7,M.P.A.; Princeton University; 1982,0,https://www.merkley.senate.gov/,https://bioguide.congress.gov/search/bio/M001176,,
|
||||
"Moran, Jerry",71,Kansas,KS,0,3,0.716270292467902,01/05/2011,12/31/2022,11.9945205479452,0,117,62.4,32.1,2016,https://twitter.com/JerryMoran,JerryMoran,N/A,N/A,05/29/1954,0,White,8,J.D.; Kansas University School of Law; 1981,11,https://www.moran.senate.gov/public/,https://bioguide.congress.gov/search/bio/M000934 ,,
|
||||
"Murkowski, Lisa",72,Alaska,AK,0,3,0.473296745648617,12/20/2002,12/31/2022,20.0438356164384,0,117,44.3,29.5,2016,https://twitter.com/lisamurkowski,lisamurkowski,https://twitter.com/lisaforsenate,lisaforsenate,05/22/1957,1,White,8,J.D.; Willamette College of Law; 1985,2,https://www.murkowski.senate.gov/,https://bioguide.congress.gov/search/bio/M001153,,
|
||||
"Murphy, Christopher",73,Connecticut,CT,1,1,0.152635018959264,01/03/2013,12/31/2022,9.9972602739726,0,117,59.5,39.4,2018,https://twitter.com/ChrisMurphyCT,ChrisMurphyCT,N/A,N/A,08/03/1973,0,White,8,J.D.; University of Connecticut; 2002,11,https://www.murphy.senate.gov/,https://bioguide.congress.gov/search/bio/M001169,,
|
||||
"Murray, Patty",74,Washington,WA,1,3,0.142703588817088,01/05/1993,12/31/2022,30.0054794520548,0,117,59.1,40.9,2016,https://twitter.com/PattyMurray,PattyMurray,https://twitter.com/murraycampaign,murraycampaign,10/11/1950,1,White,6,B.A.; Physical Education; Washington State University; 1972,5,https://www.murray.senate.gov/,https://bioguide.congress.gov/search/bio/M001111,,
|
||||
"Ossoff, Jon",75,Georgia,GA,1,2,0.303405364928085,01/20/2021,12/31/2022,1.94520547945205,0,117,50.6,49.4,2020,https://twitter.com/SenOssoff,SenOssoff,https://twitter.com/ossoff,ossoff,02/16/1987,0,White,7,M.S.; International Political Economy; London School of Economics; 2013,7,https://www.ossoff.senate.gov/,https://bioguide.congress.gov/search/bio/O000174,,
|
||||
"Padilla, Alex",76,California,CA,1,3,0.0200324383981554,01/20/2021,12/31/2022,1.94520547945205,0,117,N/A,N/A,*,https://twitter.com/SenAlexPadilla,SenAlexPadilla,https://twitter.com/AlexPadilla4CA,AlexPadilla4CA,03/22/1973,0,Hispanic,6,B.S.; Mechanical Engineering; MIT; 1994,9,https://www.padilla.senate.gov/,https://bioguide.congress.gov/search/bio/P000145,appointed in 2020 to replace Kamala Harris ,
|
||||
"Paul, Rand",77,Kentucky,KY,0,3,0.684883322748808,01/05/2011,12/31/2022,11.9945205479452,0,117,57.3,42.7,2016,https://twitter.com/senrandpaul,senrandpaul,https://twitter.com/RandPaul,RandPaul,01/07/1963,0,White,7,M.D.; Duke University; 1988,6,https://www.paul.senate.gov/,https://bioguide.congress.gov/search/bio/P000603,,
|
||||
"Peters, Gary C.",78,Michigan,MI,1,2,0.355796587683312,01/06/2015,12/31/2022,7.98904109589041,0,117,49.9,48.2,2020,https://twitter.com/SenGaryPeters,SenGaryPeters,https://twitter.com/garypeters,garypeters,12/01/1958,0,White,8,J.D.; Wayne State University; 1989,2,https://www.peters.senate.gov/,https://bioguide.congress.gov/search/bio/P000595,,
|
||||
"Portman, Robert",79,Ohio,OH,0,3,0.548120690430407,01/05/2011,12/31/2022,11.9945205479452,1,117,58.3,36.9,2016,https://twitter.com/senrobportman,senrobportman,N/A,N/A,12/19/1955,0,White,8,J.D.; University of Michigan; 1985,2,N/A,https://bioguide.congress.gov/search/bio/P000449,,
|
||||
"Reed, John F.",80,Rhode Island,RI,1,2,0.145861826443275,01/07/1997,12/31/2022,25.9972602739726,0,117,66.6,33.4,2020,https://twitter.com/SenJackReed,SenJackReed,N/A,N/A,11/12/1949,0,White,8,J.D.; Harvard University; 1982,2,https://www.reed.senate.gov/,https://bioguide.congress.gov/search/bio/R000122,,
|
||||
"Risch, James E.",81,Idaho,ID,0,2,0.82910906209038,01/06/2009,12/31/2022,13.9917808219178,0,117,62.6,33.2,2020,https://twitter.com/SenatorRisch,SenatorRisch,N/A,N/A,05/03/1943,0,White,8,J.D.; University of Idaho; 1968,2,https://www.risch.senate.gov/,https://bioguide.congress.gov/search/bio/R000584,,
|
||||
"Romney, Mitt",82,Utah,UT,0,1,0.596688837978771,01/03/2019,12/31/2022,3.99452054794521,0,117,62.6,30.9,2018,https://twitter.com/SenatorRomney,SenatorRomney,https://twitter.com/mittromney,mittromney,03/12/1947,0,White,7,M.B.A.; Harvard Business School; 1975,1,https://www.romney.senate.gov/,https://bioguide.congress.gov/search/bio/R000615,,
|
||||
"Rosen, Jacky",83,Nevada,NV,1,1,0.308548351377894,01/03/2019,12/31/2022,3.99452054794521,0,117,50.4,45.4,2018,https://twitter.com/SenJackyRosen,SenJackyRosen,https://twitter.com/RosenforNevada,RosenforNevada,08/02/1957,1,White,6,B.A.; Psychology; University of Minnesota; 1979,1,https://www.rosen.senate.gov/,https://bioguide.congress.gov/search/bio/R000608,,
|
||||
"Rounds, Mike",84,South Dakota,SD,0,2,0.784008560585577,01/06/2015,12/31/2022,7.98904109589041,0,117,65.7,34.3,2020,https://twitter.com/SenatorRounds,SenatorRounds,N/A,N/A,10/24/1954,0,White,6,B.S.; Political Science; South Dakota State University; 1977,1,https://www.rounds.senate.gov/,https://bioguide.congress.gov/search/bio/R000605,,
|
||||
"Rubio, Marco",85,Florida,FL,0,3,0.831181764071725,01/05/2011,12/31/2022,11.9945205479452,0,117,52,44.3,2016,https://twitter.com/senmarcorubio,senmarcorubio,https://twitter.com/marcorubio,marcorubio,05/28/1971,0,Hispanic,8,J.D.; University of Miami; 1996,2,https://www.rubio.senate.gov/,https://bioguide.congress.gov/search/bio/R000595,,
|
||||
"Sanders, Bernard",86,Vermont,VT,2,1,0,01/04/2007,12/31/2022,16,0,117,67.4,27.5,2018,https://twitter.com/SenSanders,SenSanders,https://twitter.com/BernieSanders,BernieSanders,09/08/1941,0,White,6,B.A.; Political Science; University of Chicago; 1964,0,https://www.sanders.senate.gov/,https://bioguide.congress.gov/search/bio/S000033,,
|
||||
"Sasse, Benjamin",87,Nebraska,NE,0,2,0.684229649213868,01/06/2015,12/31/2022,7.98904109589041,1,117,62.7,24.4,2020,https://twitter.com/sensasse,sensasse,https://twitter.com/BenSasse,BenSasse,02/22/1972,0,White,8,PhD in History; Yale University; 2004,5,N/A,https://bioguide.congress.gov/search/bio/S001197,,
|
||||
"Schatz, Brian",88,Hawaii ,HI,1,3,0.213250458593456,12/27/2012,12/31/2022,10.0164383561644,0,117,73.6,22.2,2016,https://twitter.com/brianschatz,brianschatz,https://twitter.com/SenBrianSchatz,SenBrianSchatz,10/20/1972,0,White,6,B.A.; Philosophy; Pomona College; 1994,5,https://www.schatz.senate.gov/,https://bioguide.congress.gov/search/bio/S001194,,
|
||||
"Schumer, Charles E.",89,New York,NY,1,3,0.239789022209428,01/06/1999,12/31/2022,24,0,117,70.4,27.4,2016,https://twitter.com/SenSchumer,SenSchumer,https://twitter.com/chuckschumer,chuckschumer,11/23/1950,0,White,8,J.D.; Harvard University; 1974,2,https://www.schumer.senate.gov/,https://bioguide.congress.gov/search/bio/S000148 ,,
|
||||
"Scott, Rick",90,Florida,FL,0,1,1,01/08/2019,12/31/2022,3.98082191780822,0,117,50.1,49.9,2018,https://twitter.com/SenRickScott,SenRickScott,https://twitter.com/scottforflorida,scottforflorida,12/01/1952,0,White,8,J.D.; Southern Methodist University; 1978,2,https://www.rickscott.senate.gov/,https://bioguide.congress.gov/search/bio/S001217,,
|
||||
"Scott, Tim",91,South Carolina,SC,0,3,0.781356077518849,01/03/2013,12/31/2022,9.9972602739726,0,117,60.6,37,2016,https://twitter.com/SenatorTimScott,SenatorTimScott,https://twitter.com/votetimscott,votetimscott,09/19/1965,0,African-American,6,B.S.; Political Science; Charleston Southern University; 1988 ,1,https://www.scott.senate.gov/,https://bioguide.congress.gov/search/bio/S001184,,
|
||||
"Shaheen, Jeanne",92,New Hampshire,NH,1,2,0.2925665319541,01/06/2009,12/31/2022,13.9917808219178,0,117,56.6,41,2020,https://twitter.com/SenatorShaheen,SenatorShaheen,https://twitter.com/JeanneShaheen,JeanneShaheen,01/28/1947,1,White,7,M.S.S.; University of Mississippi; 1973,5,https://www.shaheen.senate.gov/,https://bioguide.congress.gov/search/bio/S001181,,
|
||||
"Shelby, Richard",93,Alabama,AL,0,3,0.577739000839365,01/06/1987,12/31/2022,36.0082191780822,1,117,64.2,35.8,2016,https://twitter.com/SenShelby,SenShelby,N/A,N/A,05/06/1934,0,White,6,LL.B.; University of Alabama; 1963,2,N/A,https://bioguide.congress.gov/search/bio/S000320,,
|
||||
"Sinema, Kyrsten",94,Arizona,AZ,2,1,0.500967034663567,01/03/2019,12/31/2022,3.99452054794521,0,117,50,47.6,2018,https://twitter.com/SenatorSinema,SenatorSinema,https://twitter.com/kyrstensinema,kyrstensinema,07/12/1976,1,White,8,PhD in Justice Studies; Arizona State University; 2012,2,https://www.sinema.senate.gov/,https://bioguide.congress.gov/search/bio/S001191,,
|
||||
"Smith, Tina",95,Minnesota,MN,1,2,0.0756533259297989,01/03/2018,12/31/2022,4.99452054794521,0,117,48.8,43.5,2020,https://twitter.com/SenTinaSmith,SenTinaSmith,https://twitter.com/TinaSmithMN,TinaSmithMN,03/04/1958,1,White,7,M.B.A. Dartmouth College; 1984,1,https://www.smith.senate.gov/,https://bioguide.congress.gov/search/bio/S001203,,
|
||||
"Stabenow, Debbie",96,Michigan,MI,1,1,0.221949395648287,01/03/2001,12/31/2022,22.0054794520548,0,117,52.3,45.8,2018,https://twitter.com/SenStabenow,SenStabenow,https://twitter.com/stabenow,stabenow,04/29/1950,1,White,7,M.S.W.; Michigan State University; 1975,5,https://www.stabenow.senate.gov/,https://bioguide.congress.gov/search/bio/S000770,,
|
||||
"Sullivan, Dan",97,Alaska,AK,0,2,0.652100683642255,01/06/2015,12/31/2022,7.98904109589041,0,117,53.9,41.2,2020,https://twitter.com/SenDanSullivan,SenDanSullivan,N/A,N/A,11/13/1964,0,White,8,J.D.; Georgetown University; 1993,2,https://www.sullivan.senate.gov/,https://bioguide.congress.gov/search/bio/S001198,,
|
||||
"Tester, Jon",98,Montana,MT,1,1,0.377646486433112,01/04/2007,12/31/2022,16,0,117,50.3,46.8,2018,https://twitter.com/SenatorTester,SenatorTester,https://twitter.com/jontester,jontester,08/21/1956,0,White,6,B.A.; Music; University of Providence; 1978,10,https://www.tester.senate.gov/,https://bioguide.congress.gov/search/bio/T000464 ,,
|
||||
"Thune, John",99,South Dakota,SD,0,3,0.795060855902239,01/04/2005,12/31/2022,18,0,117,71.8,28.2,2016,https://twitter.com/SenJohnThune,SenJohnThune,https://twitter.com/johnthune,johnthune,01/07/1961,0,White,7,M.B.A.; University of South Dakota; 1984,1,https://www.thune.senate.gov/,https://bioguide.congress.gov/search/bio/T000250 ,,
|
||||
"Tillis, Thom",100,North Carolina,NC,0,2,0.819146177750934,01/06/2015,12/31/2022,7.98904109589041,0,117,48.7,46.9,2020,https://twitter.com/SenThomTillis,SenThomTillis,https://twitter.com/ThomTillis,ThomTillis,08/30/1960,0,White,6,B.S.; Technology Management; University of Maryland; 1996,1,https://www.tillis.senate.gov/,https://bioguide.congress.gov/search/bio/T000476 ,,
|
||||
"Toomey, Patrick",101,Pennsylvania,PA,0,3,0.607637714921737,01/05/2011,12/31/2022,11.9945205479452,1,117,48.9,47.2,2016,https://twitter.com/SenToomey,SenToomey,https://twitter.com/pattoomey,pattoomey,11/17/1961,0,White,6,A.B.; Government; Harvard College; 1984,1,N/A,https://bioguide.congress.gov/search/bio/T000461 ,,
|
||||
"Tuberville, Tommy",102,Alabama,AL,0,2,0.808701355452043,01/03/2021,12/31/2022,1.99178082191781,0,117,60.1,39.7,2020,https://twitter.com/SenTuberville,SenTuberville,https://twitter.com/TTuberville,TTuberville,09/18/1954,0,White,6,"B.S., physical education, Southern Arkansas University, 1976",5,https://www.tuberville.senate.gov/,https://bioguide.congress.gov/search/bio/T000278 ,,
|
||||
"Van Hollen, Chris",103,Maryland,MD,1,3,0.117646768842011,01/03/2017,12/31/2022,5.99452054794521,0,117,60.4,36.4,2016,https://twitter.com/ChrisVanHollen,ChrisVanHollen,N/A,N/A,01/10/1959,0,White,8,J.D.; Georgetown university; 1990,2,https://www.vanhollen.senate.gov/,https://bioguide.congress.gov/search/bio/V000128,,
|
||||
"Warner, Mark R.",104,Virginia,VA,1,2,0.33022168507113,01/06/2009,12/31/2022,13.9917808219178,0,117,56,44,2020,https://twitter.com/SenatorWarner,SenatorWarner,https://twitter.com/MarkWarner,MarkWarner,12/15/1954,0,White,8,J.D.; Harvard Law School; 1980,1,https://www.warner.senate.gov/,https://bioguide.congress.gov/search/bio/W000805 ,,
|
||||
"Warnock, Raphael G.",105,Georgia,GA,1,3,0.464158242867696,01/20/2021,12/31/2022,1.94520547945205,0,117,51,49,2020,https://twitter.com/SenatorWarnock,SenatorWarnock,https://twitter.com/ReverendWarnock,ReverendWarnock,07/23/1969,0,African-American,8,PhD in Philosophy; Union Theological Seminary; ,8,https://www.warnock.senate.gov/,https://bioguide.congress.gov/search/bio/W000790,,
|
||||
"Warren, Elizabeth",106,Massachusetts,MA,1,1,0.0583875007437665,01/03/2013,12/31/2022,9.9972602739726,0,117,60.4,36.2,2018,https://twitter.com/SenWarren,SenWarren,https://twitter.com/ewarren,ewarren,06/22/1949,1,White,8,J.D.; Rutgers University; 1976,2,https://www.warren.senate.gov/,https://bioguide.congress.gov/search/bio/W000817 ,,
|
||||
"Whitehouse, Sheldon",107,Rhode Island,RI,1,1,0.124737669119195,01/04/2007,12/31/2022,16,0,117,61.6,38.4,2018,https://twitter.com/SenWhitehouse,SenWhitehouse,N/A,N/A,10/20/1955,0,White,8,J.D.; University of Virginia; 1982,2,https://www.whitehouse.senate.gov/,https://bioguide.congress.gov/search/bio/W000802,,
|
||||
"Wicker, Roger F.",108,Mississippi,MS,0,1,0.763788502839721,12/31/2007,12/31/2022,15.0109589041096,0,117,58.5,39.5,2018,https://twitter.com/SenatorWicker,SenatorWicker,https://twitter.com/RogerWicker,RogerWicker,07/05/1951,0,White,8,J.D.; University of Mississippi; 1975,2,https://www.wicker.senate.gov/,https://bioguide.congress.gov/search/bio/W000437,,
|
||||
"Wyden, Ron",109,Oregon,OR,1,3,0.0591413132623803,02/05/1996,12/31/2022,26.9205479452055,0,117,56.7,33.6,2016,https://twitter.com/RonWyden,RonWyden,N/A,N/A,05/03/1949,0,White,8,J.D.; University of Oregon; 1974,2,https://www.wyden.senate.gov/,https://bioguide.congress.gov/search/bio/W000779,,
|
||||
"Young, Todd",110,Indiana,IN,0,3,0.677696674158218,01/05/2011,12/31/2022,11.9945205479452,1,117,52.1,42.4,2016,https://twitter.com/SenToddYoung,SenToddYoung,https://twitter.com/ToddYoungIN,ToddYoungIN,08/24/1972,0,White,8,J.D.; Robert H. McKinney; 2006,2,https://www.young.senate.gov/,https://bioguide.congress.gov/search/bio/Y000064,,
|
|
8
data/OUT/.gitignore
vendored
Normal file
8
data/OUT/.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
/ALL-SENATORS-TWEETS.csv
|
||||
/Pretest-Prep.csv
|
||||
/Pretest-Results.csv
|
||||
/Pretest-SENATORS-TWEETS.csv
|
||||
/SenatorsTweets-Final.csv
|
||||
/SenatorsTweets-OnlyCov.csv
|
||||
/Tweets-Classified-Prep.csv
|
||||
/Tweets-Stub.csv
|
3
data/OUT/graphs/.gitignore
vendored
Normal file
3
data/OUT/graphs/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
/Timeline.png
|
||||
/Wordcloud-All.png
|
||||
/Wordcloud-Cov.png
|
89
funs/CleanTweets.py
Normal file
89
funs/CleanTweets.py
Normal file
@ -0,0 +1,89 @@
|
||||
import re
|
||||
import string
|
||||
|
||||
def preprocess_roberta(text):  # https://huggingface.co/cardiffnlp/twitter-roberta-base-sep2022
    """Mask user mentions and links in a tweet.

    The text is split on whitespace. Every token longer than one character
    that starts with '@' and contains exactly one '@' becomes '@user', and
    every such token that starts with 'http' becomes 'http'. The tokens are
    re-joined with single spaces, so runs of whitespace collapse.

    Args:
        text (str): raw tweet text

    Returns:
        str: tweet text with masked mentions and links
    """
    def _mask(token):
        # single-character tokens (e.g. a lone '@') are passed through untouched
        if len(token) <= 1:
            return token
        if token[0] == '@' and token.count('@') == 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return ' '.join(_mask(token) for token in text.split())
|
||||
|
||||
def remove_URL(text):
    """Remove links from *text*.

    Deletes every 'http://', 'https://' or 'www.' link together with all
    non-whitespace characters that follow it. Surrounding whitespace is kept.

    Args:
        text (str): text that may contain links

    Returns:
        str: text without links
    """
    # The pattern is a constant, so compiling it cannot fail at runtime and
    # needs no exception handling.
    return re.sub(r'https?://\S+|www\.\S+', r'', text)
|
||||
|
||||
def remove_emoji(text):
    """Delete emoji and pictographic symbols from *text*.

    NOTE(review): the last range spans U+24C2 to U+1F251 and therefore also
    removes many characters that are not emoji (for example CJK ideographs).

    Args:
        text (str): text that may contain emoji

    Returns:
        str: text with all characters of the listed ranges removed
    """
    ranges = (
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
    )
    return re.sub('[' + ranges + ']+', r'', text, flags=re.UNICODE)
|
||||
|
||||
def remove_html(text):
    """Delete HTML tags and HTML character entities from *text*.

    Removes anything of the form '<...>' (shortest match) as well as named,
    decimal and hexadecimal entities such as '&amp;', '&#123;' or '&#x1f;'.

    Args:
        text (str): text that may contain markup

    Returns:
        str: text without tags and entities
    """
    markup = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return markup.sub('', text)
|
||||
|
||||
def remove_punct(text):
    """Delete every ASCII punctuation character from *text*.

    Args:
        text (str): input text

    Returns:
        str: text without any character contained in string.punctuation
    """
    return ''.join(char for char in text if char not in string.punctuation)
|
||||
|
||||
def remove_nonascii(text):
    """Delete all characters outside the 7-bit ASCII range from *text*.

    Args:
        text (str): input text

    Returns:
        str: text containing only characters from \\x00 to \\x7F
    """
    non_ascii = re.compile(r'[^\x00-\x7F]+')
    return non_ascii.sub('', text)
|
||||
|
||||
def remove_spec(text):
    """Replace the HTML entities for '&', '<' and '>' with readable text.

    '&amp;' (with or without the trailing ';') becomes the word 'and',
    '&lt;' becomes '<' and '&gt;' becomes '>'.

    Args:
        text (str): text that may contain HTML entities

    Returns:
        str: text with the three entities replaced
    """
    # The patterns must name the full entity: a pattern that can match the
    # empty string (such as '&?') would insert the replacement between every
    # pair of characters.
    text = re.sub(r'&amp;?', r'and', text)
    text = re.sub(r'&lt;', r'<', text)
    return re.sub(r'&gt;', r'>', text)
|
||||
|
||||
def remove_spaces(text):  # also new line chars and to lower case
    """Normalize whitespace and case of *text*.

    Joins all lines with a single space, lower-cases the text, strips leading
    and trailing whitespace and collapses every run of two or more whitespace
    characters into one space.

    Args:
        text (str): input text

    Returns:
        str: lower-cased single-line text with normalized spacing
    """
    text = " ".join(text.splitlines())  # remove newline characters
    text = text.lower().strip()
    return re.sub(r'\s{2,}', ' ', text)
|
||||
|
||||
def remove_retw(text):
    """Delete retweet markers and user mentions from *text*.

    First removes 'RT @name' / 'rt @name' prefixes (optional spaces around
    the '@'), then every remaining '@name' token. Surrounding whitespace is
    left in place.

    Args:
        text (str): tweet text

    Returns:
        str: text without retweet markers and mentions
    """
    for pattern in (r'(RT|rt)[ ]*@[ ]*[\S]+', r'@[\S]+'):
        text = re.sub(pattern, '', text)
    return text
|
||||
|
||||
def preprocess_text(text):
    """Run the complete tweet cleaning pipeline on one string.

    Steps, in order: links, retweet markers / mentions, emoji, HTML markup,
    punctuation, non-ASCII characters, special entities, whitespace and case.

    Args:
        text (str): raw tweet text

    Returns:
        str: cleaned, lower-cased text
    """
    text = remove_URL(text)
    # Mentions must be removed before punctuation: remove_punct deletes every
    # '@', and remove_retw can only recognise mentions that still carry it.
    text = remove_retw(text)
    text = remove_emoji(text)
    text = remove_html(text)
    text = remove_punct(text)
    text = remove_nonascii(text)
    text = remove_spec(text)
    # Whitespace normalization runs last so gaps left by the removals above
    # are collapsed as well.
    text = remove_spaces(text)
    return text


def preprocess_text_series(series):
    """Run the tweet cleaning pipeline on every element of a pandas Series.

    Delegates to preprocess_text so that both entry points always apply the
    same steps in the same order.

    Args:
        series (pandas.Series): series of raw tweet texts

    Returns:
        pandas.Series: series of cleaned texts
    """
    return series.apply(preprocess_text)
|
||||
|
||||
# Check all functions:
|
||||
input_text = """
|
||||
Check out this amazing website: https://www.example.com! 😃
|
||||
<html>This is an HTML tag.</html>
|
||||
RT @user123: Just received a package from @companyXYZ. It's awesome! 📦
|
||||
This is a test text with lots of punctuations!!! Can't wait to see more...
|
||||
"""
|
||||
processed_text = preprocess_text(input_text)
|
||||
# print(processed_text)
|
@ -63,4 +63,55 @@ def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix,
|
||||
# save short csv
|
||||
tweet_df.to_csv(csv_path, encoding='utf-8')
|
||||
# sleep 1 second to not get blocked because of excessive requests
|
||||
time.sleep(0.5)
|
||||
time.sleep(0.5)
|
||||
|
||||
def getHandles(di):
    """grabs accounts from senators-raw.csv

    The file is read once. All values of the "twitter_handle" column are
    returned first, followed by the non-empty values of the "alt_handle"
    column.

    Args:
        di (str): path to the directory containing senators-raw.csv; the file
            name is appended directly, so the path must end with a separator

    Returns:
        list: list containing str of senator account handles
    """
    senators = pd.read_csv(f"{di}senators-raw.csv")
    accounts = senators["twitter_handle"].tolist()
    # remove empty alt_accounts fields (parsed as NaN by pandas)
    accounts.extend(senators["alt_handle"].dropna().tolist())
    return accounts
|
||||
|
||||
def printHandles(accounts):
    """returns string with all accounts in a readable way.

    Each handle is centered in a 17 character wide column, five handles per
    line, followed by a line stating the total number of accounts.

    Args:
        accounts (list): list of str with handles

    Returns:
        str: containing text that can be written to txtfile
    """
    txt = ["Accounts to be scraped:\n"]
    for position, acc in enumerate(accounts, start=1):  # print 5 accounts per line
        txt.append(f"{acc:^17}")  # twitter handle max length = 15 chars
        if position % 5 == 0:
            txt.append(" \n")
    # len() instead of the loop index: the index is one short of the count
    # and does not exist at all when the list is empty.
    txt.append(f"\n{len(accounts)} accounts in total.")
    return ''.join(txt)
|
||||
|
||||
def scrapeUsers(handle, userDFColumns, maxTweets=1):
    """Fetch the user attributes listed in *userDFColumns* for one account.

    Iterates over the results of a 'from:<handle>' search and reads the
    requested attributes from ``tweet.user``. The list is rebuilt for every
    tweet that is visited, so the returned values come from the last tweet
    processed.

    Args:
        handle (str): twitter handle to query
        userDFColumns (list): attribute names (dotted paths are allowed) that
            are read from ``tweet.user``
        maxTweets (int): the loop stops once the tweet index exceeds this
            value, i.e. up to maxTweets + 1 tweets are visited

    Returns:
        list: one value per entry of userDFColumns; empty if the search
            yields no tweets
    """
    from operator import attrgetter  # plain attribute lookup instead of eval()

    currentTime = datetime.now()
    userList = []
    # Convert to str before padding: a format spec applied directly to a
    # datetime is interpreted as a strftime pattern, not as an alignment.
    print(f'{str(currentTime):<30} Fetching: {handle:>15}')
    query = f'from:{handle}'
    getters = [attrgetter(col) for col in userDFColumns]

    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        if i > maxTweets:
            break
        # Get user data for the current tweet
        userList = [getter(tweet.user) for getter in getters]

    return userList
|
@ -0,0 +1,7 @@
|
||||
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
|
||||
1,0.39025546515679493,0.40877932761593355,0.9103260869565217,0:10:21,0:00:40
|
||||
2,0.3057803610952067,0.3502063500978377,0.9103260869565217,0:10:53,0:00:43
|
||||
3,0.17910970049364833,0.27903796154904464,0.9375,0:10:30,0:00:38
|
||||
4,0.09279396105943587,0.41342766528301267,0.904891304347826,0:11:03,0:00:43
|
||||
5,0.06132459050129317,0.4468563502887264,0.9239130434782609,0:12:07,0:00:44
|
||||
6,0.04195396880810895,0.4350045176675928,0.9266304347826086,0:11:21,0:00:40
|
|
@ -0,0 +1,7 @@
|
||||
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
|
||||
1,0.6699380816093513,0.6216431430407933,0.6964285714285714,0:01:03,0:00:02
|
||||
2,0.6649796058024678,0.621175297669002,0.6964285714285714,0:01:03,0:00:01
|
||||
3,0.642247314964022,0.6377243144171578,0.6964285714285714,0:01:05,0:00:02
|
||||
4,0.6300328698541436,0.6038827853543418,0.6964285714285714,0:01:04,0:00:02
|
||||
5,0.544977219509227,0.6619421115943364,0.625,0:01:02,0:00:02
|
||||
6,0.3951783587357828,0.48477122613361906,0.7857142857142857,0:01:05,0:00:01
|
|
@ -0,0 +1,7 @@
|
||||
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
|
||||
1,0.5610552686641376,0.4569096086310089,0.9116022099447514,0:37:20,0:00:31
|
||||
2,0.43647773836513126,0.5441495520680196,0.9005524861878453,0:36:14,0:00:30
|
||||
3,0.288773139899344,0.43471020716692715,0.9392265193370166,0:36:10,0:00:29
|
||||
4,0.19330878817686287,0.4555162174395349,0.9281767955801105,0:36:17,0:00:30
|
||||
5,0.09109889855869348,0.5060150003684702,0.9281767955801105,0:36:13,0:00:30
|
||||
6,0.05734757932275739,0.6043995772428771,0.9226519337016574,0:36:11,0:00:31
|
|
@ -0,0 +1,7 @@
|
||||
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
|
||||
1,0.21681843259712502,0.0005426188472483773,1.0,0:01:13,0:00:02
|
||||
2,0.00016121647037353423,0.0002873415878639207,1.0,0:01:12,0:00:02
|
||||
3,6.752021149355535e-05,0.00024319994372490328,1.0,0:01:12,0:00:02
|
||||
4,4.7950222591787355e-05,0.00022139604243420763,1.0,0:01:13,0:00:02
|
||||
5,3.99839740138679e-05,0.00021302999493855168,1.0,0:01:11,0:00:02
|
||||
6,3.5356899656214995e-05,0.00020912183117616223,1.0,0:01:13,0:00:02
|
|
135
preTestClassification.py
Normal file
135
preTestClassification.py
Normal file
@ -0,0 +1,135 @@
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
||||
from datasets import load_dataset
|
||||
from transformers.pipelines.pt_utils import KeyDataset
|
||||
|
||||
#%%
|
||||
# prepare
|
||||
# install xformers (pip install xformers) for better performance
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Name of file that all senator data will be written to
|
||||
senCSV = "ALL-SENATORS-TWEETS.csv"
|
||||
|
||||
# Name of new datafile generated
|
||||
senCSVc = "Tweets-Stub.csv"
|
||||
|
||||
# Name of pretest files
|
||||
preTestIDsFake = "pretest-tweets_fake.txt"
|
||||
preTestIDsNot = "pretest-tweets_not_fake.txt"
|
||||
|
||||
# Name of pretest datafile
|
||||
senCSVPretest = "Pretest.csv"
|
||||
senCSVPretestPrep = "Pretest-Prep.csv"
|
||||
senCSVPretestResult = "Pretest-Results.csv"
|
||||
|
||||
# don't change this one
|
||||
senCSVPath = wd + ud + senCSV
|
||||
senCSVcPath = wd + ud + senCSVc
|
||||
senCSVcPretestPath = wd + ud + senCSVPretest
|
||||
senCSVcPretestPrepPath = wd + ud + senCSVPretestPrep
|
||||
senCSVcPretestResultPath = wd + ud + senCSVPretestResult
|
||||
preTestIDsFakePath = wd + di + preTestIDsFake
|
||||
preTestIDsNotPath = wd + di + preTestIDsNot
|
||||
|
||||
import sys
|
||||
funs = wd+"funs"
|
||||
sys.path.insert(1, funs)
|
||||
import CleanTweets
|
||||
|
||||
# List of IDs to select
|
||||
# Read the IDs from a file
|
||||
preTestIDsFakeL = []
|
||||
preTestIDsNotL = []
|
||||
with open(preTestIDsFakePath, "r") as file:
|
||||
lines = file.readlines()
|
||||
for line in lines:
|
||||
tid = line.strip() # Remove the newline character
|
||||
preTestIDsFakeL.append(tid)
|
||||
with open(preTestIDsNotPath, "r") as file:
|
||||
lines = file.readlines()
|
||||
for line in lines:
|
||||
tid = line.strip() # Remove the newline character
|
||||
preTestIDsNotL.append(tid)
|
||||
|
||||
# Select rows based on the IDs
|
||||
df = pd.read_csv(senCSVPath, dtype=(object))
|
||||
#%%
|
||||
# Create pretest dataframe
|
||||
dfPreTest = df[df['id'].isin(preTestIDsFakeL)].copy()
|
||||
dfPreTest['fake'] = True
|
||||
dfPreTest = pd.concat([dfPreTest, df[df['id'].isin(preTestIDsNotL)]], ignore_index=True)
|
||||
dfPreTest['fake'] = dfPreTest['fake'].fillna(False)
|
||||
|
||||
#%%
|
||||
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
|
||||
# HowTo:
|
||||
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
|
||||
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
|
||||
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
|
||||
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
|
||||
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
|
||||
|
||||
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
|
||||
|
||||
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(CleanTweets.preprocess_text)
|
||||
|
||||
#%%
|
||||
timeStart = datetime.now() # start counting execution time
|
||||
|
||||
max_length = 128
|
||||
dfPreTest['input_ids'] = dfPreTest['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
|
||||
#train.rename(columns={'target': 'labels'}, inplace=True)
|
||||
#train.head()
|
||||
|
||||
# %%
|
||||
dfPreTest.to_csv(senCSVcPretestPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
|
||||
|
||||
|
||||
#%%
|
||||
dataset = load_dataset("csv", data_files=senCSVcPretestPrepPath)
|
||||
|
||||
# %%
|
||||
results = pipe(KeyDataset(dataset, "text"))
|
||||
# %%
|
||||
#from tqdm.auto import tqdm
|
||||
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
|
||||
# print(out)
|
||||
|
||||
#%%
|
||||
output_labels = []
|
||||
output_score = []
|
||||
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
|
||||
output_labels.append(out['label'])
|
||||
output_score.append(out['score'])
|
||||
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
|
||||
# Exactly the same output as before, but the content are passed
|
||||
# as batches to the model
|
||||
# %%
|
||||
dfPreTest['output_label'] = output_labels
|
||||
dfPreTest['output_score'] = output_score
|
||||
|
||||
# Report how long the classification took.
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
# Divide by the number of tweets actually classified instead of a fixed count.
timePerTweet = timeTotal / len(dfPreTest)

# timeTotal and timePerTweet are timedelta objects and print as H:MM:SS.
print(f"Total classification execution time: {timeTotal}")
print(f"Time per tweet classification: {timePerTweet}")
# NOTE(review): 50183 is presumably the size of the full tweet dataset - confirm
print(f"Estimated time for full classification of tweets: {timePerTweet*50183}")
|
||||
|
||||
# %%
|
||||
dfPreTest.to_csv(senCSVcPretestResultPath, encoding='utf-8')
|
||||
|
||||
# %%
|
55
profiler.py
Normal file
55
profiler.py
Normal file
@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Tue Aug 8 14:49:02 2023
|
||||
|
||||
@author: michael
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import pandas_profiling as pp
|
||||
import numpy
|
||||
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Name of file that all senator data will be written to
|
||||
senCSV = "ALL-SENATORS-TWEETS.csv"
|
||||
|
||||
# Name of the senator dataset file (read from the input directory)
|
||||
senDataset = "senators-raw.csv"
|
||||
|
||||
# Name of new datafile generated
|
||||
senCSVc = "SenatorsTweets-Final"
|
||||
senCSVcCov = "SenatorsTweets-OnlyCov"
|
||||
|
||||
# don't change this one
|
||||
senCSVPath = wd + ud + senCSV
|
||||
senCSVcPath = wd + ud + senCSVc + ".csv"
|
||||
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
|
||||
senSAVcPath = wd + ud + senCSV + ".sav"
|
||||
senDTAcPath = wd + ud + senCSV + ".dta"
|
||||
senDatasetPath = wd + di + senDataset
|
||||
|
||||
# forming dataframe and printing
|
||||
df = pd.read_csv(senCSVPath, dtype=(object))
|
||||
|
||||
# forming ProfileReport and save
|
||||
# as output.html file
|
||||
profileAll = pp.ProfileReport(df, minimal=True)
|
||||
profileAll.to_file("data/OUT/profiles/AllTweets.html")
|
||||
|
||||
df = pd.read_csv(senCSVcCovPath, dtype=(object))
|
||||
|
||||
profileAll = pp.ProfileReport(df, minimal=True)
|
||||
profileAll.to_file("data/OUT/profiles/CovTweets.html")
|
35
repairmystupidity.py
Normal file
35
repairmystupidity.py
Normal file
@ -0,0 +1,35 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Mon Aug 14 20:47:22 2023
|
||||
|
||||
@author: michael
|
||||
"""
|
||||
import pandas as pd
|
||||
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
falsch = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct.csv"
|
||||
richtig = wd + ud + "SenatorsTweets-Training.csv"
|
||||
correct = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct2.csv"
|
||||
|
||||
# Name of new datafile generated
|
||||
senCSVprep = "SenatorsTweets-Training_WORKING-COPY-prepared"
|
||||
|
||||
# don't change this one
|
||||
falsch = pd.read_csv(falsch, dtype=(object), sep=";")
|
||||
richtig = pd.read_csv(richtig, dtype=(object))
|
||||
|
||||
df = pd.merge(falsch,richtig[['tid','rawContent', 'date']],on='tid', how='left')
|
||||
df.drop(columns=['rawContent_x', 'date_x'], inplace=True)
|
||||
df.rename(columns={'tid_y':'tid', 'rawContent_y':'rawContent', 'date_y':'date'}, inplace=True)
|
||||
df = df[['tid','date','topicCovid','fake','rawContent','Unnamed: 6']]
|
||||
df.rename(columns={'Unnamed: 6':'comment'}, inplace=True)
|
||||
|
||||
df.to_csv(correct, encoding='utf-8', sep=";")
|
613
trainFake.py
Normal file
613
trainFake.py
Normal file
@ -0,0 +1,613 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat Aug 12 12:25:18 2023
|
||||
|
||||
@author: michael
|
||||
"""
|
||||
#from datasets import load_dataset
|
||||
#from transformers import Trainer
|
||||
#from transformers import AutoModelForSequenceClassification
|
||||
from transformers import AutoTokenizer
|
||||
import torch
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split # pip install scikit-learn
|
||||
|
||||
import pandas as pd
|
||||
|
||||
## Uses snippets from this guide:
|
||||
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
|
||||
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
import sys
|
||||
funs = wd+"funs"
|
||||
sys.path.insert(1, funs)
|
||||
import CleanTweets
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Training CSV dataset
|
||||
twtCSV = "SenatorsTweets-Training_WORKING-COPY-correct2"
|
||||
twtCSVtrainCovClass = "SenatorsTweets-train-CovClassification"
|
||||
twtCSVtrainFakeClass = "SenatorsTweets-train-FakeClassification"
|
||||
statsTrainingTopicClass = "statsTopicClassification-"
|
||||
|
||||
# don't change this one
|
||||
twtCSVPath = wd + ud + twtCSV + ".csv"
|
||||
twtCSVtrainCovClassPath = wd + ud + twtCSVtrainCovClass + ".csv"
|
||||
twtCSVtrainFakeClassPath = wd + ud + twtCSVtrainFakeClass + ".csv"
|
||||
|
||||
statsTrainingTopicClassPath = wd + ud + statsTrainingTopicClass
|
||||
|
||||
twtCSVtrainCovClassPathTrain = wd + ud + twtCSVtrainCovClass + "TRAIN.csv"
|
||||
twtCSVtrainFakeClassPathTrain = wd + ud + twtCSVtrainFakeClass + "TRAIN.csv"
|
||||
twtTSVtrainCovClassPathTrain = wd + ud + "cov-train.tsv"
|
||||
twtTSVtrainFakeClassPathTrain = wd + ud + "fake-train.tsv"
|
||||
|
||||
twtTSVtrainCovClassPathEval = wd + ud + "cov-eval.tsv"
|
||||
twtTSVtrainFakeClassPathEval = wd + ud + "fake-eval.tsv"
|
||||
|
||||
seed = 12355
|
||||
|
||||
# Model paths
|
||||
modCovClassPath = wd + "models/CovClass/"
|
||||
modFakeClassPath = wd + "models/FakeClass/"
|
||||
|
||||
model_name = 'digitalepidemiologylab/covid-twitter-bert-v2' # accuracy 69
|
||||
#model_name = 'justinqbui/bertweet-covid19-base-uncased-pretraining-covid-vaccine-tweets' #48
|
||||
#model_name = "cardiffnlp/tweet-topic-latest-multi"
|
||||
model_name = "bvrau/covid-twitter-bert-v2-struth"
|
||||
#model_name = "cardiffnlp/roberta-base-tweet-topic-single-all"
|
||||
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
|
||||
|
||||
# More models for fake detection:
|
||||
# https://huggingface.co/justinqbui/bertweet-covid-vaccine-tweets-finetuned
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
|
||||
max_length = 64 # max token sentence length
|
||||
|
||||
#%%
|
||||
# Create training and testing dataset
|
||||
dfTest = pd.read_csv(twtCSVPath, dtype=(object), delimiter=";")
|
||||
|
||||
#dfTest = dfTest[:-900] # remove last 800 rows
|
||||
#dfTest = dfTest.iloc[:,:-3] # remove last 800 rows
|
||||
|
||||
dfTest['text'] = dfTest['rawContent'].apply(CleanTweets.preprocess_roberta)
|
||||
|
||||
dfTest.drop(columns=['rawContent'], inplace=True)
|
||||
|
||||
# Only keep tweets that are longer than 3 words
|
||||
dfTest['tweet_proc_length'] = [len(text.split(' ')) for text in dfTest['text']]
|
||||
dfTest['tweet_proc_length'].value_counts()
|
||||
dfTest = dfTest[dfTest['tweet_proc_length']>3]
|
||||
dfTest = dfTest.drop_duplicates(subset=['text'])
|
||||
dfTest = dfTest.drop(columns=['date', 'Unnamed: 0'])
|
||||
|
||||
# Create datasets for each classification
|
||||
dfCovClass = dfTest
|
||||
dfFakeClass = dfTest
|
||||
dfCovClass = dfCovClass.drop(columns=['fake']) # fake column not neeeded in covid topic classification data
|
||||
dfFakeClass = dfFakeClass[dfFakeClass['topicCovid']=='True'].drop(columns=['topicCovid']) # topicCovid column not neeeded in covid topic classification data
|
||||
|
||||
#type_map = {'Covid tweet': 'covid tweets', 'Noncovid tweet': 'noncovid tweet'}
|
||||
dfCovClass.rename(index = str, columns={'topicCovid': 'labels', 'tid': 'id'}, inplace = True)
|
||||
dfCovClass.labels = dfCovClass.labels.replace({"True": 'Covid', "False": 'NonCovid'})
|
||||
|
||||
#type_map = {'fake news tweet': 'fake news tweet', 'non-fake-news-tweet': 'non-fake-news-tweet'}
|
||||
dfFakeClass.rename(index = str, columns={'fake': 'labels', 'tid': 'id'}, inplace = True)
|
||||
|
||||
#%%
|
||||
# Tokenize tweets
|
||||
# Keep only rows that carry a topic label. The explicit copy makes the
# column assignment further down write to an independent frame instead of
# a filtered slice.
dfCovClass = dfCovClass[dfCovClass['labels'].notna()].copy()

# Blank out labels that still need manual checking. The result is assigned
# back to the column: calling replace(inplace=True) on the selected column
# is a chained in-place edit that does not update the frame when pandas
# Copy-on-Write is active.
dfFakeClass['labels'] = dfFakeClass['labels'].replace({'Check': '', 'check': '', 'FALSE': ''})
dfFakeClass = dfFakeClass[dfFakeClass['labels'].notna()].copy()

# truncation=True is passed so the call uses the same tokenizer options as
# PandasDataset.__getitem__ in this file.
dfCovClass['input_ids'] = dfCovClass['text'].apply(
    lambda x: tokenizer(x, max_length=max_length, padding="max_length", truncation=True)['input_ids']
)
dfFakeClass['input_ids'] = dfFakeClass['text'].apply(
    lambda x: tokenizer(x, max_length=max_length, padding="max_length", truncation=True)['input_ids']
)
|
||||
|
||||
def encode_labels(label):
    """Map a label string to its binary class id.

    'Covid' and 'False' are coded 1; 'NonCovid', 'True' and every other
    value are coded 0.
    """
    # NOTE(review): 'False' -> 1 and 'True' -> 0 looks inverted relative to
    # the "# of Fake-news tweets (coded 1):" report printed by this script --
    # confirm that the values of the `fake` column are meant to map this way.
    known_codes = (('Covid', 1), ('NonCovid', 0), ('False', 1), ('True', 0))
    for name, code in known_codes:
        if label == name:
            return code
    # Anything unrecognised falls back to class 0.
    return 0
|
||||
dfCovClass['labels_encoded'] = dfCovClass['labels'].apply(encode_labels)
|
||||
dfFakeClass['labels_encoded'] = dfFakeClass['labels'].apply(encode_labels)
|
||||
dfFakeClass = dfFakeClass[dfFakeClass['labels']!=""]
|
||||
#dfFakeClass = dfFakeClass[(dfFakeClass['labels']=="Fake") | (dfFakeClass['labels']=="True")]
|
||||
|
||||
# get n of classes
|
||||
print("# of Non-Covid tweets (coded 0):")
|
||||
print(dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
|
||||
# 62 non-covid tweets, disproportionate sample for training has to be 124 tweets
|
||||
|
||||
print("# of Fake-news tweets (coded 1):")
|
||||
print(dfFakeClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
|
||||
|
||||
# create disproportionate sample - 50/50 of both
|
||||
#dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique()
|
||||
#dfCovClass = dfCovClass.groupby('labels_encoded', group_keys=False).apply(lambda x: x.sample(164, random_state=seed))
|
||||
# after a lot of tests, it seems that a sample in which non-fake news tweets are overrepresented leads to better results.
|
||||
# because of this, performance limitations and time constraints, group 1 (covid topic) will be overrepresented (twice as many), which still doesn't reflect the real proportions ~10/1
|
||||
|
||||
'''dfCovClassa = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(1).sample(frac=1, replace=True).reset_index()
|
||||
dfCovClassb = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(0).sample(frac=1, replace=True).reset_index()
|
||||
dfCovClassab= pd.concat([dfCovClassa,dfCovClassb])
|
||||
dfCovClassab.reset_index(inplace=True)
|
||||
dfCovClass_train, dfCovClass_test = train_test_split(dfCovClassab, test_size=0.1, random_state=seed, stratify=dfCovClassab['labels_encoded'])
|
||||
'''
|
||||
|
||||
# create training and validation samples
|
||||
dfFakeClass_train, dfFakeClass_test = train_test_split(dfFakeClass, test_size=0.1, random_state=seed, stratify=dfFakeClass['labels_encoded'])
|
||||
|
||||
# reset index and drop unnecessary columns
|
||||
dfFakeClass_train.reset_index(drop=True, inplace=True)
|
||||
dfFakeClass_train.drop(inplace=True, columns=['tweet_proc_length'])
|
||||
dfFakeClass_train.groupby('labels_encoded', group_keys=False)['id'].nunique()
|
||||
|
||||
dfFakeClass_test.reset_index(drop=True, inplace=True)
|
||||
dfFakeClass_test.drop(inplace=True, columns=['tweet_proc_length'])
|
||||
dfFakeClass_test.groupby('labels_encoded', group_keys=False)['id'].nunique()
|
||||
|
||||
# save dfs as csvs and tsvs, for training and validation
|
||||
# covid classification datafiles
|
||||
# rows 0-41 = noncovid, 42-81 covid, therefore:
|
||||
#dfCovClass = dfCovClass.drop(columns=['tweet_proc_length'])
|
||||
#dfCovClass.reset_index(inplace=True, drop=True)
|
||||
#dfCovClass.loc[np.r_[0:31, 42:71], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPathTrain, encoding='utf-8', sep=";")
|
||||
#dfCovClass.loc[np.r_[0:31, 42:72], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathTrain, encoding='utf-8', sep="\t")
|
||||
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPath, encoding='utf-8', sep=";")
|
||||
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathEval, encoding='utf-8', sep="\t")
|
||||
|
||||
# fake news classification datafiles
|
||||
#dfFakeClass = dfFakeClass.drop(columns=['tweet_proc_length'])
|
||||
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPathTrain, encoding='utf-8', sep=";")
|
||||
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathTrain, encoding='utf-8', sep="\t")
|
||||
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPath, encoding='utf-8', sep=";")
|
||||
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathEval, encoding='utf-8', sep="\t")
|
||||
|
||||
#%%
|
||||
# Prepare trainer
|
||||
#from transformers import TrainingArguments
|
||||
|
||||
#training_args = TrainingArguments(
|
||||
# report_to = 'wandb',
|
||||
# output_dir=wd+'results', # output directory/
|
||||
# overwrite_output_dir = True,
|
||||
# num_train_epochs=6, # total number of training epochs
|
||||
# per_device_train_batch_size=8, # batch size per device during training
|
||||
# per_device_eval_batch_size=16, # batch size for evaluation
|
||||
# learning_rate=2e-5,
|
||||
# warmup_steps=1000, # number of warmup steps for learning rate scheduler
|
||||
# weight_decay=0.01, # strength of weight decay
|
||||
# logging_dir='./logs3', # directory for storing logs
|
||||
# logging_steps=1000,
|
||||
# evaluation_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
# load_best_model_at_end=True
|
||||
#)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
from transformers import BertForSequenceClassification, AdamW#, BertConfig
|
||||
#from torch.utils.data import TensorDataset, random_split
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
||||
|
||||
"""
|
||||
train_dataset = load_dataset('csv', data_files={'train': twtCSVtrainCovClassPathTrain}, encoding = "utf-8")
|
||||
train_dataset = train_dataset['train']
|
||||
eval_dataset = load_dataset('csv', data_files={'test': twtCSVtrainCovClassPath}, encoding = "utf-8")
|
||||
eval_dataset = eval_dataset['test']
|
||||
"""
|
||||
batch_size = 1
|
||||
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
class PandasDataset(Dataset):
    """Torch dataset that tokenizes the rows of a pandas DataFrame on access.

    Rows are read by position; each row must provide a 'text' entry and a
    'labels_encoded' entry.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        """Keep references to the frame, the tokenizer and the token limit."""
        self.dataframe, self.tokenizer, self.max_length = dataframe, tokenizer, max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Tokenize the row at position ``index`` and return it as tensors.

        Returns a dict with the keys 'input_ids', 'attention_mask' and
        'labels', each holding a torch tensor.
        """
        record = self.dataframe.iloc[index]
        tweet = record['text']
        target = record['labels_encoded']

        # The tokenizer is asked to pad to max_length and to truncate.
        tokens = self.tokenizer(
            tweet,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )

        sample = {
            name: torch.tensor(tokens[name])
            for name in ('input_ids', 'attention_mask')
        }
        # Assumes the label has already been encoded as a number.
        sample['labels'] = torch.tensor(target)
        return sample
|
||||
|
||||
|
||||
train_dataset = PandasDataset(dfFakeClass_train, tokenizer, max_length)
|
||||
train_dataloader = DataLoader(
|
||||
train_dataset,
|
||||
sampler=RandomSampler(train_dataset),
|
||||
batch_size=batch_size
|
||||
)
|
||||
|
||||
eval_dataset = PandasDataset(dfFakeClass_test, tokenizer, max_length)
|
||||
validation_dataloader = DataLoader(
|
||||
eval_dataset,
|
||||
sampler=SequentialSampler(eval_dataset),
|
||||
batch_size=batch_size
|
||||
)
|
||||
|
||||
for idx, batch in enumerate(train_dataloader):
|
||||
print('Batch index: ', idx)
|
||||
print('Batch size: ', batch['input_ids'].size()) # Access 'input_ids' field
|
||||
print('Batch label: ', batch['labels']) # Access 'labels' field
|
||||
break
|
||||
|
||||
model = BertForSequenceClassification.from_pretrained(
|
||||
model_name,
|
||||
num_labels = 2, # The number of output labels--2 for binary classification.
|
||||
# You can increase this for multi-class tasks.
|
||||
output_attentions = False, # Whether the model returns attentions weights.
|
||||
output_hidden_states = False, # Whether the model returns all hidden-states.
|
||||
)
|
||||
|
||||
#trainer = Trainer(
|
||||
# model=model, # the instantiated 🤗 Transformers model to be trained
|
||||
# args=training_args, # training arguments, defined above
|
||||
# train_dataset=train_dataset, # training dataset
|
||||
# eval_dataset=eval_dataset # evaluation dataset
|
||||
#)
|
||||
|
||||
|
||||
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
|
||||
# I believe the 'W' stands for "Weight Decay fix".
|
||||
optimizer = AdamW(model.parameters(),
|
||||
lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
|
||||
eps = 1e-8 # args.adam_epsilon - default is 1e-8.
|
||||
)
|
||||
|
||||
from transformers import get_linear_schedule_with_warmup
|
||||
|
||||
# Number of training epochs. The BERT authors recommend between 2 and 4.
|
||||
# We chose to run for 6
|
||||
epochs = 6
|
||||
|
||||
# Total number of training steps is [number of batches] x [number of epochs].
|
||||
# (Note that this is not the same as the number of training samples).
|
||||
total_steps = len(train_dataloader) * epochs
|
||||
|
||||
# Create the learning rate scheduler.
|
||||
scheduler = get_linear_schedule_with_warmup(optimizer,
|
||||
num_warmup_steps = 0, # Default value in run_glue.py
|
||||
num_training_steps = total_steps)
|
||||
|
||||
# Function to calculate the accuracy of our predictions vs labels
|
||||
def flat_accuracy(preds, labels):
    """Return the fraction of predictions that match their labels.

    The predicted class of each row is the argmax of ``preds`` along
    axis 1; ``labels`` is flattened before the comparison, so it must
    provide ``.flatten()`` (e.g. a numpy array).
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)
|
||||
|
||||
import time
|
||||
import datetime
|
||||
|
||||
def format_time(elapsed):
    """Format a duration given in seconds as an h:mm:ss string."""
    # Round to whole seconds first so the result shows no fractional part.
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    # The string form of a timedelta is h:mm:ss.
    return str(duration)
|
||||
|
||||
import random
|
||||
|
||||
# This training code is based on the `run_glue.py` script here:
|
||||
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
|
||||
|
||||
# Set the seed value all over the place to make this reproducible.
|
||||
seed_val = 12355
|
||||
|
||||
# If there's a GPU available...
|
||||
# Training is pinned to the CPU, as it was before through an unconditional
# reassignment of `device` after the GPU check. Making the pin an explicit
# flag keeps the printed message in agreement with the device that is
# actually used. Set force_cpu to False to train on a detected GPU.
force_cpu = True

if force_cpu:
    print('force_cpu is set, using the CPU.')
    device = torch.device("cpu")
elif torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
|
||||
|
||||
random.seed(seed_val)
|
||||
np.random.seed(seed_val)
|
||||
torch.manual_seed(seed_val)
|
||||
torch.cuda.manual_seed_all(seed_val)
|
||||
|
||||
#%%
|
||||
# Start training
|
||||
# We'll store a number of quantities such as training and validation loss,
|
||||
# validation accuracy, and timings.
|
||||
training_stats = []
|
||||
|
||||
# Measure the total training time for the whole run.
|
||||
total_t0 = time.time()
|
||||
|
||||
# For each epoch...
|
||||
for epoch_i in range(0, epochs):
|
||||
# ========================================
|
||||
# Training
|
||||
# ========================================
|
||||
|
||||
# Perform one full pass over the training set.
|
||||
|
||||
print("")
|
||||
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
|
||||
print('{:>5,} steps per batch will be calculated.'.format(len(train_dataloader)))
|
||||
print('Training...')
|
||||
|
||||
# Measure how long the training epoch takes.
|
||||
t0 = time.time()
|
||||
model.to(device)
|
||||
# Reset the total loss for this epoch.
|
||||
total_train_loss = 0
|
||||
# Put the model into training mode. Don't be misled--the call to
|
||||
# `train` just changes the *mode*, it doesn't *perform* the training.
|
||||
# `dropout` and `batchnorm` layers behave differently during training
|
||||
# vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
|
||||
model.train()
|
||||
|
||||
# For each batch of training data...
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
|
||||
# Progress update every 10 batches.
|
||||
if step % 10 == 0 and not step == 0:
|
||||
# Calculate elapsed time as an h:mm:ss string.
|
||||
elapsed = format_time(time.time() - t0)
|
||||
|
||||
# Report progress.
|
||||
print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
|
||||
|
||||
# Unpack this training batch from our dataloader.
|
||||
#
|
||||
# As we unpack the batch, we'll also copy each tensor to the GPU using the
|
||||
# `to` method.
|
||||
#
|
||||
# `batch` is a dict holding three pytorch tensors:
|
||||
# 'input_ids': input ids
|
||||
# 'attention_mask': attention masks
|
||||
# 'labels': labels
|
||||
print("Batch keys:", batch.keys())
|
||||
b_input_ids = batch['input_ids'].to(device)
|
||||
b_input_mask = batch['attention_mask'].to(device)
|
||||
b_labels = batch['labels'].to(device)
|
||||
|
||||
# Always clear any previously calculated gradients before performing a
|
||||
# backward pass. PyTorch doesn't do this automatically because
|
||||
# accumulating the gradients is "convenient while training RNNs".
|
||||
# (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
|
||||
model.zero_grad()
|
||||
|
||||
# Perform a forward pass (evaluate the model on this training batch).
|
||||
# The documentation for this `model` function is here:
|
||||
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
|
||||
# It returns different numbers of parameters depending on what arguments
|
||||
# are given and what flags are set. For our usage here, it returns
|
||||
# the loss (because we provided labels) and the "logits"--the model
|
||||
# outputs prior to activation.
|
||||
output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
|
||||
loss = output[0]
|
||||
logits = output[1]
|
||||
|
||||
# Accumulate the training loss over all of the batches so that we can
|
||||
# calculate the average loss at the end. `loss` is a Tensor containing a
|
||||
# single value; the `.item()` function just returns the Python value
|
||||
# from the tensor.
|
||||
total_train_loss += loss.item()
|
||||
|
||||
# Perform a backward pass to calculate the gradients.
|
||||
loss.backward()
|
||||
|
||||
# Clip the norm of the gradients to 1.0.
|
||||
# This is to help prevent the "exploding gradients" problem.
|
||||
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
|
||||
|
||||
# Update parameters and take a step using the computed gradient.
|
||||
# The optimizer dictates the "update rule"--how the parameters are
|
||||
# modified based on their gradients, the learning rate, etc.
|
||||
optimizer.step()
|
||||
|
||||
# Update the learning rate.
|
||||
scheduler.step()
|
||||
|
||||
# Calculate the average loss over all of the batches.
|
||||
avg_train_loss = total_train_loss / len(train_dataloader)
|
||||
|
||||
# Measure how long this epoch took.
|
||||
training_time = format_time(time.time() - t0)
|
||||
|
||||
print("")
|
||||
print(" Average training loss: {0:.2f}".format(avg_train_loss))
|
||||
print(" Training epcoh took: {:}".format(training_time))
|
||||
|
||||
# ========================================
|
||||
# Validation
|
||||
# ========================================
|
||||
# After the completion of each training epoch, measure our performance on
|
||||
# our validation set.
|
||||
|
||||
print("")
|
||||
print("Running Validation...")
|
||||
|
||||
t0 = time.time()
|
||||
|
||||
# Put the model in evaluation mode--the dropout layers behave differently
|
||||
# during evaluation.
|
||||
model.eval()
|
||||
|
||||
# Tracking variables
|
||||
total_eval_accuracy = 0
|
||||
total_eval_loss = 0
|
||||
nb_eval_steps = 0
|
||||
|
||||
# Evaluate data for one epoch
|
||||
for batch in validation_dataloader:
|
||||
|
||||
# Unpack this validation batch from our dataloader.
|
||||
#
|
||||
# As we unpack the batch, we'll also copy each tensor to the GPU using
|
||||
# the `to` method.
|
||||
#
|
||||
# `batch` is a dict holding three pytorch tensors:
|
||||
# 'input_ids': input ids
|
||||
# 'attention_mask': attention masks
|
||||
# 'labels': labels
|
||||
b_input_ids = batch['input_ids'].to(device)
|
||||
b_input_mask = batch['attention_mask'].to(device)
|
||||
b_labels = batch['labels'].to(device)
|
||||
|
||||
# Tell pytorch not to bother with constructing the compute graph during
|
||||
# the forward pass, since this is only needed for backprop (training).
|
||||
with torch.no_grad():
|
||||
|
||||
# Forward pass, calculate logit predictions.
|
||||
# token_type_ids is the same as the "segment ids", which
|
||||
# differentiates sentence 1 and 2 in 2-sentence tasks.
|
||||
# The documentation for this `model` function is here:
|
||||
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
|
||||
# Get the "logits" output by the model. The "logits" are the output
|
||||
# values prior to applying an activation function like the softmax.
|
||||
output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
|
||||
loss = output[0]
|
||||
logits = output[1]
|
||||
|
||||
# Accumulate the validation loss.
|
||||
total_eval_loss += loss.item()
|
||||
|
||||
# Move logits and labels to CPU
|
||||
logits = logits.detach().cpu().numpy()
|
||||
label_ids = b_labels.to('cpu').numpy()
|
||||
|
||||
# Calculate the accuracy for this batch of test sentences, and
|
||||
# accumulate it over all batches.
|
||||
total_eval_accuracy += flat_accuracy(logits, label_ids)
|
||||
|
||||
|
||||
# Report the final accuracy for this validation run.
|
||||
avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
|
||||
print(" Accuracy: {0:.2f}".format(avg_val_accuracy))
|
||||
|
||||
# Calculate the average loss over all of the batches.
|
||||
avg_val_loss = total_eval_loss / len(validation_dataloader)
|
||||
|
||||
# Measure how long the validation run took.
|
||||
validation_time = format_time(time.time() - t0)
|
||||
|
||||
print(" Validation Loss: {0:.2f}".format(avg_val_loss))
|
||||
print(" Validation took: {:}".format(validation_time))
|
||||
|
||||
# Record all statistics from this epoch.
|
||||
training_stats.append(
|
||||
{
|
||||
'epoch': epoch_i + 1,
|
||||
'Training Loss': avg_train_loss,
|
||||
'Valid. Loss': avg_val_loss,
|
||||
'Valid. Accur.': avg_val_accuracy,
|
||||
'Training Time': training_time,
|
||||
'Validation Time': validation_time
|
||||
}
|
||||
)
|
||||
|
||||
print("")
|
||||
print("Training complete!")
|
||||
|
||||
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
|
||||
|
||||
params = list(model.named_parameters())
|
||||
|
||||
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
|
||||
|
||||
print('==== Embedding Layer ====\n')
|
||||
|
||||
for p in params[0:5]:
|
||||
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
|
||||
|
||||
print('\n==== First Transformer ====\n')
|
||||
|
||||
for p in params[5:21]:
|
||||
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
|
||||
|
||||
print('\n==== Output Layer ====\n')
|
||||
|
||||
for p in params[-4:]:
|
||||
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
|
||||
|
||||
|
||||
import os
|
||||
|
||||
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
|
||||
from datetime import datetime as dt
|
||||
|
||||
fTimeFormat = "%Y-%m-%d_%H-%M-%S"
|
||||
now = dt.now().strftime(fTimeFormat)
|
||||
|
||||
output_dir = modFakeClassPath + now + "/"
|
||||
|
||||
# Create output directory if needed
|
||||
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that another process could invalidate in between.
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
print("Saving model to %s" % output_dir)
|
||||
|
||||
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||
# They can then be reloaded using `from_pretrained()`
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
tokenizer.save_pretrained(output_dir)
|
||||
|
||||
# Good practice: save your training arguments together with the trained model
|
||||
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
||||
|
||||
import pandas as pd
|
||||
|
||||
# Display floats with two decimal places.
|
||||
pd.set_option('display.precision', 2)
|
||||
|
||||
# Create a DataFrame from our training statistics.
|
||||
df_stats = pd.DataFrame(data=training_stats)
|
||||
|
||||
# Use the 'epoch' as the row index.
|
||||
df_stats = df_stats.set_index('epoch')
|
||||
|
||||
# A hack to force the column headers to wrap.
|
||||
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])
|
||||
|
||||
|
||||
# Display the table.
|
||||
df_stats
|
||||
df_stats.to_csv(output_dir + now + ".csv")
|
607
trainTopic.py
Normal file
607
trainTopic.py
Normal file
@ -0,0 +1,607 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat Aug 12 12:25:18 2023
|
||||
|
||||
@author: michael
|
||||
"""
|
||||
#from datasets import load_dataset
|
||||
#from transformers import Trainer
|
||||
#from transformers import AutoModelForSequenceClassification
|
||||
from transformers import AutoTokenizer
|
||||
import torch
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split # pip install scikit-learn
|
||||
|
||||
import pandas as pd
|
||||
|
||||
## Uses snippets from this guide:
|
||||
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
|
||||
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
import sys
|
||||
funs = wd+"funs"
|
||||
sys.path.insert(1, funs)
|
||||
import CleanTweets
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Training CSV dataset
|
||||
twtCSV = "SenatorsTweets-Training_WORKING-COPY-correct2"
|
||||
twtCSVtrainCovClass = "SenatorsTweets-train-CovClassification"
|
||||
twtCSVtrainFakeClass = "SenatorsTweets-train-FakeClassification"
|
||||
statsTrainingTopicClass = "statsTopicClassification-"
|
||||
|
||||
# don't change this one
|
||||
twtCSVPath = wd + ud + twtCSV + ".csv"
|
||||
twtCSVtrainCovClassPath = wd + ud + twtCSVtrainCovClass + ".csv"
|
||||
twtCSVtrainFakeClassPath = wd + ud + twtCSVtrainFakeClass + ".csv"
|
||||
|
||||
statsTrainingTopicClassPath = wd + ud + statsTrainingTopicClass
|
||||
|
||||
twtCSVtrainCovClassPathTrain = wd + ud + twtCSVtrainCovClass + "TRAIN.csv"
|
||||
twtCSVtrainFakeClassPathTrain = wd + ud + twtCSVtrainFakeClass + "TRAIN.csv"
|
||||
twtTSVtrainCovClassPathTrain = wd + ud + "cov-train.tsv"
|
||||
twtTSVtrainFakeClassPathTrain = wd + ud + "fake-train.tsv"
|
||||
|
||||
twtTSVtrainCovClassPathEval = wd + ud + "cov-eval.tsv"
|
||||
twtTSVtrainFakeClassPathEval = wd + ud + "fake-eval.tsv"
|
||||
|
||||
seed = 12355
|
||||
|
||||
# Model paths
|
||||
modCovClassPath = wd + "models/CovClass/"
|
||||
modFakeClassPath = wd + "models/FakeClass/"
|
||||
|
||||
model_name = "bvrau/covid-twitter-bert-v2-struth"
|
||||
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
|
||||
|
||||
# More models for fake detection:
|
||||
# https://huggingface.co/justinqbui/bertweet-covid-vaccine-tweets-finetuned
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
|
||||
max_length = 64 # max token sentence length
|
||||
|
||||
#%%
|
||||
# Create training and testing dataset
|
||||
dfTest = pd.read_csv(twtCSVPath, dtype=(object), delimiter=";")
|
||||
|
||||
#dfTest = dfTest[:-900] # remove last 800 rows
|
||||
#dfTest = dfTest.iloc[:,:-3] # remove last 800 rows
|
||||
|
||||
dfTest['text'] = dfTest['rawContent'].apply(CleanTweets.preprocess_roberta)
|
||||
|
||||
dfTest.drop(columns=['rawContent'], inplace=True)
|
||||
|
||||
# Only keep tweets that are longer than 3 words
|
||||
dfTest['tweet_proc_length'] = [len(text.split(' ')) for text in dfTest['text']]
|
||||
dfTest['tweet_proc_length'].value_counts()
|
||||
dfTest = dfTest[dfTest['tweet_proc_length']>3]
|
||||
dfTest = dfTest.drop_duplicates(subset=['text'])
|
||||
dfTest = dfTest.drop(columns=['date', 'Unnamed: 0'])
|
||||
|
||||
# Create datasets for each classification
|
||||
dfCovClass = dfTest
|
||||
dfFakeClass = dfTest
|
||||
dfCovClass = dfCovClass.drop(columns=['fake']) # fake column not neeeded in covid topic classification data
|
||||
dfFakeClass = dfFakeClass[dfFakeClass['topicCovid']=='True'].drop(columns=['topicCovid']) # topicCovid column not neeeded in covid topic classification data
|
||||
|
||||
#type_map = {'Covid tweet': 'covid tweets', 'Noncovid tweet': 'noncovid tweet'}
|
||||
dfCovClass.rename(index = str, columns={'topicCovid': 'labels', 'tid': 'id'}, inplace = True)
|
||||
dfCovClass.labels = dfCovClass.labels.replace({"True": 'Covid', "False": 'NonCovid'})
|
||||
|
||||
#type_map = {'fake news tweet': 'fake news tweet', 'non-fake-news-tweet': 'non-fake-news-tweet'}
|
||||
dfFakeClass.rename(index = str, columns={'fake': 'labels', 'tid': 'id'}, inplace = True)
|
||||
dfFakeClass.labels = dfFakeClass.labels.replace({"True": 'Fake', "False": 'True'})
|
||||
|
||||
#%%
|
||||
# Tokenize tweets
|
||||
dfCovClass = dfCovClass[dfCovClass['labels'].notna()]
|
||||
dfFakeClass = dfFakeClass[dfFakeClass['labels'].notna()]
|
||||
dfCovClass['input_ids'] = dfCovClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
|
||||
dfFakeClass['input_ids'] = dfFakeClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
|
||||
|
||||
def encode_labels(label):
    """Return the integer class for a label string.

    'Covid' and 'Fake' belong to class 1; 'NonCovid', 'True' and any
    unrecognised value belong to class 0.
    """
    is_positive = label == 'Covid' or label == 'Fake'
    return 1 if is_positive else 0
|
||||
dfCovClass['labels_encoded'] = dfCovClass['labels'].apply(encode_labels)
|
||||
dfFakeClass['labels_encoded'] = dfFakeClass['labels'].apply(encode_labels)
|
||||
|
||||
# get n of classes
|
||||
print("# of Non-Covid tweets (coded 0):")
|
||||
print(dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
|
||||
# 62 non-covid tweets, disproportionate sample for training has to be 124 tweets
|
||||
|
||||
print("# of Fake-news tweets (coded 1):")
|
||||
print(dfFakeClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
|
||||
|
||||
# create disproportionate sample - 50/50 of both
|
||||
#dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique()
|
||||
#dfCovClass = dfCovClass.groupby('labels_encoded', group_keys=False).apply(lambda x: x.sample(164, random_state=seed))
|
||||
# after a lot of tests, it seems that a sample in which non-fake news tweets are overrepresented leads to better results.
|
||||
# because of this, performance limitations and time constraints, group 1 (covid topic) will be overrepresented (twice as many), which still doesn't reflect the real proportions ~10/1
|
||||
|
||||
'''dfCovClassa = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(1).sample(frac=1, replace=True).reset_index()
|
||||
dfCovClassb = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(0).sample(frac=1, replace=True).reset_index()
|
||||
dfCovClassab= pd.concat([dfCovClassa,dfCovClassb])
|
||||
dfCovClassab.reset_index(inplace=True)
|
||||
dfCovClass_train, dfCovClass_test = train_test_split(dfCovClassab, test_size=0.1, random_state=seed, stratify=dfCovClassab['labels_encoded'])
|
||||
'''
|
||||
|
||||
# create training and validation samples
|
||||
dfCovClass_train, dfCovClass_test = train_test_split(dfCovClass, test_size=0.1, random_state=seed, stratify=dfCovClass['labels_encoded'])
|
||||
|
||||
# reset index and drop unnecessary columns
|
||||
dfCovClass_train.reset_index(drop=True, inplace=True)
|
||||
dfCovClass_train.drop(inplace=True, columns=['tweet_proc_length'])
|
||||
dfCovClass_train.groupby('labels_encoded', group_keys=False)['id'].nunique()
|
||||
|
||||
dfCovClass_test.reset_index(drop=True, inplace=True)
|
||||
dfCovClass_test.drop(inplace=True, columns=['tweet_proc_length'])
|
||||
dfCovClass_test.groupby('labels_encoded', group_keys=False)['id'].nunique()
|
||||
|
||||
# save dfs as csvs and tsvs, for training and validation
|
||||
# covid classification datafiles
|
||||
# rows 0-41 = noncovid, 42-81 covid, therefore:
|
||||
#dfCovClass = dfCovClass.drop(columns=['tweet_proc_length'])
|
||||
#dfCovClass.reset_index(inplace=True, drop=True)
|
||||
#dfCovClass.loc[np.r_[0:31, 42:71], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPathTrain, encoding='utf-8', sep=";")
|
||||
#dfCovClass.loc[np.r_[0:31, 42:72], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathTrain, encoding='utf-8', sep="\t")
|
||||
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPath, encoding='utf-8', sep=";")
|
||||
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathEval, encoding='utf-8', sep="\t")
|
||||
|
||||
# fake news classification datafiles
|
||||
#dfFakeClass = dfFakeClass.drop(columns=['tweet_proc_length'])
|
||||
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPathTrain, encoding='utf-8', sep=";")
|
||||
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathTrain, encoding='utf-8', sep="\t")
|
||||
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPath, encoding='utf-8', sep=";")
|
||||
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathEval, encoding='utf-8', sep="\t")
|
||||
|
||||
#%%
|
||||
# Prepare trainer
|
||||
#from transformers import TrainingArguments
|
||||
|
||||
#training_args = TrainingArguments(
|
||||
# report_to = 'wandb',
|
||||
# output_dir=wd+'results', # output directory/
|
||||
# overwrite_output_dir = True,
|
||||
# num_train_epochs=6, # total number of training epochs
|
||||
# per_device_train_batch_size=8, # batch size per device during training
|
||||
# per_device_eval_batch_size=16, # batch size for evaluation
|
||||
# learning_rate=2e-5,
|
||||
# warmup_steps=1000, # number of warmup steps for learning rate scheduler
|
||||
# weight_decay=0.01, # strength of weight decay
|
||||
# logging_dir='./logs3', # directory for storing logs
|
||||
# logging_steps=1000,
|
||||
# evaluation_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
# load_best_model_at_end=True
|
||||
#)
|
||||
|
||||
# Load the tokenizer that matches the checkpoint named by `model_name`
# (`model_name` is set outside this section).
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
from transformers import BertForSequenceClassification, AdamW#, BertConfig
|
||||
#from torch.utils.data import TensorDataset, random_split
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
||||
|
||||
"""
|
||||
train_dataset = load_dataset('csv', data_files={'train': twtCSVtrainCovClassPathTrain}, encoding = "utf-8")
|
||||
train_dataset = train_dataset['train']
|
||||
eval_dataset = load_dataset('csv', data_files={'test': twtCSVtrainCovClassPath}, encoding = "utf-8")
|
||||
eval_dataset = eval_dataset['test']
|
||||
"""
|
||||
# Number of samples per batch; passed to both DataLoaders created below.
batch_size = 1
|
||||
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
class PandasDataset(Dataset):
    """Dataset view over a DataFrame that tokenizes one row per lookup.

    Every item is a dict holding three tensors: ``input_ids`` and
    ``attention_mask`` (taken from the tokenizer output for the row's
    ``text`` column) and ``labels`` (the row's ``labels_encoded`` value).
    """

    def __init__(self, dataframe, tokenizer, max_length):
        # Only references are stored; tokenization happens in __getitem__.
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Tokenize the row at positional *index* and return it as tensors."""
        record = self.dataframe.iloc[index]
        tweet_text = record['text']
        label = record['labels_encoded']

        # Pad/truncate to exactly `max_length` tokens so that all items have
        # the same length.
        tokens = self.tokenizer(
            tweet_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )

        item = {
            field: torch.tensor(tokens[field])
            for field in ('input_ids', 'attention_mask')
        }
        # The label is used as stored in the frame (assumed already encoded).
        item['labels'] = torch.tensor(label)
        return item
|
||||
|
||||
|
||||
# Wrap both splits so that rows are tokenized on demand.
train_dataset = PandasDataset(dfCovClass_train, tokenizer, max_length)
eval_dataset = PandasDataset(dfCovClass_test, tokenizer, max_length)

# Training batches are drawn in random order ...
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size,
)

# ... validation batches in their stored order.
validation_dataloader = DataLoader(
    eval_dataset,
    sampler=SequentialSampler(eval_dataset),
    batch_size=batch_size,
)

# Smoke test: fetch only the first training batch (if there is one) and show
# its shape and label.
first_batch = next(iter(train_dataloader), None)
if first_batch is not None:
    print('Batch index: ', 0)
    print('Batch size: ', first_batch['input_ids'].size())  # Access 'input_ids' field
    print('Batch label: ', first_batch['labels'])  # Access 'labels' field
|
||||
|
||||
# Sequence classifier initialised from the `model_name` checkpoint.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,                # two output labels (binary classification);
                                 # raise this for multi-class tasks
    output_attentions=False,     # do not return attention weights
    output_hidden_states=False,  # do not return all hidden states
)

#trainer = Trainer(
#    model=model,                         # the instantiated 🤗 Transformers model to be trained
#    args=training_args,                  # training arguments, defined above
#    train_dataset=train_dataset,         # training dataset
#    eval_dataset=eval_dataset            # evaluation dataset
#)

# AdamW here is the class imported from `transformers`, not the one in
# `torch.optim`. The "W" refers to the decoupled weight-decay variant of Adam.
# NOTE(review): later transformers releases deprecate this class in favour of
# torch.optim.AdamW -- confirm against the installed version.
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
    eps=1e-8,  # args.adam_epsilon - default is 1e-8
)

from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4;
# this script runs 6.
epochs = 6

# One optimizer step is taken per batch, so the total number of steps is
# [batches per epoch] x [epochs] (not the number of training samples).
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule without a warm-up phase
# (0 warm-up steps is the default value in run_glue.py).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)
|
||||
|
||||
# Function to calculate the accuracy of our predictions vs labels
|
||||
def flat_accuracy(preds, labels):
    """Return the share of predictions that match their label.

    preds  -- 2-D array of scores, one row per sample; the predicted class
              is the column index of the largest score in the row.
    labels -- array of true class ids, flattened before comparison.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    hits = np.sum(predicted == expected)
    return hits / len(expected)
|
||||
|
||||
import time
|
||||
import datetime
|
||||
|
||||
def format_time(elapsed):
    '''
    Convert a duration given in seconds into an "h:mm:ss" string.

    The value is rounded to whole seconds first; formatting is delegated to
    datetime.timedelta.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)
|
||||
|
||||
import random
|
||||
|
||||
# This training code is based on the `run_glue.py` script here:
|
||||
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
|
||||
|
||||
# One fixed seed for every random number generator used below, so that runs
# are reproducible.
seed_val = 12355

# Report which hardware is visible to PyTorch.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    #model.cuda()

# NOTE(review): this unconditionally overrides the choice made above, so the
# script always runs on the CPU even when a GPU was reported -- confirm that
# this is intentional.
device = torch.device("cpu")

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) identically.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)
|
||||
|
||||
#%%
|
||||
# Start training
# `training_stats` collects one record per epoch: training and validation
# loss, validation accuracy, and timings.
training_stats = []

# Wall-clock start of the whole run.
total_t0 = time.time()

# Moving the model to the target device does not depend on the epoch, so it
# is done once before the loop.
model.to(device)

# For each epoch...
for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('{:>5,} steps per batch will be calculated.'.format(len(train_dataloader)))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # `train()` only switches the *mode* (dropout etc. behave differently in
    # training vs. evaluation); it does not perform any training itself.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 10 batches.
        if step % 10 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` is a dict with the keys 'input_ids', 'attention_mask' and
        # 'labels'; copy each tensor to the selected device.
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)

        # Clear gradients left over from the previous step; PyTorch
        # accumulates them otherwise.
        model.zero_grad()

        # Forward pass. Because labels are supplied, the first element of
        # the output is the loss (the second would be the logits, which are
        # not needed during training).
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = output[0]

        # `loss` is a tensor holding a single value; `.item()` extracts it
        # as a plain Python number for the running total.
        total_train_loss += loss.item()

        # Backward pass to calculate the gradients.
        loss.backward()

        # Clip the gradient norm to 1.0 to help against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

    # Average loss over all training batches of this epoch.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print(" Average training loss: {0:.2f}".format(avg_train_loss))
    print(" Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After each training epoch, measure performance on the validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode: dropout layers behave differently than in training.
    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Same dict layout as the training batches.
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)

        # No gradients are needed for evaluation, so skip building the
        # compute graph during the forward pass.
        with torch.no_grad():
            # The "logits" are the model outputs prior to applying an
            # activation function such as the softmax.
            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = output[0]
            logits = output[1]

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Convert logits and labels to NumPy arrays on the CPU.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accuracy of this batch, accumulated over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print(" Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Average loss over all validation batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print(" Validation Loss: {0:.2f}".format(avg_val_loss))
    print(" Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
|
||||
|
||||
# Print name and shape for three slices of the model's named parameters.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# (heading, slice of `params`) pairs, printed in this order.
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for heading, entries in sections:
    print(heading)
    for name, tensor in entries:
        shape_text = str(tuple(tensor.size()))
        print("{:<55} {:>12}".format(name, shape_text))
|
||||
|
||||
|
||||
import os

# Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
from datetime import datetime as dt

# Each run is written to its own sub-directory named after the current
# timestamp, below `modCovClassPath`.
fTimeFormat = "%Y-%m-%d_%H-%M-%S"
now = dt.now().strftime(fTimeFormat)

output_dir = modCovClassPath + now + "/"

# Create the output directory if needed. `exist_ok=True` replaces the
# separate existence check, so there is no gap between checking and creating.
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# Save the trained model, its configuration and the tokenizer with
# `save_pretrained()`; they can be reloaded with `from_pretrained()`.
# A wrapped model (distributed/parallel training) exposes the real model as
# `.module`, so unwrap it first.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
|
||||
|
||||
# Good practice: save your training arguments together with the trained model
|
||||
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
||||
|
||||
import pandas as pd

# Display floats with two decimal places.
pd.set_option('display.precision', 2)

# Build a table from the per-epoch records and use the 'epoch' column as the
# row index.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table (bare expression: only shows output when the cell is run
# interactively).
df_stats

# Store the table next to the saved model, named after the run timestamp.
df_stats.to_csv(output_dir + now + ".csv")
|
Loading…
x
Reference in New Issue
Block a user