Compare commits

40 Commits

| SHA1 |
|---|
| 89b4755c65 |
| 01e58b1b99 |
| d0fcefedf4 |
| 71cf907249 |
| a9018fedee |
| d94a93295f |
| 80b63b39df |
| d8136909c8 |
| 1c6d9d5415 |
| 4e08cde317 |
| 2535683cdc |
| 8f744a08be |
| df5fd51a5f |
| 3d4f559d2d |
| 2e067b6a64 |
| 7a16526a97 |
| b89b5969ec |
| 7c6b618272 |
| 90aa58239c |
| 1beff96ae9 |
| 881d3d6d6d |
| 5a63c478e9 |
| ed61d52182 |
| a26d150060 |
| d791e4a293 |
| d57b7a31b7 |
| 13d80124d3 |
| 3de6d8f3ec |
| 899a99ba72 |
| 817ec48478 |
| c64904a64d |
| 82830f13e2 |
| 8c8a191952 |
| 71e10a62d3 |
| 90d5501ec8 |
| 340cca017c |
| 791cebc297 |
| 6241484e83 |
| d73da8db98 |
| 6220c1841d |
4 .gitignore (vendored)
@@ -1,7 +1,9 @@
|
||||
**/log*
|
||||
**/*.log
|
||||
**/*lock*
|
||||
**/*-slice*.csv
|
||||
**/*.zip
|
||||
**/*.html
|
||||
**/*.htm
|
||||
/ALL-SENATORS-LONG.csv
|
||||
/ALL-SENATORS.csv
|
||||
/collect2.py
|
||||
|
1 .vscode/.gitignore (vendored, new file)
@@ -0,0 +1 @@
|
||||
/settings.json
|
123 ClassificationFake.py (new file)
@@ -0,0 +1,123 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
||||
from datasets import load_dataset
|
||||
from transformers.pipelines.pt_utils import KeyDataset
|
||||
|
||||
|
||||
#%%
|
||||
# prepare & define paths
|
||||
# install xformers (pip install xformers) for better performance
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Name of file that all senator data will be written to
|
||||
senCSV = "Tweets-Classified-Topic-Results.csv"
|
||||
|
||||
# Name of Classify datafile
|
||||
senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"
|
||||
senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"
|
||||
|
||||
# don't change this one
|
||||
senCSVPath = wd + ud + senCSV
|
||||
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
|
||||
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
|
||||
|
||||
import sys
|
||||
funs = wd+"funs"
|
||||
sys.path.insert(1, funs)
|
||||
import CleanTweets
|
||||
|
||||
|
||||
#%%
|
||||
# get dataframe
|
||||
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
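# Swap the still-inverted topic labels written by ClassificationTopic.py (see its "#still wrong, will be corrected in ClassificationFake.py" note).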
|
||||
def encode_labels(label):
|
||||
if label == 'True':
|
||||
return 'False'
|
||||
elif label == 'False':
|
||||
return 'True'
|
||||
return 0
|
||||
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
|
||||
dfClassify.to_csv("/home/michael/Documents/PS/Data/collectTweets/data/OUT/Tweets-Classified-Topic-Results.csv", encoding='utf-8')
|
||||
|
||||
dfClassify = dfClassify[dfClassify['output_label_topicCov']=='True']
|
||||
|
||||
# dataframe from csv
|
||||
dfClassify['fake'] = False
|
||||
|
||||
|
||||
#%%
|
||||
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
|
||||
# HowTo:
|
||||
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
|
||||
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
|
||||
pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
|
||||
model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
|
||||
tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
|
||||
|
||||
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
|
||||
|
||||
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
|
||||
|
||||
|
||||
#%%
|
||||
# remove empty rows
|
||||
dfClassify.cleanContent.replace('',np.nan,inplace=True)
|
||||
dfClassify.dropna(subset=['cleanContent'], inplace=True)
|
||||
|
||||
#%%
|
||||
timeStart = datetime.now() # start counting execution time
|
||||
|
||||
max_length = 128
|
||||
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
|
||||
#train.rename(columns={'target': 'labels'}, inplace=True)
|
||||
#train.head()
|
||||
|
||||
# %%
|
||||
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
|
||||
|
||||
#%%
|
||||
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
|
||||
|
||||
# %%from datetime import datetime
|
||||
|
||||
#from tqdm.auto import tqdm
|
||||
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
|
||||
# print(out)
|
||||
|
||||
#%%
|
||||
output_labels = []
|
||||
output_score = []
|
||||
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
|
||||
output_labels.append(out['label'])
|
||||
output_score.append(out['score'])
|
||||
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
|
||||
# Exactly the same output as before, but the content is passed
|
||||
# as batches to the model
|
||||
# %%
|
||||
dfClassify['output_label_fake'] = output_labels
|
||||
dfClassify['output_score_fake'] = output_score
|
||||
|
||||
timeEnd = datetime.now()
|
||||
timeTotal = timeEnd - timeStart
|
||||
timePerTweet = timeTotal / 96
|
||||
|
||||
print(f"Total classification execution time: {timeTotal} seconds")
|
||||
print(f"Time per tweet classification: {timePerTweet}")
|
||||
|
||||
# %%
|
||||
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
|
||||
|
||||
# %%
|
123 ClassificationTopic.py (new file)
@@ -0,0 +1,123 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
||||
from datasets import load_dataset
|
||||
from transformers.pipelines.pt_utils import KeyDataset
|
||||
|
||||
|
||||
#%%
|
||||
# prepare & define paths
|
||||
# install xformers (pip install xformers) for better performance
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Name of file that all senator data will be written to
|
||||
senCSV = "SenatorsTweets-OnlyCov.csv"
|
||||
|
||||
# Name of Classify datafile
|
||||
senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv"
|
||||
senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv"
|
||||
|
||||
# don't change this one
|
||||
senCSVPath = wd + ud + senCSV
|
||||
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
|
||||
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
|
||||
|
||||
import sys
|
||||
funs = wd+"funs"
|
||||
sys.path.insert(1, funs)
|
||||
import CleanTweets
|
||||
|
||||
|
||||
#%%
|
||||
# get dataframe
|
||||
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
|
||||
|
||||
# dataframe from csv
|
||||
dfClassify['fake'] = False
|
||||
|
||||
|
||||
#%%
|
||||
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
|
||||
# HowTo:
|
||||
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
|
||||
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
|
||||
pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
|
||||
model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
|
||||
tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
|
||||
|
||||
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
|
||||
|
||||
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
|
||||
|
||||
#%%
|
||||
# remove empty rows
|
||||
dfClassify.cleanContent.replace('',np.nan,inplace=True)
|
||||
dfClassify.dropna(subset=['cleanContent'], inplace=True)
|
||||
|
||||
#%%
|
||||
timeStart = datetime.now() # start counting execution time
|
||||
|
||||
max_length = 128
|
||||
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
|
||||
#train.rename(columns={'target': 'labels'}, inplace=True)
|
||||
#train.head()
|
||||
|
||||
# %%
|
||||
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
|
||||
|
||||
#%%
|
||||
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
|
||||
|
||||
# %%from datetime import datetime
|
||||
|
||||
#from tqdm.auto import tqdm
|
||||
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
|
||||
# print(out)
|
||||
|
||||
#%%
|
||||
output_labels = []
|
||||
output_score = []
|
||||
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
|
||||
output_labels.append(out['label'])
|
||||
output_score.append(out['score'])
|
||||
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
|
||||
# Exactly the same output as before, but the content is passed
|
||||
# as batches to the model
|
||||
# %%
|
||||
dfClassify['output_label_topicCov'] = output_labels
|
||||
dfClassify['output_score_topicCov'] = output_score
|
||||
|
||||
timeEnd = datetime.now()
|
||||
timeTotal = timeEnd - timeStart
|
||||
timePerTweet = timeTotal / 96
|
||||
|
||||
print(f"Total classification execution time: {timeTotal} seconds")
|
||||
print(f"Time per tweet classification: {timePerTweet}")
|
||||
|
||||
# %%
|
||||
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
|
||||
|
||||
# %%
|
||||
## corrections
|
||||
def encode_labels(label):
|
||||
if label == 'real':
|
||||
return 'True'
|
||||
elif label == 'fake':
|
||||
return 'False'
|
||||
return 0
|
||||
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
|
||||
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
|
||||
#still wrong, will be corrected in ClassificationFake.py
|
||||
|
131 README.md (new file)
@@ -0,0 +1,131 @@
|
||||
# Requirements
|
||||
|
||||
- python 3.10+
|
||||
- snscrape 0.6.2.20230321+ (see git repo in this folder)
|
||||
- transformers 4.31.0
|
||||
- numpy 1.23.5
|
||||
- pandas 2.0.3
|
||||
- scikit-learn 1.3.0
|
||||
- torch 2.0.1
|
||||
|
||||
# About
|
||||
|
||||
This collection of scripts scrapes tweets of US senators between 2020-01-01T00:00:00Z and 2023-01-03T00:00:00Z, scrapes the senators' account data, prepares the tweets for training an NLP model, and trains two models that (1) classify a tweet's topic as covid or non-covid and (2) classify tweets as either "fake news" or "non-fake news" tweets.
|
||||
Training only works with a prepared dataset in which the tweets are pre-classified.
|
||||
More info in the comments of the scripts.
|
||||
Due to time constraints, most of the code is procedural and ugly, but effective.
|
||||
|
||||
# How to
|
||||
|
||||
Tested on Ubuntu 22.04.
|
||||
If needed, the virtual environment can be exported and sent to you.
|
||||
|
||||
All files in the folder data/IN have to exist in order to execute the scripts.
|
||||
Execute the scripts in the following order (a minimal Python sketch of this sequence follows the list):
|
||||
|
||||
01 collect.py (see more for further info on scraping)
|
||||
02 collectSenData.py
|
||||
03 cleanTweets.py
|
||||
04 preTestClassification.py
|
||||
05 trainTopic.py
|
||||
06 trainFake.py
|
||||
07 ClassificationFake.py
|
||||
08 ClassificationTopic.py
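A minimal sketch of that sequence, not part of the repository; it assumes the `wd` paths at the top of each script have already been adjusted and that all inputs in data/IN exist:

```
# run_all.py (hypothetical helper) - runs the scripts in the order listed above
import subprocess

scripts = [
    "collect.py",               # 01 scrape tweets
    "collectSenData.py",        # 02 scrape senator account data
    "cleanTweets.py",           # 03 merge & clean datasets
    "preTestClassification.py", # 04 pretest on preclassified tweets
    "trainTopic.py",            # 05 train the topic model
    "trainFake.py",             # 06 train the fake-news model
    "ClassificationFake.py",    # 07 classify fake / non-fake
    "ClassificationTopic.py",   # 08 classify tweet topic
]
for s in scripts:
    subprocess.run(["python3", s], check=True)  # stop at the first failing step
```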
|
||||
|
||||
# Files & Folders
|
||||
|
||||
Datafiles are not included in the repository but can be found in the full package that can be downloaded from [here](https://ncloud.mischbeck.de/s/T4QcMDSfYSkadYC) (password protected).
|
||||
|
||||
```
|
||||
├── data
|
||||
│ ├── IN
|
||||
│ │ ├── counterKeywordsFinal.txt
|
||||
│ │ ├── counterKeywords.txt
|
||||
│ │ ├── keywords-raw.txt
|
||||
│ │ ├── keywords.txt
|
||||
│ │ ├── own_keywords.txt
|
||||
│ │ ├── pretest-tweets_fake.txt contains tweet ids for pretest
|
||||
│ │ ├── pretest-tweets_not_fake.txt contains tweet ids for pretest
|
||||
│ │ └── senators-raw.csv senator datafile
|
||||
│ ├── OUT
|
||||
│ │ ├── ALL-SENATORS-TWEETS.csv
|
||||
│ │ ├── graphs
|
||||
│ │ │ ├── Timeline.png
|
||||
│ │ │ ├── Wordcloud-All.png
|
||||
│ │ │ └── Wordcloud-Cov.png
|
||||
│ │ ├── Pretest-Prep.csv
|
||||
│ │ ├── Pretest-Results.csv
|
||||
│ │ ├── Pretest-SENATORS-TWEETS.csv
|
||||
│ │ ├── profiles dataset profiles
|
||||
│ │ │ ├── AllTweets.html
|
||||
│ │ │ └── CovTweets.html
|
||||
│ │ ├── SenatorsTweets-Final.csv
|
||||
│ │ ├── SenatorsTweets-OnlyCov.csv
|
||||
│ │ ├── SenatorsTweets-train-CovClassification.csv
|
||||
│ │ ├── SenatorsTweets-train-CovClassificationTRAIN.csv
|
||||
│ │ ├── SenatorsTweets-train-CovClassification.tsv
|
||||
│ │ ├── SenatorsTweets-train-FakeClassification.csv
|
||||
│ │ ├── SenatorsTweets-train-FakeClassificationTRAIN.csv
|
||||
│ │ ├── SenatorsTweets-train-FakeClassification.tsv
|
||||
│ │ ├── SenatorsTweets-Training.csv
|
||||
│ │ ├── SenatorsTweets-Training_WORKING-COPY.csv
|
||||
│ │ ├── topClass-PRETEST-Prep.csv
|
||||
│ │ ├── topClass-PRETEST-Results.csv
|
||||
│ │ ├── Tweets-All-slices.zip
|
||||
│ │ ├── Tweets-Classified-Fake-Prep.csv
|
||||
│ │ ├── Tweets-Classified-Fake-Results.csv
|
||||
│ │ ├── Tweets-Classified-Prep.csv
|
||||
│ │ ├── Tweets-Classified-Topic-Prep.csv
|
||||
│ │ ├── Tweets-Classified-Topic-Results.csv
|
||||
│ │ └── Tweets-Stub.csv
|
||||
├── funs
|
||||
│ ├── CleanTweets.py multiple functions to clean tweet contents for NLP-processing
|
||||
│ ├── ClearDupes.py function for deletion of duplicate keywords
|
||||
│ ├── __init__.py
|
||||
│ ├── Scrape.py scraper functions to be used for multiprocessing
|
||||
│ └── TimeSlice.py time slice script to slice the time span in 24 slices, speeds up scraping through multiprocessing
|
||||
├── log logs of the scraping process
|
||||
│ ├── log_2023-06-23_21-06-10_err.log
|
||||
│ ├── log_2023-06-23_21-06-10.log
|
||||
│ └── log_2023-06-23_21-06-10_missing.log
|
||||
├── models
|
||||
│ ├── CovClass Covid tweet classification model
|
||||
│ │ └── 2023-08-15_05-56-50
|
||||
│ │ ├── 2023-08-15_05-56-50.csv training output
|
||||
│ │ ├── config.json
|
||||
│ │ ├── pytorch_model.bin
|
||||
│ │ ├── special_tokens_map.json
|
||||
│ │ ├── tokenizer_config.json
|
||||
│ │ ├── tokenizer.json
|
||||
│ │ └── vocab.txt
|
||||
│ └── FakeClass Fake tweet classification model
|
||||
│ └── 2023-08-15_14-35-43
|
||||
│ ├── 2023-08-15_14-35-43.csv training output
|
||||
│ ├── config.json
|
||||
│ ├── pytorch_model.bin
|
||||
│ ├── special_tokens_map.json
|
||||
│ ├── tokenizer_config.json
|
||||
│ ├── tokenizer.json
|
||||
│ └── vocab.txt
|
||||
├── snscrape contains snscrape 0.6.2.20230321+ git repo
|
||||
├── ClassificationFake.py classifies tweets as fake or non-fake, saves:
|
||||
│ Tweets-Classified-Fake-Prep.csv - prepared training dataset
|
||||
│ Tweets-Classified-Fake-Results.csv - Tweets-Classified-Topic-Results.csv with fake classification results
|
||||
├── ClassificationTopic.py classifies tweet topic, saves:
|
||||
│ Tweets-Classified-Topic-Prep.csv - prepared training dataset
|
||||
│ Tweets-Classified-Topic-Results.csv - SenatorsTweets-OnlyCov.csv with cov classification results
|
||||
├── cleanTweets.py Curates keywordlists
|
||||
│ Merges senator and tweet datasets
|
||||
│ Creates multiple datasets:
|
||||
│ SenatorsTweets-Final.csv - all tweets with keyword columns
|
||||
│ SenatorsTweets-OnlyCov.csv - only covid tweets, filtered by keywordlist
|
||||
│ SenatorsTweets-Training.csv - training dataset, containing ~1800 randomly selected tweets from SenatorsTweets-OnlyCov.csv
|
||||
├── collect.py scrapes tweets, saves to ALL-SENATORS-TWEETS.csv
|
||||
├── collectSenData.py scrapes senator account data, saves to ALL-SENATORS.csv
|
||||
├── createGraphs.py creates wordcloud & timeline graphs
|
||||
├── preTestClassification.py pretest script that uses bvrau/covid-twitter-bert-v2-struth to analyze 100 preclassified tweets
|
||||
├── profiler.py creates dataset profiles
|
||||
├── README.md readme
|
||||
├── trainFake.py training script for the fake tweet classification model
|
||||
└── trainTopic.py training script for the tweet topic classification model
|
||||
```
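The checkpoint folders under models/ are ordinary Hugging Face checkpoints, and the two Classification scripts load them straight from disk. A minimal sketch of that loading step (the relative path below is an assumption; the scripts themselves use absolute paths):

```
# sketch: loading one of the checkpoint folders listed above
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

ckpt = "models/CovClass/2023-08-15_05-56-50/"  # assumed relative to the repo root
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForSequenceClassification.from_pretrained(ckpt)
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(pipe("example tweet text about vaccines"))  # -> [{'label': ..., 'score': ...}]
```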
|
233 cleanTweets.py (new file)
@@ -0,0 +1,233 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Mon Jun 26 20:36:43 2023
|
||||
|
||||
@author: michael
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
# import pyreadstat
|
||||
import numpy as np
|
||||
import sys
|
||||
|
||||
|
||||
# Seed for training dataset generation
|
||||
seed = 86431891
|
||||
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Name of file that all senator data will be written to
|
||||
senCSV = "ALL-SENATORS-TWEETS.csv"
|
||||
|
||||
# Name of the senator datafile (input)
|
||||
senDataset = "senators-raw.csv"
|
||||
|
||||
# Name of new datafile generated
|
||||
senCSVc = "SenatorsTweets-Final"
|
||||
senCSVcCov = "SenatorsTweets-OnlyCov"
|
||||
senCSVcTrain = "SenatorsTweets-Training"
|
||||
|
||||
# don't change this one
|
||||
senCSVPath = wd + ud + senCSV
|
||||
senCSVcPath = wd + ud + senCSVc + ".csv"
|
||||
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
|
||||
senCSVcTrainPath = wd + ud + senCSVcTrain + ".csv"
|
||||
senSAVcPath = wd + ud + senCSV + ".sav"
|
||||
senDTAcPath = wd + ud + senCSV + ".dta"
|
||||
senDatasetPath = wd + di + senDataset
|
||||
|
||||
df = pd.read_csv(senCSVPath, dtype=(object))
|
||||
|
||||
## Import own functions
|
||||
funs = wd+"funs"
|
||||
sys.path.insert(1, funs)
|
||||
from ClearDupes import deDupe
|
||||
|
||||
mixed_columns = df.columns[df.nunique() != len(df)]
|
||||
print(mixed_columns)
|
||||
|
||||
df = df.drop(columns=['user.url', 'cashtags', 'coordinates', 'hashtags', 'Unnamed: 0', 'user.verified', 'lang', 'renderedContent', 'retweetedTweet', 'sourceLabel', 'sourceUrl', 'source'], index=1)
|
||||
del df[df.columns[0]] # remove first col
|
||||
|
||||
df['user.created'] = pd.to_datetime(df['user.created'])
|
||||
df['date'] = pd.to_datetime(df['date'])
|
||||
|
||||
#%%
|
||||
# sort and generate id
|
||||
df = df.sort_values(by='date').reset_index() # sort df by date before generating id
|
||||
df["tid"] = df.index + 1 # create id column
|
||||
|
||||
#%%
|
||||
# move id column to front
|
||||
cols = list(df.columns.values) # Make a list of all of the columns in the df
|
||||
cols.pop(cols.index('tid')) # Remove id from list
|
||||
#cols.pop(cols.index('user')) # Remove id from list
|
||||
df = df[['tid']+cols] # Create new dataframe with ordered colums
|
||||
|
||||
#%%
|
||||
###################
|
||||
# Keywords
|
||||
# read additional keywords from a file and write to list.
|
||||
keywords = []
|
||||
# Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
|
||||
deDupe(f"{di}keywords-raw.txt", f"{di}keywords.txt")
|
||||
# Read the keywords from a file
|
||||
with open(f"{di}own_keywords.txt", "r") as file:
|
||||
lines = file.readlines()
|
||||
for line in lines:
|
||||
keyword = line.strip() # Remove the newline character
|
||||
keywords.append(keyword)
|
||||
# read the raw keyword list and append it; the combined list is written to keywords.txt below
|
||||
with open(f"{di}keywords-raw.txt", "r") as file:
|
||||
lines = file.readlines()
|
||||
for line in lines:
|
||||
keyword = line.strip() # Remove the newline character
|
||||
keywords.append(keyword)
|
||||
|
||||
# delete keywords ppe and china that lead to too many false positives
|
||||
removeWords = {'ppe', 'china'}
|
||||
keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
|
||||
keywords = [item for item in keywords if item not in removeWords ] # removes words
|
||||
|
||||
with open(f"{di}keywords.txt", "w") as file:
|
||||
print("read keyword files")
|
||||
for line in keywords:
|
||||
file.write(f'{line}\n')
|
||||
|
||||
# counter keywords
|
||||
# Read the keywords from a file
|
||||
counterKeywords = []
|
||||
with open(f"{di}counterKeywords.txt", "r") as file:
|
||||
lines = file.readlines()
|
||||
for line in lines:
|
||||
counterKeyword = line.strip() # Remove the newline character
|
||||
counterKeywords.append(counterKeyword)
|
||||
counterKeywords = set([x.lower() for x in counterKeywords]) # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
|
||||
with open(f"{di}counterKeywordsFinal.txt", "w") as file:
|
||||
print("read keyword files")
|
||||
for line in counterKeywords:
|
||||
file.write(f'{line}\n')
|
||||
|
||||
#%%
|
||||
# overwrite keyword column
|
||||
df['keywords'] = np.nan
|
||||
df['keywords'] = (
|
||||
df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
|
||||
)
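# note: both keyword columns are built by joining the word lists into one regex alternation ('|'.join(...));
# keywords containing regex metacharacters would need re.escape() first (not done here).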
|
||||
df['counterKeywords'] = np.nan
|
||||
df['counterKeywords'] = (
|
||||
df['rawContent'].str.lower().str.findall('|'.join(counterKeywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
|
||||
)
|
||||
#%%
|
||||
# create boolean contains_keyword column
|
||||
df['contains_keyword'] = True
|
||||
df['contains_counterKeyword'] = True
|
||||
mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none'
|
||||
df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask
|
||||
mask = (df['counterKeywords'].isna()) # select all values in contains_keyword == 'none'
|
||||
df.loc[mask,'contains_counterKeyword'] = False # set keywords = contains_keyword under the condition of mask
|
||||
|
||||
#%%
|
||||
pd.Series(df["user.id"]).is_unique
|
||||
|
||||
#%%
|
||||
# Merge Datasets
|
||||
# get senator data
|
||||
cols = [
|
||||
"name",
|
||||
"id",
|
||||
"state_short",
|
||||
"party",
|
||||
"class",
|
||||
"ideology",
|
||||
"start_serving",
|
||||
"end_serving",
|
||||
"time_in_office",
|
||||
"not_in_office",
|
||||
"last_congress",
|
||||
"vote_share",
|
||||
"next_closest_share",
|
||||
"election_year",
|
||||
"twitter_handle",
|
||||
"alt_handle",
|
||||
"date_of_birth",
|
||||
"female",
|
||||
"ethnicity",
|
||||
"edu_level",
|
||||
"edu_information",
|
||||
"occup_level"]
|
||||
|
||||
dfSenA = pd.read_csv(senDatasetPath, index_col=False, sep = ",", usecols=cols).reset_index()
|
||||
dfSenB = pd.read_csv(senDatasetPath, index_col=False, sep = ",", usecols=cols).reset_index()
|
||||
|
||||
dfSenA['alt'] = False
|
||||
dfSenB['alt'] = True
|
||||
|
||||
dfSenA = dfSenA.rename(columns={'twitter_handle': 'user.username'})
|
||||
dfSenB = dfSenB.rename(columns={'alt_handle': 'user.username'})
|
||||
dfSenB = dfSenB.dropna(axis=0, subset=['user.username'])
|
||||
|
||||
dfSenA['user.username'] = dfSenA['user.username'].apply(str.lower)
|
||||
dfSenB['user.username'] = dfSenB['user.username'].apply(str.lower)
|
||||
df['user.username'] = df['user.username'].apply(str.lower)
|
||||
|
||||
dfSenAll = pd.concat([dfSenA, dfSenB]).reset_index()
|
||||
|
||||
# %%
|
||||
# see if all senators are present in file
|
||||
dfAll = df.merge(dfSenAll, how='left',on='user.username')
|
||||
#check merge
|
||||
unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique()
|
||||
print(unique_usernames)
|
||||
# senatorisakson was dropped, is ok
|
||||
#%%
|
||||
# create covidtweets csv
|
||||
dfCov = dfAll[dfAll['contains_counterKeyword']==False]
|
||||
dfCov = dfCov[dfCov['contains_keyword']==True]
|
||||
dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
|
||||
|
||||
#%%
|
||||
# create column with tweet length
|
||||
|
||||
dfCov['tweetLen'] = dfCov['rawContent'].str.len().copy()
|
||||
|
||||
# reset df index and write to id column
|
||||
dfCov.reset_index(drop=True, inplace=True)
|
||||
|
||||
#%%
|
||||
# Export to csv, sav and dta
|
||||
dfAll.to_csv(senCSVcPath, encoding='utf-8')
|
||||
dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id')
|
||||
# pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb
|
||||
# =============================================================================
|
||||
# dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
|
||||
# dfAllStata = dfAll.rename(columns={'class':'class_'})
|
||||
# dfAllStata.to_stata(senDTAcPath, version=119, convert_strl=['alt'], convert_dates={'date': 'td', 'user_created': 'td'})
|
||||
# print(dfAllStata.columns)
|
||||
# ====================================================df.id.str.len().value_counts()
|
||||
# =========================
|
||||
|
||||
# %%
|
||||
# Create training dataset
|
||||
np.random.seed(seed);
|
||||
# dfTrain = pd.dfCov(np.random.rand(1800))  # broken draft (pd has no dfCov); superseded by the sampling below
|
||||
# %%
|
||||
# Create training dataset
|
||||
np.random.seed(seed);
|
||||
dfTrain = dfCov.loc[np.random.choice(dfCov.index, 1800, replace=False)]
|
||||
dfTrain = dfTrain[['tid', 'date', 'rawContent']]
|
||||
dfTrain['topicCovid'] = True
|
||||
dfTrain['fake'] = False
|
||||
dfTrain.to_csv(senCSVcTrainPath, encoding='utf-8')
|
960 collect.ipynb (deleted)
@@ -1,960 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "15573d92-f6a7-49d4-9c01-fff33d23be8e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Tweet Collecting\n",
|
||||
"## Requirements\n",
|
||||
"- tweepy-4.14.0\n",
|
||||
"- pandas-2.0\n",
|
||||
"- numpy-1.24.3\n",
|
||||
"\n",
|
||||
"## Preparations & Config\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "3290c840-961c-4e2c-a107-4ccd541d151b",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import tweepy\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import glob\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"# Define time period of interest\n",
|
||||
"time_slices = [\n",
|
||||
" {\n",
|
||||
" \"start_time\": \"2020-01-01T00:00:00Z\",\n",
|
||||
" \"end_time\": \"2020-06-01T00:00:00Z\",\n",
|
||||
" \"suffix\": \"-slice1\"\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"start_time\": \"2020-06-01T00:00:01Z\",\n",
|
||||
" \"end_time\": \"2021-01-01T00:00:00Z\",\n",
|
||||
" \"suffix\": \"-slice2\"\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"start_time\": \"2021-01-01T00:00:01Z\",\n",
|
||||
" \"end_time\": \"2021-06-01T00:00:00Z\",\n",
|
||||
" \"suffix\": \"-slice3\"\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"start_time\": \"2021-06-01T00:00:01Z\",\n",
|
||||
" \"end_time\": \"2023-01-03T00:00:00Z\",\n",
|
||||
" \"suffix\": \"-slice4\"\n",
|
||||
" }\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"tweet_fields = [\n",
|
||||
"\t\"id\",\n",
|
||||
"\t\"text\",\n",
|
||||
"\t\"attachments\",\n",
|
||||
"\t\"author_id\",\n",
|
||||
"\t\"context_annotations\",\n",
|
||||
"\t\"conversation_id\",\n",
|
||||
"\t\"created_at\",\n",
|
||||
"\t\"entities\",\n",
|
||||
"\t\"geo\",\n",
|
||||
"\t\"lang\",\n",
|
||||
"\t\"possibly_sensitive\",\n",
|
||||
"\t\"public_metrics\",\n",
|
||||
"\t\"referenced_tweets\",\n",
|
||||
"\t\"reply_settings\",\n",
|
||||
"\t\"source\",\n",
|
||||
"\t\"withheld\",\n",
|
||||
"\t]\n",
|
||||
"\n",
|
||||
"## Setup directories\n",
|
||||
"# WD Michael\n",
|
||||
"# wd = \"/home/michael/Documents/PS/Data/collectTweets/\"\n",
|
||||
"\n",
|
||||
"# WD Server\n",
|
||||
"wd = \"/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection\"\n",
|
||||
"\n",
|
||||
"# WD Josie\n",
|
||||
"# wd = \"/home/michael/Documents/PS/Data/\"\n",
|
||||
"\n",
|
||||
"# WD Sam\n",
|
||||
"# wd = \"/home/michael/Documents/PS/Data/\"\n",
|
||||
"\n",
|
||||
"# Tweet-datafile directory\n",
|
||||
"td = \"data/tweets/\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6782290c-7e14-4393-8caa-c78a2b326d85",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Authenticate to Twitter"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "7ac9b603-e638-4ebb-95df-e0f8678f298e",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## Setup Api-connection\n",
|
||||
"bearer_token = \"AAAAAAAAAAAAAAAAAAAAAMVDlQEAAAAAal9f5uZrM12CVPA4f4jr4mGH5Oc%3DuTg1Vd0YKYMwraA7ibX6LiGyd337OXkm3JwudEX7vatruswmoc\"\n",
|
||||
"client = tweepy.Client(bearer_token, return_type = dict, wait_on_rate_limit = True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e81c4d49-242c-4b51-8e2a-e2bbfdae6877",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Import Keywords\n",
|
||||
"Keywords from:\n",
|
||||
"* Chen, E., Lerman, K., & Ferrara, E. (2020). Tracking Social Media Discourse About the COVID-19 Pandemic: Development of a Public Coronavirus Twitter Data Set. JMIR Public Health and Surveillance, 6(2), e19273. https://doi.org/10.2196/19273\n",
|
||||
"Line 80 and following:\n",
|
||||
"* Lamsal, R. (2020). Coronavirus (COVID-19) Tweets Dataset [Data set]. IEEE. https://ieee-dataport.org/open-access/coronavirus-covid-19-tweets-dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "1d4af102-30ae-4c73-ae9c-333efb34e3f1",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Coronavirus',\n",
|
||||
" 'Koronavirus',\n",
|
||||
" 'Corona',\n",
|
||||
" 'CDC',\n",
|
||||
" 'Wuhancoronavirus',\n",
|
||||
" 'Wuhanlockdown',\n",
|
||||
" 'Ncov',\n",
|
||||
" 'Wuhan',\n",
|
||||
" 'N95',\n",
|
||||
" 'Kungflu',\n",
|
||||
" 'Epidemic',\n",
|
||||
" 'outbreak',\n",
|
||||
" 'Sinophobia',\n",
|
||||
" 'China',\n",
|
||||
" 'covid-19',\n",
|
||||
" 'corona virus',\n",
|
||||
" 'covid',\n",
|
||||
" 'covid19',\n",
|
||||
" 'sars-cov-2',\n",
|
||||
" 'COVIDー19',\n",
|
||||
" 'COVD',\n",
|
||||
" 'pandemic',\n",
|
||||
" 'coronapocalypse',\n",
|
||||
" 'canceleverything',\n",
|
||||
" 'Coronials',\n",
|
||||
" 'SocialDistancingNow',\n",
|
||||
" 'Social Distancing',\n",
|
||||
" 'SocialDistancing',\n",
|
||||
" 'panicbuy',\n",
|
||||
" 'panic buy',\n",
|
||||
" 'panicbuying',\n",
|
||||
" 'panic buying',\n",
|
||||
" '14DayQuarantine',\n",
|
||||
" 'DuringMy14DayQuarantine',\n",
|
||||
" 'panic shop',\n",
|
||||
" 'panic shopping',\n",
|
||||
" 'panicshop',\n",
|
||||
" 'InMyQuarantineSurvivalKit',\n",
|
||||
" 'panic-buy',\n",
|
||||
" 'panic-shop',\n",
|
||||
" 'coronakindness',\n",
|
||||
" 'quarantinelife',\n",
|
||||
" 'chinese virus',\n",
|
||||
" 'chinesevirus',\n",
|
||||
" 'stayhomechallenge',\n",
|
||||
" 'stay home challenge',\n",
|
||||
" 'sflockdown',\n",
|
||||
" 'DontBeASpreader',\n",
|
||||
" 'lockdown',\n",
|
||||
" 'lock down',\n",
|
||||
" 'shelteringinplace',\n",
|
||||
" 'sheltering in place',\n",
|
||||
" 'staysafestayhome',\n",
|
||||
" 'stay safe stay home',\n",
|
||||
" 'trumppandemic',\n",
|
||||
" 'trump pandemic',\n",
|
||||
" 'flattenthecurve',\n",
|
||||
" 'flatten the curve',\n",
|
||||
" 'china virus',\n",
|
||||
" 'chinavirus',\n",
|
||||
" 'quarentinelife',\n",
|
||||
" 'PPEshortage',\n",
|
||||
" 'saferathome',\n",
|
||||
" 'stayathome',\n",
|
||||
" 'stay at home',\n",
|
||||
" 'stay home',\n",
|
||||
" 'stayhome',\n",
|
||||
" 'GetMePPE',\n",
|
||||
" 'covidiot',\n",
|
||||
" 'epitwitter',\n",
|
||||
" 'pandemie',\n",
|
||||
" 'wear a mask',\n",
|
||||
" 'wearamask',\n",
|
||||
" 'kung flu',\n",
|
||||
" 'covididiot',\n",
|
||||
" 'COVID__19',\n",
|
||||
" 'omicron',\n",
|
||||
" 'variant',\n",
|
||||
" 'vaccine',\n",
|
||||
" 'travel ban',\n",
|
||||
" 'corona',\n",
|
||||
" 'corona',\n",
|
||||
" 'coronavirus',\n",
|
||||
" 'coronavirus',\n",
|
||||
" 'covid',\n",
|
||||
" 'covid',\n",
|
||||
" 'covid19',\n",
|
||||
" 'covid19',\n",
|
||||
" 'covid-19',\n",
|
||||
" 'covid-19',\n",
|
||||
" 'sarscov2',\n",
|
||||
" 'sarscov2',\n",
|
||||
" 'sars cov2',\n",
|
||||
" 'sars cov 2',\n",
|
||||
" 'covid_19',\n",
|
||||
" 'covid_19',\n",
|
||||
" 'ncov',\n",
|
||||
" 'ncov',\n",
|
||||
" 'ncov2019',\n",
|
||||
" 'ncov2019',\n",
|
||||
" '2019-ncov',\n",
|
||||
" '2019-ncov',\n",
|
||||
" 'pandemic',\n",
|
||||
" 'pandemic 2019ncov',\n",
|
||||
" '2019ncov',\n",
|
||||
" 'quarantine',\n",
|
||||
" 'quarantine',\n",
|
||||
" 'flatten the curve',\n",
|
||||
" 'flattening the curve',\n",
|
||||
" 'flatteningthecurve',\n",
|
||||
" 'flattenthecurve',\n",
|
||||
" 'hand sanitizer',\n",
|
||||
" 'handsanitizer',\n",
|
||||
" 'lockdown',\n",
|
||||
" 'lockdown',\n",
|
||||
" 'social distancing',\n",
|
||||
" 'socialdistancing',\n",
|
||||
" 'work from home',\n",
|
||||
" 'workfromhome',\n",
|
||||
" 'working from home',\n",
|
||||
" 'workingfromhome',\n",
|
||||
" 'ppe',\n",
|
||||
" 'n95',\n",
|
||||
" 'ppe',\n",
|
||||
" 'n95',\n",
|
||||
" 'covidiots',\n",
|
||||
" 'covidiots',\n",
|
||||
" 'herd immunity',\n",
|
||||
" 'herdimmunity',\n",
|
||||
" 'pneumonia',\n",
|
||||
" 'pneumonia',\n",
|
||||
" 'chinese virus',\n",
|
||||
" 'chinesevirus',\n",
|
||||
" 'wuhan virus',\n",
|
||||
" 'wuhanvirus',\n",
|
||||
" 'kung flu',\n",
|
||||
" 'kungflu',\n",
|
||||
" 'wearamask',\n",
|
||||
" 'wearamask',\n",
|
||||
" 'wear a mask',\n",
|
||||
" 'vaccine',\n",
|
||||
" 'vaccines',\n",
|
||||
" 'vaccine',\n",
|
||||
" 'vaccines',\n",
|
||||
" 'corona vaccine',\n",
|
||||
" 'corona vaccines',\n",
|
||||
" 'coronavaccine',\n",
|
||||
" 'coronavaccines',\n",
|
||||
" 'face shield',\n",
|
||||
" 'faceshield',\n",
|
||||
" 'face shields',\n",
|
||||
" 'faceshields',\n",
|
||||
" 'health worker',\n",
|
||||
" 'healthworker',\n",
|
||||
" 'health workers',\n",
|
||||
" 'healthworkers',\n",
|
||||
" 'stayhomestaysafe',\n",
|
||||
" 'coronaupdate',\n",
|
||||
" 'frontlineheroes',\n",
|
||||
" 'coronawarriors',\n",
|
||||
" 'homeschool',\n",
|
||||
" 'homeschooling',\n",
|
||||
" 'hometasking',\n",
|
||||
" 'masks4all',\n",
|
||||
" 'wfh',\n",
|
||||
" 'wash ur hands',\n",
|
||||
" 'wash your hands',\n",
|
||||
" 'washurhands',\n",
|
||||
" 'washyourhands',\n",
|
||||
" 'stayathome',\n",
|
||||
" 'stayhome',\n",
|
||||
" 'selfisolating',\n",
|
||||
" 'self isolating']"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"keywords = []\n",
|
||||
"\n",
|
||||
"# Read the keywords from a file\n",
|
||||
"with open(\"data/keywords.txt\", \"r\") as file:\n",
|
||||
" lines = file.readlines()\n",
|
||||
" for line in lines:\n",
|
||||
" keyword = line.strip() # Remove the newline character\n",
|
||||
" keywords.append(keyword)\n",
|
||||
"\n",
|
||||
"keywords"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9f190608-c0a2-4e7e-9560-a03a57aa4132",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Import Accounts"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "a5bde33c-cc69-43ad-9b0c-4b04ce7f8a3c",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['SenAlexander', 'SenatorEnzi', 'CoryGardner', 'VP', 'SenatorIsakson', 'DougJones', 'KLoeffler', 'MarthaMcSallyAZ', 'DavidPerdueGA', 'SenPatRoberts', 'SenatorTomUdall', 'SenatorBaldwin', 'SenJohnBarrasso', 'SenatorBennet', 'MarshaBlackburn', 'SenBlumenthal', 'RoyBlunt', 'senbooker', 'JohnBoozman', 'SenatorBraun', 'SenSherrodBrown', 'SenatorBurr', 'SenatorCantwell', 'SenCapito', 'SenatorCardin', 'SenatorCarper', 'SenBobCasey', 'SenBillCassidy', 'SenatorCollins', 'ChrisCoons', 'JohnCornyn', 'SenCortezMasto', 'SenTomCotton', 'SenKevinCramer', 'MikeCrapo', 'SenTedCruz', 'SteveDaines', 'SenDuckworth', 'SenatorDurbin', 'SenJoniErnst', 'SenFettermanPA', 'SenFeinstein', 'SenatorFischer', 'SenGillibrand', 'LindseyGrahamSC', 'ChuckGrassley', 'SenatorHagerty', 'SenatorHassan', 'HawleyMO', 'MartinHeinrich', 'SenatorHick', 'maziehirono', 'SenJohnHoeven', 'SenHydeSmith', 'JimInhofe', 'SenRonJohnson', 'timkaine', 'SenMarkKelly', 'SenJohnKennedy', 'SenAngusKing', 'SenAmyKlobuchar', 'SenatorLankford', 'SenatorLeahy', 'SenMikeLee', 'SenatorLujan', 'SenLummis', 'Sen_JoeManchin', 'SenMarkey', 'SenatorMarshall', 'LeaderMcConnell', 'SenatorMenendez', 'SenJeffMerkley', 'JerryMoran', 'lisamurkowski', 'ChrisMurphyCT', 'PattyMurray', 'SenOssoff', 'SenAlexPadilla', 'senrandpaul', 'SenGaryPeters', 'senrobportman', 'SenJackReed', 'SenatorRisch', 'SenatorRomney', 'SenJackyRosen', 'SenatorRounds', 'senmarcorubio', 'SenSanders', 'sensasse', 'brianschatz', 'SenSchumer', 'SenRickScott', 'SenatorTimScott', 'SenatorShaheen', 'SenShelby', 'SenatorSinema', 'SenTinaSmith', 'SenStabenow', 'SenDanSullivan', 'SenatorTester', 'SenJohnThune', 'SenThomTillis', 'SenToomey', 'SenTuberville', 'ChrisVanHollen', 'MarkWarner', 'SenatorWarnock', 'ewarren', 'SenWhitehouse', 'SenatorWicker', 'RonWyden', 'SenToddYoung']\n",
|
||||
"['LamarAlexander ', nan, 'corygardner', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Get accounts & alt-accounts from Senators-Datafile\n",
|
||||
"accounts = pd.read_csv(\"data/senators-raw.csv\")[\"twitter_handle\"].tolist()\n",
|
||||
"alt_accounts = pd.read_csv(\"data/senators-raw.csv\")[\"alt_handle\"].tolist()\n",
|
||||
"print(accounts)\n",
|
||||
"print(alt_accounts)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "befc0fad-c803-4145-a041-570d6f894178",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Collect Tweets\n",
|
||||
"Loops over accounts:\n",
|
||||
"* Collects Tweets of account. \n",
|
||||
"* Then extracts columns public_metrics (likes aso) and referenced_tweets (indicates, whether tweet is a reply).\n",
|
||||
"* Checks if tweet-text contains any of the keywords, if so, inserts the keyword(s) in a new column.\n",
|
||||
"* Saves tweets of the account in a csv file \"HANDLE.csv\" and \"HANDLE-LONG.csv\" (LONG contains all given information such as annotations, that we might or might not need)\n",
|
||||
"\n",
|
||||
"### Problem:\n",
|
||||
"_I limited the results to 20 tweets per senator._\n",
|
||||
"Twitter has the following API Limit for the [search_all_tweets](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all) method I used: \n",
|
||||
"* App rate limit (Application-only): 300 requests per 15-minute window shared among all users of your app\n",
|
||||
"* App rate limit (Application-only): 1 per second shared among all users of your app\n",
|
||||
"\n",
|
||||
"With a limit of 300, I request 20 posts per slice, just to get a better understanding of what's happening. After trying different things out, I think that the time-slices won't be needed if we get around the problem I'm having right now:\n",
|
||||
"as soon, as the rate limit is reached, tweepy stops and waits for the time to run out and start again. BUT it doesn't retry the request but starts with the next request. \n",
|
||||
"I haven't found anything and my only idea to solve the problem was to generate a list of failed attempts (via try and except) and after getting all tweets letting tweepy work over that list again. \n",
|
||||
"One more thing I don't understand is that, when fetching the tweets I already sent to you, I didn't have as many problems as now and the limit exceeded after 3-4 senators, even though I used a higher `max_result` and a higher `flatten value`.\n",
|
||||
"\n",
|
||||
"I hope that the following output speaks for itself:\n",
|
||||
"```\n",
|
||||
"trying to fetch tweets for SenAlexander-slice1\n",
|
||||
"trying to fetch tweets for SenAlexander-slice2\n",
|
||||
"trying to fetch tweets for SenAlexander-slice3\n",
|
||||
"trying to fetch tweets for SenAlexander-slice4\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice1\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice2\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice3\n",
|
||||
"return empty in SenatorEnzi-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice4\n",
|
||||
"\n",
|
||||
"Rate limit exceeded. Sleeping for 893 seconds.\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Tweepy returned no tweets because of the exceeded tweet limit, then the script tried to fetch more tweets and the error message came up.\n",
|
||||
"Before changing the code below, see the other version i wrote just below the next cell (and ignore the error message below the cell as i just interrupted the execution which lead to the error message)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "0f842b8a-846a-4f38-8231-c1e9ccfbddf5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"trying to fetch tweets for SenAlexander-slice1\n",
|
||||
"trying to fetch tweets for SenAlexander-slice2\n",
|
||||
"trying to fetch tweets for SenAlexander-slice3\n",
|
||||
"trying to fetch tweets for SenAlexander-slice4\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice1\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice2\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice3\n",
|
||||
"return empty in SenatorEnzi-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z\n",
|
||||
"<generator object Paginator.flatten at 0x7f20ebf137b0>\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice4\n",
|
||||
"trying to fetch tweets for CoryGardner-slice1\n",
|
||||
"trying to fetch tweets for CoryGardner-slice2\n",
|
||||
"trying to fetch tweets for CoryGardner-slice3\n",
|
||||
"return empty in CoryGardner-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z\n",
|
||||
"<generator object Paginator.flatten at 0x7f20ebf13740>\n",
|
||||
"trying to fetch tweets for CoryGardner-slice4\n",
|
||||
"trying to fetch tweets for VP-slice1\n",
|
||||
"trying to fetch tweets for VP-slice2\n",
|
||||
"trying to fetch tweets for VP-slice3\n",
|
||||
"trying to fetch tweets for VP-slice4\n",
|
||||
"trying to fetch tweets for SenatorIsakson-slice1\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "KeyboardInterrupt",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[28], line 30\u001b[0m\n\u001b[1;32m 22\u001b[0m tweets \u001b[38;5;241m=\u001b[39m tweepy\u001b[38;5;241m.\u001b[39mPaginator(client\u001b[38;5;241m.\u001b[39msearch_all_tweets,\n\u001b[1;32m 23\u001b[0m query\u001b[38;5;241m=\u001b[39mquery,\n\u001b[1;32m 24\u001b[0m tweet_fields\u001b[38;5;241m=\u001b[39mtweet_fields,\n\u001b[1;32m 25\u001b[0m start_time\u001b[38;5;241m=\u001b[39mstart_time,\n\u001b[1;32m 26\u001b[0m end_time\u001b[38;5;241m=\u001b[39mend_time,\n\u001b[1;32m 27\u001b[0m max_results\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m20\u001b[39m)\u001b[38;5;241m.\u001b[39mflatten(\u001b[38;5;241m20\u001b[39m)\n\u001b[1;32m 29\u001b[0m \u001b[38;5;66;03m# for each tweet returned...\u001b[39;00m\n\u001b[0;32m---> 30\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m tweet \u001b[38;5;129;01min\u001b[39;00m tweets:\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m# ... add that tweet to tweetlist\u001b[39;00m\n\u001b[1;32m 32\u001b[0m tweetlist\u001b[38;5;241m.\u001b[39mappend(tweet)\n\u001b[1;32m 34\u001b[0m \u001b[38;5;66;03m# Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration\u001b[39;00m\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/pagination.py:67\u001b[0m, in \u001b[0;36mPaginator.flatten\u001b[0;34m(self, limit)\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 66\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m---> 67\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m response \u001b[38;5;129;01min\u001b[39;00m PaginationIterator(\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmethod, \u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkwargs\n\u001b[1;32m 69\u001b[0m ):\n\u001b[1;32m 70\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(response, Response):\n\u001b[1;32m 71\u001b[0m response_data \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mdata \u001b[38;5;129;01mor\u001b[39;00m []\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/pagination.py:126\u001b[0m, in \u001b[0;36mPaginationIterator.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpagination_token\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m pagination_token\n\u001b[0;32m--> 126\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 128\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(response, Response):\n\u001b[1;32m 129\u001b[0m meta \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mmeta\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/client.py:1163\u001b[0m, in \u001b[0;36mClient.search_all_tweets\u001b[0;34m(self, query, **params)\u001b[0m\n\u001b[1;32m 1071\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"search_all_tweets( \\\u001b[39;00m\n\u001b[1;32m 1072\u001b[0m \u001b[38;5;124;03m query, *, end_time=None, expansions=None, max_results=None, \\\u001b[39;00m\n\u001b[1;32m 1073\u001b[0m \u001b[38;5;124;03m media_fields=None, next_token=None, place_fields=None, \\\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1160\u001b[0m \u001b[38;5;124;03m.. _pagination: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/paginate\u001b[39;00m\n\u001b[1;32m 1161\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1162\u001b[0m params[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mquery\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m query\n\u001b[0;32m-> 1163\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1164\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mGET\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/2/tweets/search/all\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1165\u001b[0m \u001b[43m \u001b[49m\u001b[43mendpoint_parameters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1166\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mend_time\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexpansions\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmax_results\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmedia.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1167\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnext_token\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mplace.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpoll.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mquery\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1168\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msince_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msort_order\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstart_time\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtweet.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1169\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43muntil_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43muser.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 1170\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mTweet\u001b[49m\n\u001b[1;32m 1171\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/client.py:129\u001b[0m, in \u001b[0;36mBaseClient._make_request\u001b[0;34m(self, method, route, params, endpoint_parameters, json, data_type, user_auth)\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_make_request\u001b[39m(\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28mself\u001b[39m, method, route, params\u001b[38;5;241m=\u001b[39m{}, endpoint_parameters\u001b[38;5;241m=\u001b[39m(), json\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 125\u001b[0m data_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, user_auth\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 126\u001b[0m ):\n\u001b[1;32m 127\u001b[0m request_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_process_params(params, endpoint_parameters)\n\u001b[0;32m--> 129\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mroute\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 130\u001b[0m \u001b[43m \u001b[49m\u001b[43mjson\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjson\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muser_auth\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_auth\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 132\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_type \u001b[38;5;129;01mis\u001b[39;00m requests\u001b[38;5;241m.\u001b[39mResponse:\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m response\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/client.py:84\u001b[0m, in \u001b[0;36mBaseClient.request\u001b[0;34m(self, method, route, params, json, user_auth)\u001b[0m\n\u001b[1;32m 75\u001b[0m headers[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAuthorization\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBearer \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbearer_token\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 77\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\n\u001b[1;32m 78\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMaking API request: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmethod\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhost\u001b[38;5;250m \u001b[39m\u001b[38;5;241m+\u001b[39m\u001b[38;5;250m \u001b[39mroute\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 79\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mParameters: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mparams\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHeaders: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mheaders\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 81\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBody: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mjson\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 82\u001b[0m )\n\u001b[0;32m---> 84\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 85\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhost\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mroute\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjson\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjson\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 86\u001b[0m \u001b[43m \u001b[49m\u001b[43mauth\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mauth\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m response:\n\u001b[1;32m 88\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\n\u001b[1;32m 89\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mReceived API response: \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mreason\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 91\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHeaders: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mheaders\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 92\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mContent: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mcontent\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 93\u001b[0m )\n\u001b[1;32m 95\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m400\u001b[39m:\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/requests/sessions.py:589\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 584\u001b[0m send_kwargs \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 585\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimeout\u001b[39m\u001b[38;5;124m\"\u001b[39m: timeout,\n\u001b[1;32m 586\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallow_redirects\u001b[39m\u001b[38;5;124m\"\u001b[39m: allow_redirects,\n\u001b[1;32m 587\u001b[0m }\n\u001b[1;32m 588\u001b[0m send_kwargs\u001b[38;5;241m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 589\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprep\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43msend_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/requests/sessions.py:703\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 700\u001b[0m start \u001b[38;5;241m=\u001b[39m preferred_clock()\n\u001b[1;32m 702\u001b[0m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[0;32m--> 703\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43madapter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 705\u001b[0m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[1;32m 706\u001b[0m elapsed \u001b[38;5;241m=\u001b[39m preferred_clock() \u001b[38;5;241m-\u001b[39m start\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/requests/adapters.py:486\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 483\u001b[0m timeout \u001b[38;5;241m=\u001b[39m TimeoutSauce(connect\u001b[38;5;241m=\u001b[39mtimeout, read\u001b[38;5;241m=\u001b[39mtimeout)\n\u001b[1;32m 485\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 486\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 487\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 488\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 489\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 491\u001b[0m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 492\u001b[0m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 493\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 494\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 495\u001b[0m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_retries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 496\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 497\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 498\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (ProtocolError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m 501\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m(err, request\u001b[38;5;241m=\u001b[39mrequest)\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/urllib3/connectionpool.py:790\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)\u001b[0m\n\u001b[1;32m 787\u001b[0m response_conn \u001b[38;5;241m=\u001b[39m conn \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m release_conn \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 789\u001b[0m \u001b[38;5;66;03m# Make the request on the HTTPConnection object\u001b[39;00m\n\u001b[0;32m--> 790\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 791\u001b[0m \u001b[43m \u001b[49m\u001b[43mconn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 792\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 793\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 794\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 795\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 796\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 797\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 798\u001b[0m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mretries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 799\u001b[0m \u001b[43m \u001b[49m\u001b[43mresponse_conn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresponse_conn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 800\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpreload_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 801\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 802\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mresponse_kw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 803\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 805\u001b[0m \u001b[38;5;66;03m# Everything went great!\u001b[39;00m\n\u001b[1;32m 806\u001b[0m clean_exit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/urllib3/connectionpool.py:536\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[0;34m(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length)\u001b[0m\n\u001b[1;32m 534\u001b[0m \u001b[38;5;66;03m# Receive the response from the server\u001b[39;00m\n\u001b[1;32m 535\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 536\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 537\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (BaseSSLError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 538\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_timeout(err\u001b[38;5;241m=\u001b[39me, url\u001b[38;5;241m=\u001b[39murl, timeout_value\u001b[38;5;241m=\u001b[39mread_timeout)\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/urllib3/connection.py:454\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mresponse\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HTTPResponse\n\u001b[1;32m 453\u001b[0m \u001b[38;5;66;03m# Get the response from http.client.HTTPConnection\u001b[39;00m\n\u001b[0;32m--> 454\u001b[0m httplib_response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 456\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 457\u001b[0m assert_header_parsing(httplib_response\u001b[38;5;241m.\u001b[39mmsg)\n",
|
||||
"File \u001b[0;32m/usr/lib/python3.9/http/client.py:1347\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1345\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1346\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1347\u001b[0m \u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbegin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1348\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m:\n\u001b[1;32m 1349\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n",
|
||||
"File \u001b[0;32m/usr/lib/python3.9/http/client.py:307\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[38;5;66;03m# read until we get a non-100 response\u001b[39;00m\n\u001b[1;32m 306\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 307\u001b[0m version, status, reason \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_read_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 308\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status \u001b[38;5;241m!=\u001b[39m CONTINUE:\n\u001b[1;32m 309\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n",
|
||||
"File \u001b[0;32m/usr/lib/python3.9/http/client.py:268\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 267\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_read_status\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 268\u001b[0m line \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_MAXLINE\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miso-8859-1\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 269\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(line) \u001b[38;5;241m>\u001b[39m _MAXLINE:\n\u001b[1;32m 270\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m LineTooLong(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstatus line\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
||||
"File \u001b[0;32m/usr/lib/python3.9/socket.py:704\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 703\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 704\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 705\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[1;32m 706\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_timeout_occurred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
|
||||
"File \u001b[0;32m/usr/lib/python3.9/ssl.py:1241\u001b[0m, in \u001b[0;36mSSLSocket.recv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m 1237\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m flags \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 1238\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1239\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m\n\u001b[1;32m 1240\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m)\n\u001b[0;32m-> 1241\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnbytes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1242\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1243\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mrecv_into(buffer, nbytes, flags)\n",
|
||||
"File \u001b[0;32m/usr/lib/python3.9/ssl.py:1099\u001b[0m, in \u001b[0;36mSSLSocket.read\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m 1097\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1098\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m buffer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1099\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sslobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1100\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sslobj\u001b[38;5;241m.\u001b[39mread(\u001b[38;5;28mlen\u001b[39m)\n",
|
||||
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Iterate over each Twitter account\n",
|
||||
"for handle in accounts:\n",
|
||||
" for slice_data in time_slices:\n",
|
||||
" # sleep 1 second to not get over 1sec api limit\n",
|
||||
" time.sleep(1) \n",
|
||||
" # define slice data variables from time_slices\n",
|
||||
" start_time = slice_data['start_time']\n",
|
||||
" end_time = slice_data['end_time']\n",
|
||||
" suffix = slice_data['suffix']\n",
|
||||
" \n",
|
||||
" # define tweepy query with twitter handle of current sen\n",
|
||||
" query = f'from:{handle} -is:retweet'\n",
|
||||
" \n",
|
||||
" # create empty tweetlist that will be filled with tweets of current sen\n",
|
||||
" tweetlist = []\n",
|
||||
" \n",
|
||||
" # statusmsg\n",
|
||||
" msg = f'trying to fetch tweets for {handle}{suffix}'\n",
|
||||
" print(msg)\n",
|
||||
" \n",
|
||||
" # Fetch tweets using tweepy Twitter API v2 pagination\n",
|
||||
" tweets = tweepy.Paginator(client.search_all_tweets,\n",
|
||||
" query=query,\n",
|
||||
" tweet_fields=tweet_fields,\n",
|
||||
" start_time=start_time,\n",
|
||||
" end_time=end_time,\n",
|
||||
" max_results=20).flatten(20)\n",
|
||||
" \n",
|
||||
" # for each tweet returned...\n",
|
||||
" for tweet in tweets:\n",
|
||||
" # ... add that tweet to tweetlist\n",
|
||||
" tweetlist.append(tweet)\n",
|
||||
" \n",
|
||||
" # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration\n",
|
||||
" if len(tweetlist) == 0:\n",
|
||||
" msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'\n",
|
||||
" print(msg)\n",
|
||||
" print(tweets)\n",
|
||||
" continue\n",
|
||||
" \n",
|
||||
" # convert to dataframe\n",
|
||||
" tweet_df = pd.DataFrame(tweetlist)\n",
|
||||
" \n",
|
||||
" # add handle column as api only provides user-ids\n",
|
||||
" tweet_df['handle'] = handle\n",
|
||||
" \n",
|
||||
" ## Extract referenced_tweet info from column\n",
|
||||
" tweet_df['referenced_tweet_type'] = None\n",
|
||||
" tweet_df['referenced_tweet_id'] = None\n",
|
||||
" \n",
|
||||
" # if cond. because in some cases column doesn't exist\n",
|
||||
" if 'referenced_tweets' in tweet_df.columns:\n",
|
||||
" for index, row in tweet_df.iterrows():\n",
|
||||
" referenced_tweets = row['referenced_tweets']\n",
|
||||
" \n",
|
||||
" if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:\n",
|
||||
" referenced_tweet = referenced_tweets[0]\n",
|
||||
" referenced_tweet_type = referenced_tweet['type']\n",
|
||||
" referenced_tweet_id = referenced_tweet['id']\n",
|
||||
" \n",
|
||||
" tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type\n",
|
||||
" tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id\n",
|
||||
" \n",
|
||||
" ## Check if tweet-text contains keyword\n",
|
||||
" # if cond. because in some cases column doesn't exist\n",
|
||||
" if 'text' in tweet_df.columns:\n",
|
||||
" tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))\n",
|
||||
" .str.join(',')\n",
|
||||
" .replace('', 'none'))\n",
|
||||
" \n",
|
||||
" ## Save two versions of the dataset, one with all fields and one without dict fields\n",
|
||||
" # define filepaths\n",
|
||||
" csv_path = f'data/tweets/{handle}{suffix}.csv'\n",
|
||||
" csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'\n",
|
||||
" # save LONG csv\n",
|
||||
" tweet_df.to_csv(csv_path2)\n",
|
||||
" # Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files\n",
|
||||
" # if cond. because in some cases column doesn't exist\n",
|
||||
" if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):\n",
|
||||
" tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)\n",
|
||||
" # save short csv\n",
|
||||
" tweet_df.to_csv(csv_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cb779d9a-cecb-475c-9e76-22c9b8c1928d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Alternative way to fetch tweets via tweepy with retry mechanism"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "c3b4a2ba-46e2-478b-9558-7d6999fdcd69",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"trying to fetch tweets for SenAlexander-slice1\n",
|
||||
"trying to fetch tweets for SenAlexander-slice2\n",
|
||||
"trying to fetch tweets for SenAlexander-slice3\n",
|
||||
"trying to fetch tweets for SenAlexander-slice4\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice1\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice2\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice3\n",
|
||||
"return empty in SenatorEnzi-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice4\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Rate limit exceeded. Sleeping for 437 seconds.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"trying to fetch tweets for CoryGardner-slice1\n",
|
||||
"trying to fetch tweets for CoryGardner-slice2\n",
|
||||
"trying to fetch tweets for CoryGardner-slice3\n",
|
||||
"return empty in CoryGardner-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z\n",
|
||||
"trying to fetch tweets for CoryGardner-slice4\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Rate limit exceeded. Sleeping for 897 seconds.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "AttributeError",
|
||||
"evalue": "module 'tweepy' has no attribute 'TweepError'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[21], line 33\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;66;03m# for each tweet returned...\u001b[39;00m\n\u001b[0;32m---> 33\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m tweet \u001b[38;5;129;01min\u001b[39;00m tweets:\n\u001b[1;32m 34\u001b[0m \u001b[38;5;66;03m# ... add that tweet to tweetlist\u001b[39;00m\n\u001b[1;32m 35\u001b[0m tweetlist\u001b[38;5;241m.\u001b[39mappend(tweet)\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/pagination.py:67\u001b[0m, in \u001b[0;36mPaginator.flatten\u001b[0;34m(self, limit)\u001b[0m\n\u001b[1;32m 66\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m---> 67\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m response \u001b[38;5;129;01min\u001b[39;00m PaginationIterator(\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmethod, \u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkwargs\n\u001b[1;32m 69\u001b[0m ):\n\u001b[1;32m 70\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(response, Response):\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/pagination.py:126\u001b[0m, in \u001b[0;36mPaginationIterator.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpagination_token\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m pagination_token\n\u001b[0;32m--> 126\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 128\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(response, Response):\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/client.py:1163\u001b[0m, in \u001b[0;36mClient.search_all_tweets\u001b[0;34m(self, query, **params)\u001b[0m\n\u001b[1;32m 1162\u001b[0m params[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mquery\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m query\n\u001b[0;32m-> 1163\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1164\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mGET\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/2/tweets/search/all\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1165\u001b[0m \u001b[43m \u001b[49m\u001b[43mendpoint_parameters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1166\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mend_time\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexpansions\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmax_results\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmedia.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1167\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnext_token\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mplace.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpoll.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mquery\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1168\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msince_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msort_order\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstart_time\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtweet.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1169\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43muntil_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43muser.fields\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 1170\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mdata_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mTweet\u001b[49m\n\u001b[1;32m 1171\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/client.py:129\u001b[0m, in \u001b[0;36mBaseClient._make_request\u001b[0;34m(self, method, route, params, endpoint_parameters, json, data_type, user_auth)\u001b[0m\n\u001b[1;32m 127\u001b[0m request_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_process_params(params, endpoint_parameters)\n\u001b[0;32m--> 129\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mroute\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 130\u001b[0m \u001b[43m \u001b[49m\u001b[43mjson\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjson\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muser_auth\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_auth\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 132\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_type \u001b[38;5;129;01mis\u001b[39;00m requests\u001b[38;5;241m.\u001b[39mResponse:\n",
|
||||
"File \u001b[0;32m/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/.venv/lib/python3.9/site-packages/tweepy/client.py:112\u001b[0m, in \u001b[0;36mBaseClient.request\u001b[0;34m(self, method, route, params, json, user_auth)\u001b[0m\n\u001b[1;32m 108\u001b[0m log\u001b[38;5;241m.\u001b[39mwarning(\n\u001b[1;32m 109\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRate limit exceeded. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 110\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSleeping for \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msleep_time\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m seconds.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 111\u001b[0m )\n\u001b[0;32m--> 112\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[43msleep_time\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest(method, route, params, json, user_auth)\n",
|
||||
"\u001b[0;31mKeyboardInterrupt\u001b[0m: ",
|
||||
"\nDuring handling of the above exception, another exception occurred:\n",
|
||||
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[21], line 39\u001b[0m\n\u001b[1;32m 35\u001b[0m tweetlist\u001b[38;5;241m.\u001b[39mappend(tweet)\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m \u001b[38;5;66;03m# exit the retry loop if tweets are successfully fetched\u001b[39;00m\n\u001b[0;32m---> 39\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[43mtweepy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mTweepError\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 40\u001b[0m \u001b[38;5;66;03m# handle rate limit exceeded error\u001b[39;00m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m e\u001b[38;5;241m.\u001b[39mresponse\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m429\u001b[39m:\n\u001b[1;32m 42\u001b[0m \u001b[38;5;66;03m# get the rate limit reset time from the response headers\u001b[39;00m\n\u001b[1;32m 43\u001b[0m reset_time \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(e\u001b[38;5;241m.\u001b[39mresponse\u001b[38;5;241m.\u001b[39mheaders[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx-rate-limit-reset\u001b[39m\u001b[38;5;124m'\u001b[39m])\n",
|
||||
"\u001b[0;31mAttributeError\u001b[0m: module 'tweepy' has no attribute 'TweepError'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Iterate over each Twitter account\n",
|
||||
"for handle in accounts:\n",
|
||||
" for slice_data in time_slices:\n",
|
||||
" # define slice data variables from time_slices\n",
|
||||
" start_time = slice_data['start_time']\n",
|
||||
" end_time = slice_data['end_time']\n",
|
||||
" suffix = slice_data['suffix']\n",
|
||||
" \n",
|
||||
" # define tweepy query with twitter handle of current sen\n",
|
||||
" query = f'from:{handle} -is:retweet'\n",
|
||||
" \n",
|
||||
" # create empty tweetlist that will be filled with tweets of current sen\n",
|
||||
" tweetlist = []\n",
|
||||
" \n",
|
||||
" # statusmsg\n",
|
||||
" msg = f'trying to fetch tweets for {handle}{suffix}'\n",
|
||||
" print(msg)\n",
|
||||
" \n",
|
||||
" # Fetch tweets using tweepy Twitter API v2 pagination with retry mechanism\n",
|
||||
" max_attempts = 3 # maximum number of attempts to fetch tweets for a slice\n",
|
||||
" attempt = 1\n",
|
||||
" \n",
|
||||
" while attempt <= max_attempts:\n",
|
||||
" try:\n",
|
||||
" tweets = tweepy.Paginator(client.search_all_tweets,\n",
|
||||
" query=query,\n",
|
||||
" tweet_fields=tweet_fields,\n",
|
||||
" start_time=start_time,\n",
|
||||
" end_time=end_time,\n",
|
||||
" max_results=20).flatten(20)\n",
|
||||
" \n",
|
||||
" # for each tweet returned...\n",
|
||||
" for tweet in tweets:\n",
|
||||
" # ... add that tweet to tweetlist\n",
|
||||
" tweetlist.append(tweet)\n",
|
||||
" \n",
|
||||
" break # exit the retry loop if tweets are successfully fetched\n",
|
||||
" \n",
|
||||
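"            # NOTE: tweepy v4 removed TweepError; catch tweepy.HTTPException instead (raised as TooManyRequests on 429), which exposes .response\n",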
" except tweepy.TweepError as e:\n",
|
||||
" # handle rate limit exceeded error\n",
|
||||
" if e.response.status_code == 429:\n",
|
||||
" # get the rate limit reset time from the response headers\n",
|
||||
" reset_time = int(e.response.headers['x-rate-limit-reset'])\n",
|
||||
" current_time = int(time.time())\n",
|
||||
" \n",
|
||||
" # calculate the sleep time until the rate limit resets\n",
|
||||
" sleep_time = reset_time - current_time + 1 # add an extra second\n",
|
||||
" \n",
|
||||
" # sleep until the rate limit resets\n",
|
||||
" time.sleep(sleep_time)\n",
|
||||
" \n",
|
||||
" attempt += 1 # increment the attempt counter\n",
|
||||
" continue # retry the API call\n",
|
||||
" \n",
|
||||
" else:\n",
|
||||
" # handle other types of Tweepy errors\n",
|
||||
" print(f'Error occurred: {e}')\n",
|
||||
" break\n",
|
||||
" \n",
|
||||
" # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration\n",
|
||||
" if len(tweetlist) == 0:\n",
|
||||
" msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'\n",
|
||||
" print(msg)\n",
|
||||
" continue\n",
|
||||
" \n",
|
||||
" # convert to dataframe\n",
|
||||
" tweet_df = pd.DataFrame(tweetlist)\n",
|
||||
" \n",
|
||||
" # add handle column as api only provides user-ids\n",
|
||||
" tweet_df['handle'] = handle\n",
|
||||
" \n",
|
||||
" ## Extract referenced_tweet info from column\n",
|
||||
" tweet_df['referenced_tweet_type'] = None\n",
|
||||
" tweet_df['referenced_tweet_id'] = None\n",
|
||||
" \n",
|
||||
" # if cond. because in some cases column doesn't exist\n",
|
||||
" if 'referenced_tweets' in tweet_df.columns:\n",
|
||||
" for index, row in tweet_df.iterrows():\n",
|
||||
" referenced_tweets = row['referenced_tweets']\n",
|
||||
" \n",
|
||||
" if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:\n",
|
||||
" referenced_tweet = referenced_tweets[0]\n",
|
||||
" referenced_tweet_type = referenced_tweet['type']\n",
|
||||
" referenced_tweet_id = referenced_tweet['id']\n",
|
||||
" \n",
|
||||
" tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type\n",
|
||||
" tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id\n",
|
||||
" \n",
|
||||
" ## Check if tweet-text contains keyword\n",
|
||||
" # if cond. because in some cases column doesn't exist\n",
|
||||
" if 'text' in tweet_df.columns:\n",
|
||||
" tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))\n",
|
||||
" .str.join(',')\n",
|
||||
" .replace('', 'none'))\n",
|
||||
" \n",
|
||||
" ## Save two versions of the dataset, one with all fields and one without dict fields\n",
|
||||
" # define filepaths\n",
|
||||
" csv_path = f'data/tweets/{handle}{suffix}.csv'\n",
|
||||
" csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'\n",
|
||||
" # save LONG csv\n",
|
||||
" tweet_df.to_csv(csv_path2)\n",
|
||||
" # Remove 'context_annotations', 'entities' and 'referenced_tweets' columns for short csv files\n",
|
||||
" # if cond. because in some cases column doesn't exist\n",
|
||||
" if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):\n",
|
||||
" tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)\n",
|
||||
" # save short csv\n",
|
||||
" tweet_df.to_csv(csv_path)\n",
|
||||
" \n",
|
||||
" # sleep 1 second to not exceed the API rate limit\n",
|
||||
" time.sleep(1)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b5dd5498-1ba4-4f0a-9bb9-ffce4655212d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"path_to_tweetdfs = wd + td\n",
|
||||
"os.chdir(path_to_tweetdfs)\n",
|
||||
"tweetfiles = glob.glob('*.{}'.format(\"csv\"))\n",
|
||||
"\n",
|
||||
"print(tweetfiles)\n",
|
||||
"\n",
|
||||
"# save merged csv as two files \n",
|
||||
"df_all_senators = pd.DataFrame()\n",
|
||||
"df_all_senators_long = pd.DataFrame()\n",
|
||||
"for file in tweetfiles:\n",
|
||||
"\tif \"LONG\" in file:\n",
|
||||
"\t\tdf = pd.read_csv(file)\n",
|
||||
"\t\tdf_all_senators_long = pd.concat([df, df_all_senators_long])\n",
|
||||
"\telse:\n",
|
||||
"\t\tdf = pd.read_csv(file)\n",
|
||||
"\t\tdf_all_senators = pd.concat([df, df_all_senators])\n",
|
||||
"csv_path = td + \"ALL-SENATORS.csv\"\n",
|
||||
"csv_path2 = td + \"ALL-SENATORS-LONG-LONG.csv\"\n",
|
||||
"df_all_senators.to_csv(csv_path) \n",
|
||||
"df_all_senators_long.to_csv(csv_path2)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "320ebbf4-8eaf-4189-836b-5d5aa8a0a263",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"trying to fetch tweets for SenAlexander-slice1\n",
|
||||
"trying to fetch tweets for SenAlexander-slice2\n",
|
||||
"trying to fetch tweets for SenAlexander-slice3\n",
|
||||
"trying to fetch tweets for SenAlexander-slice4\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice1\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice2\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice3\n",
|
||||
"return empty in SenatorEnzi-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z\n",
|
||||
"trying to fetch tweets for SenatorEnzi-slice4\n",
|
||||
"trying to fetch tweets for CoryGardner-slice1\n",
|
||||
"trying to fetch tweets for CoryGardner-slice2\n",
|
||||
"trying to fetch tweets for CoryGardner-slice3\n",
|
||||
"return empty in CoryGardner-slice3 - from 2021-01-01T00:00:01Z to 2021-06-01T00:00:00Z\n",
|
||||
"trying to fetch tweets for CoryGardner-slice4\n",
|
||||
"trying to fetch tweets for VP-slice1\n",
|
||||
"trying to fetch tweets for VP-slice2\n",
|
||||
"trying to fetch tweets for VP-slice3\n",
|
||||
"trying to fetch tweets for VP-slice4\n",
|
||||
"trying to fetch tweets for SenatorIsakson-slice1\n",
|
||||
"trying to fetch tweets for SenatorIsakson-slice2\n",
|
||||
"trying to fetch tweets for SenatorIsakson-slice3\n",
|
||||
"trying to fetch tweets for SenatorIsakson-slice4\n",
|
||||
"trying to fetch tweets for DougJones-slice1\n",
|
||||
"trying to fetch tweets for DougJones-slice2\n",
|
||||
"trying to fetch tweets for DougJones-slice3\n",
|
||||
"trying to fetch tweets for DougJones-slice4\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "KeyboardInterrupt",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[24], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m handle \u001b[38;5;129;01min\u001b[39;00m accounts:\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m slice_data \u001b[38;5;129;01min\u001b[39;00m time_slices:\n\u001b[0;32m----> 4\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m1.01\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# define slice data variables from time_slices\u001b[39;00m\n\u001b[1;32m 6\u001b[0m start_time \u001b[38;5;241m=\u001b[39m slice_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstart_time\u001b[39m\u001b[38;5;124m'\u001b[39m]\n",
|
||||
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Iterate over each Twitter account\n",
|
||||
"for handle in accounts:\n",
|
||||
" for slice_data in time_slices:\n",
|
||||
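"        # sleep just over 1 second per request to stay under the API limit of 1 request per second\n",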
" time.sleep(1.01)\n",
|
||||
" # define slice data variables from time_slices\n",
|
||||
" start_time = slice_data['start_time']\n",
|
||||
" end_time = slice_data['end_time']\n",
|
||||
" suffix = slice_data['suffix']\n",
|
||||
" \n",
|
||||
" # define tweepy query with twitter handle of current sen\n",
|
||||
" query = f'from:{handle} -is:retweet'\n",
|
||||
" \n",
|
||||
" # create empty tweetlist that will be filled with tweets of current sen\n",
|
||||
" tweetlist = []\n",
|
||||
" \n",
|
||||
" # statusmsg\n",
|
||||
" msg = f'trying to fetch tweets for {handle}{suffix}'\n",
|
||||
" print(msg)\n",
|
||||
" \n",
|
||||
" # Fetch tweets using tweepy Twitter API v2 pagination with retry mechanism\n",
|
||||
" max_attempts = 3 # maximum number of attempts to fetch tweets for a slice\n",
|
||||
" attempt = 1\n",
|
||||
" \n",
|
||||
" while attempt <= max_attempts:\n",
|
||||
" try:\n",
|
||||
" tweets = tweepy.Paginator(client.search_all_tweets,\n",
|
||||
" query=query,\n",
|
||||
" tweet_fields=tweet_fields,\n",
|
||||
" start_time=start_time,\n",
|
||||
" end_time=end_time,\n",
|
||||
" max_results=20).flatten(20)\n",
|
||||
" \n",
|
||||
" # for each tweet returned...\n",
|
||||
" for tweet in tweets:\n",
|
||||
" # ... add that tweet to tweetlist\n",
|
||||
" tweetlist.append(tweet)\n",
|
||||
" \n",
|
||||
" # Check if no tweets fetched for the current time slice. If there are no tweets, skip to next time_slices loop iteration\n",
|
||||
" if len(tweetlist) == 0:\n",
|
||||
" msg = f'return empty in {handle}{suffix} - from {start_time} to {end_time}'\n",
|
||||
" print(msg)\n",
|
||||
" break\n",
|
||||
" \n",
|
||||
" # convert to dataframe\n",
|
||||
" tweet_df = pd.DataFrame(tweetlist)\n",
|
||||
" \n",
|
||||
" # add handle column as API only provides user-ids\n",
|
||||
" tweet_df['handle'] = handle\n",
|
||||
" \n",
|
||||
" ## Extract referenced_tweet info from column\n",
|
||||
" tweet_df['referenced_tweet_type'] = None\n",
|
||||
" tweet_df['referenced_tweet_id'] = None\n",
|
||||
" \n",
|
||||
" # if cond. because in some cases column doesn't exist\n",
|
||||
" if 'referenced_tweets' in tweet_df.columns:\n",
|
||||
" for index, row in tweet_df.iterrows():\n",
|
||||
" referenced_tweets = row['referenced_tweets']\n",
|
||||
" \n",
|
||||
" if isinstance(referenced_tweets, list) and len(referenced_tweets) > 0:\n",
|
||||
" referenced_tweet = referenced_tweets[0]\n",
|
||||
" referenced_tweet_type = referenced_tweet['type']\n",
|
||||
" referenced_tweet_id = referenced_tweet['id']\n",
|
||||
" \n",
|
||||
" tweet_df.at[index, 'referenced_tweet_type'] = referenced_tweet_type\n",
|
||||
" tweet_df.at[index, 'referenced_tweet_id'] = referenced_tweet_id\n",
|
||||
" \n",
|
||||
" ## Check if tweet-text contains keyword\n",
|
||||
" # if cond. because in some cases column doesn't exist\n",
|
||||
" if 'text' in tweet_df.columns:\n",
|
||||
" tweet_df['contains_keyword'] = (tweet_df['text'].str.findall('|'.join(keywords))\n",
|
||||
" .str.join(',')\n",
|
||||
" .replace('', 'none'))\n",
|
||||
" \n",
|
||||
" ## Save two versions of the dataset, one with all fields and one without dict fields\n",
|
||||
" # define filepaths\n",
|
||||
" csv_path = f'data/tweets/{handle}{suffix}.csv'\n",
|
||||
" csv_path2 = f'data/tweets/{handle}{suffix}-LONG.csv'\n",
|
||||
" # save LONG csv\n",
|
||||
" tweet_df.to_csv(csv_path2)\n",
|
||||
" # Remove 'context_annotations', 'entities', and 'referenced_tweets' columns for short csv files\n",
|
||||
" # if cond. because in some cases column doesn't exist\n",
|
||||
" if all(k in tweet_df for k in ('context_annotations', 'entities', 'referenced_tweets')):\n",
|
||||
" tweet_df = tweet_df.drop(['context_annotations', 'entities', 'referenced_tweets'], axis=1)\n",
|
||||
" # save short csv\n",
|
||||
" tweet_df.to_csv(csv_path)\n",
|
||||
" \n",
|
||||
" # break out of the retry loop since fetching tweets was successful\n",
|
||||
" break\n",
|
||||
" \n",
|
||||
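"            # NOTE: tweepy v4 removed TweepError; tweepy.HTTPException provides .response for the status/header checks below\n",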
" except tweepy.TweepError as e:\n",
|
||||
" if e.response.status_code == 429: # rate limit exceeded\n",
|
||||
" reset_time = int(e.response.headers['x-rate-limit-reset'])\n",
|
||||
" wait_time = reset_time - time.time() + 5 # add additional 5 seconds as buffer\n",
|
||||
" \n",
|
||||
" print(f\"Rate limit exceeded. Sleeping for {wait_time} seconds.\")\n",
|
||||
" time.sleep(wait_time)\n",
|
||||
" \n",
|
||||
" attempt += 1 # increment the attempt counter\n",
|
||||
" else:\n",
|
||||
" print(f\"Error occurred: {e}\")\n",
|
||||
" break"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "48688858-104d-4f2f-87b8-ed103f34b4e8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Politics & Society",
|
||||
"language": "python",
|
||||
"name": "polsoc"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.2"
|
||||
},
|
||||
"toc-autonumbering": true,
|
||||
"toc-showmarkdowntxt": false
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
62
collect.py
62
collect.py
@ -4,9 +4,12 @@ Created on Thu Jun 8 01:08:21 2023
|
||||
|
||||
@author: Michael
|
||||
|
||||
collect.py scrapes tweets from US senators who were in office between
|
||||
2020 and the beginning of 2023.
|
||||
|
||||
# https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
|
||||
|
||||
Following files are necessary:
|
||||
# Following files are necessary:
|
||||
funs/TimeSlice.py
|
||||
Function get_Tslices slices the defined timespan in config.py into N
|
||||
slices. This is necessary due to possible blocking of requests by Twitter.
|
||||
@ -17,26 +20,32 @@ Following files are necessary:
|
||||
"keywords.txt".
|
||||
funs/Scrape.py
|
||||
scrapes using snscrape.modules.twitter. See docstring.
|
||||
data/keywords-raw.txt
|
||||
data/IN/keywords-raw.txt
|
||||
Contains all keywords that are used to detect whether a tweet contains
|
||||
information about Covid19.
|
||||
data/senators-raw.csv
|
||||
data/IN/senators-raw.csv
|
||||
Contains the senator dataset converted to csv. Is used to get the
|
||||
account names of all senators' Twitter accounts.
|
||||
|
||||
Requirements:
|
||||
# Requirements:
|
||||
- snscrape 0.6.2.20230321+
|
||||
- pandas 2.0+
|
||||
The script will first import needed libraries.
|
||||
# IMPORTANT:
|
||||
This script uses snscrape Version 0.6.2.20230321.dev50+g0d824ab which is
|
||||
included in 'snscrape/' as a git repository for better reproducibility. Earlier
|
||||
versions of snscrape will most likely fail to scrape all tweets because of
|
||||
certain rate limits or other errors that may occur.
|
||||
config.py will check whether snscrape is already installed. If not, it will try
|
||||
to install the included version automatically.
|
||||
Install snscrape from the local git repo to make sure that it matches the version used.
|
||||
If snscrape shall be installed from the local repo, uncomment the following lines:
|
||||
|
||||
How to use:
|
||||
- To run the script, first adjust the config.py file.
|
||||
import subprocess
|
||||
os.chdir('snscrape/')
|
||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
|
||||
os.chdir(wd)
|
||||
|
||||
|
||||
# How to use:
|
||||
- To run the script, first adjust the options found in the following lines.
|
||||
- config.py will check whether snscrape is already installed. If not, it will try
|
||||
to install the included version automatically.
|
||||
- run the script
|
||||
@ -57,7 +66,6 @@ which is the final output.
|
||||
import os
|
||||
import pandas as pd
|
||||
import glob
|
||||
import time
|
||||
import sys
|
||||
from datetime import datetime
|
||||
import concurrent.futures
|
||||
@ -82,7 +90,7 @@ file_alltweets = "ALL-SENATORS-TWEETS.csv"
|
||||
path_to_tweetdfs = wd + td
|
||||
|
||||
# Name of logfile
|
||||
logfile = wd+"log/log_"
|
||||
logfile = f"{wd}log/log_"
|
||||
|
||||
###################
|
||||
# Define Timespan & time-format
|
||||
@ -97,16 +105,6 @@ fTimeFormat = "%Y-%m-%d_%H-%M-%S"
|
||||
# Maximum tweets to be scraped by snscrape. Can be left untouched.
|
||||
maxTweets = 5000
|
||||
|
||||
###################
|
||||
# Install snscrape from the local git repo to make sure that it matches the version used.
|
||||
# If snscrape shall be installed from the local repo, uncomment the following lines:
|
||||
"""
|
||||
import subprocess
|
||||
os.chdir('snscrape/')
|
||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
|
||||
os.chdir(wd)
|
||||
"""
|
||||
|
||||
# Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
|
||||
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
|
||||
# get subparams just like in user where user id can be obtained by user.id
|
||||
@ -146,10 +144,16 @@ tweetDFColumns = [
|
||||
"source",
|
||||
]
|
||||
|
||||
## Import functions
|
||||
from funs.TimeSlice import *
|
||||
from funs.ClearDupes import deDupe
|
||||
from funs.Scrape import scrapeTweets
|
||||
#############################################################################
|
||||
################## do NOT change anything below this line ###################
|
||||
#############################################################################
|
||||
|
||||
## Import own functions
|
||||
funs = wd+"funs"
|
||||
sys.path.insert(1, funs)
|
||||
from TimeSlice import get_Tslices
|
||||
from ClearDupes import deDupe
|
||||
from Scrape import scrapeTweets
|
||||
|
||||
###################
|
||||
# Create logfile & log all outputs
|
||||
@ -186,8 +190,8 @@ print("---")
|
||||
###################
|
||||
# Senator Accounts
|
||||
# Get accounts & alt-accounts from Senators-Datafile
|
||||
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
|
||||
alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
|
||||
accounts = pd.read_csv(f"{di}senators-raw.csv")["twitter_handle"].tolist()
|
||||
alt_accounts = pd.read_csv(f"{di}senators-raw.csv")["alt_handle"].tolist()
|
||||
alt_accounts = [x for x in alt_accounts if str(x) != 'nan'] # remove empty alt_accounts fields
|
||||
accounts.extend(alt_accounts)
|
||||
|
||||
@ -248,7 +252,7 @@ with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.log", "w"
|
||||
if file not in tweetfiles:
|
||||
fout.write(f'Missing: {file}.\n') # if file is not in tweetfiles, print error message.
|
||||
else:
|
||||
fout.write('all slices scraped.')
|
||||
fout.write(f'{file:<30}:all slices scraped.\n')
|
||||
|
||||
## Merge .csv files.
|
||||
# check if file_alltweets (previously scraped tweets that have been merged
|
||||
@ -269,6 +273,8 @@ if tweetfiles:
|
||||
fout.write(f.read())
|
||||
os.chdir(wd) # go back to wd
|
||||
|
||||
###################
|
||||
# finish logging
|
||||
# Report timing info.
|
||||
timeEndMerge = datetime.now()
|
||||
print("---")
|
||||
|
166
collectSenData.py
Normal file
166
collectSenData.py
Normal file
@ -0,0 +1,166 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Thu Jun 23 21:49:11 2023
|
||||
|
||||
@author: Michael
|
||||
|
||||
collectSenData.py scrapes accounts of senators for the following data: the
|
||||
number of followers, the number of users the twitter account is following,
|
||||
and how long the twitter account has existed.
|
||||
|
||||
# Requirements:
|
||||
- snscrape 0.6.2.20230321+
|
||||
- pandas 2.0+
|
||||
# IMPORTANT:
|
||||
This script uses snscrape Version 0.6.2.20230321.dev50+g0d824ab which is
|
||||
included in 'snscrape/' as a git repository for better reproducibility. Earlier
|
||||
versions of snscrape will most likely fail to scrape all tweets because of
|
||||
certain rate limits or other errors that may occur.
|
||||
Install snscrape from the local git repo to make sure that it matches the version used.
|
||||
If snscrape shall be installed from the local repo, uncomment the following lines:
|
||||
|
||||
import subprocess
|
||||
os.chdir('snscrape/')
|
||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
|
||||
os.chdir(wd)
|
||||
|
||||
|
||||
# How to use:
|
||||
"""
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
import glob
|
||||
import time
|
||||
import sys
|
||||
from datetime import datetime
|
||||
import concurrent.futures
|
||||
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Name of file that all senator data will be written to
|
||||
senCSV = "ALL-SENATORS.csv"
|
||||
|
||||
# don't change this one
|
||||
senCSVPath = wd + ud + senCSV
|
||||
|
||||
# Name of logfile
|
||||
logfile = wd+"log/UserLog_"
|
||||
|
||||
###################
|
||||
# Define Timespan & time-format
|
||||
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
|
||||
ts_beg = "2020-01-01T00:00:00Z" # start of scraping
|
||||
ts_end = "2023-01-03T00:00:00Z" # end of straping
|
||||
no_slices = 24 # Number of slices / time periods.
|
||||
|
||||
# file time format
|
||||
fTimeFormat = "%Y-%m-%d_%H-%M-%S"
|
||||
|
||||
# Maximum tweets to be scraped by snscrape. Can be left untouched.
|
||||
maxTweets = 5000
|
||||
|
||||
# Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
|
||||
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
|
||||
# get subparams just like in user where user id can be obtained by user.id
|
||||
userDFColumns = [
|
||||
"id",
|
||||
"username",
|
||||
"followersCount",
|
||||
"friendsCount",
|
||||
"verified",
|
||||
"created"
|
||||
]
|
||||
|
||||
#############################################################################
|
||||
################## do NOT change anything below this line ###################
|
||||
#############################################################################
|
||||
|
||||
from funs.Scrape import scrapeUsers, getHandles, printHandles
|
||||
from funs.TimeSlice import convertTime
|
||||
|
||||
|
||||
###################
|
||||
# Create logfile & log all outputs
|
||||
# there are three logfile types to be found in /log.
|
||||
# should be self explanatory.
|
||||
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".log"
|
||||
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".log"
|
||||
sys.stderr = open(logfileErrors, "w")
|
||||
sys.stdout = open(logfilen, "w")
|
||||
|
||||
|
||||
###################
|
||||
# Senator Accounts
|
||||
# Get accounts & alt-accounts from Senators-Datafile
|
||||
accounts = getHandles(di)
|
||||
|
||||
# Print accounts to be scraped
|
||||
print(printHandles(accounts))
|
||||
|
||||
###################
|
||||
# Scraping
|
||||
# report time:
|
||||
timeStartScrape = datetime.now()
|
||||
print("Starting scraping at:")
|
||||
print(timeStartScrape.strftime(fTimeFormat))
|
||||
print("---")
|
||||
|
||||
# List that collects one row of user data per account
|
||||
listUsers = []
|
||||
# Iterate over each Twitter account using multiprocessing
|
||||
with concurrent.futures.ProcessPoolExecutor() as executor:
|
||||
# List to store the scraping tasks
|
||||
tasks = []
|
||||
for handle in accounts:
|
||||
# Schedule the scraping task
|
||||
task = executor.submit(
|
||||
scrapeUsers, handle, userDFColumns
|
||||
)
|
||||
tasks.append(task)
|
||||
|
||||
# Wait for all tasks to complete and retrieve results
|
||||
for task in concurrent.futures.as_completed(tasks):
|
||||
result = task.result()
|
||||
listUsers.append(result)
|
||||
|
||||
dfUsers = pd.DataFrame(listUsers, columns=userDFColumns)
|
||||
dfUsers.to_csv(senCSVPath, encoding='utf-8')
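# to_csv also writes the DataFrame index as an unnamed first column by default;
# pass index=False if that extra column is not wanted in the output file.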
|
||||
|
||||
# report time:
|
||||
timeEndScrape = datetime.now()
|
||||
print("---")
|
||||
print("End of scraping at:")
|
||||
print(timeEndScrape.strftime(fTimeFormat))
|
||||
|
||||
# Report timing info.
|
||||
timeEndMerge = datetime.now()
|
||||
print("---")
|
||||
print("End of scraping at:")
|
||||
print(timeEndMerge.strftime(fTimeFormat))
|
||||
print("---")
|
||||
# calculate times:
|
||||
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # total execution time
|
||||
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) # scraping time
|
||||
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) # merge time
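# convertTime() comes from funs/TimeSlice.py (not shown in this diff); it is
# assumed to split a datetime.timedelta into whole hours, minutes and seconds,
# roughly equivalent to:
#
#   def convertTime(td):
#       total = int(td.total_seconds())
#       return total // 3600, (total % 3600) // 60, total % 60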
|
||||
print(
|
||||
f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
|
||||
)
|
||||
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
|
||||
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
|
||||
|
||||
print(listUsers)
|
||||
# close connection to logfiles.
|
||||
sys.stdout.close()
|
||||
sys.stderr.close()
|
144
createGraphs.py
Normal file
@ -0,0 +1,144 @@
|
||||
#%%
|
||||
#!/usr/bin/env python3
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from wordcloud import WordCloud
|
||||
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
|
||||
import string
|
||||
#%%
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Mon Jun 26 20:36:43 2023
|
||||
|
||||
@author: michael
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
# import pyreadstat
|
||||
# import numpy as np
|
||||
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Name of file that all senator data will be written to
|
||||
senCSV = "SenatorsTweets-OnlyCov.csv" # SenatorsTweets-Final.csv SenatorsTweets-OnlyCov.csv
|
||||
|
||||
# Name of file that all senator data will be written to
|
||||
senDataset = "senators-raw.csv"
|
||||
|
||||
# Name of new datafile generated
|
||||
senCSVc = "SenatorsTweets-Final.csv"
|
||||
senCSVcCov = "SenatorsTweets-OnlyCov.csv"
|
||||
|
||||
# Outfiles
|
||||
wcAllTweetsF = "graphs/Wordcloud-All.png"
|
||||
wcCovTweetsF = "graphs/Wordcloud-Cov.png"
|
||||
TwCovTimeline = "graphs/Timeline.png"
|
||||
|
||||
# don't change this one
|
||||
senCSVcPath = wd + ud + senCSVc
|
||||
senCSVcCovPath = wd + ud + senCSVcCov
|
||||
wcAllTweetsFPath = wd + ud + wcAllTweetsF
|
||||
wcCovTweetsFPath = wd + ud + wcCovTweetsF
|
||||
TwCovTimelinePath = wd + ud + TwCovTimeline
|
||||
|
||||
#%%
|
||||
df = pd.read_csv(senCSVcPath, dtype=(object))
|
||||
dfCov = pd.read_csv(senCSVcCovPath, dtype=(object))
|
||||
#%%
|
||||
df['cleanContent'] = df['rawContent'].apply(remove_URL)
|
||||
df['cleanContent'] = df['cleanContent'].apply(remove_emoji)
|
||||
df['cleanContent'] = df['cleanContent'].apply(remove_html)
|
||||
df['cleanContent'] = df['cleanContent'].apply(remove_punct)
|
||||
|
||||
# create string with all cleaned tweets as text
|
||||
str_alltweets = df['cleanContent'].astype(str).str.cat(sep=' ').casefold()
|
||||
#%%
|
||||
dfCov['cleanContent'] = dfCov['rawContent'].apply(remove_URL)
|
||||
dfCov['cleanContent'] = dfCov['cleanContent'].apply(remove_emoji)
|
||||
dfCov['cleanContent'] = dfCov['cleanContent'].apply(remove_html)
|
||||
dfCov['cleanContent'] = dfCov['cleanContent'].apply(remove_punct)
|
||||
|
||||
# create string with all cleaned tweets as text
|
||||
str_covtweets = dfCov['cleanContent'].astype(str).str.cat(sep=' ').casefold()
|
||||
#%%
|
||||
# remove stray single 'u' and 's' tokens (left over from "U.S." after punctuation removal)
|
||||
str_covtweets = str_covtweets.replace(' u ', ' ')
|
||||
str_covtweets = str_covtweets.replace(' s ', ' ')
|
||||
str_alltweets = str_alltweets.replace(' u ', ' ')
|
||||
str_alltweets = str_alltweets.replace(' s ', ' ')
|
||||
|
||||
|
||||
# %%
|
||||
# create wordcloud alltweets
|
||||
wcA = WordCloud(background_color="white", width=1000, height=1000, repeat=True)
|
||||
wcA.generate(str_alltweets)
|
||||
|
||||
#%%
|
||||
# draw
|
||||
plt.figure( figsize=(20,20))
|
||||
plt.axis("off")
|
||||
plt.imshow(wcA, interpolation="bilinear")
|
||||
fig1 = plt.gcf()
|
||||
plt.show()
|
||||
fig1.savefig(wcAllTweetsFPath)
|
||||
|
||||
# %%
|
||||
# create wordcloud covtweets
|
||||
wcC = WordCloud(background_color="white", width=1000, height=1000, repeat=True)
|
||||
wcC.generate(str_covtweets)
|
||||
#%%
|
||||
# draw
|
||||
plt.figure( figsize=(20,20))
|
||||
plt.axis("off")
|
||||
plt.imshow(wcC, interpolation="bilinear")
|
||||
fig2 = plt.gcf()
|
||||
plt.show()
|
||||
fig2.savefig(wcCovTweetsFPath)
|
||||
# %%
|
||||
# with open('test.txt', 'w') as f:
|
||||
# f.write(str_covtweets)
|
||||
# %%
|
||||
dfT = pd.DataFrame()
|
||||
dfT['date'] = df['date'].copy()
|
||||
dfT['count'] = 1
|
||||
|
||||
dfCovT = pd.DataFrame()
|
||||
dfCovT['date'] = dfCov['date'].copy()
|
||||
dfCovT['count'] = 1
|
||||
#%%
|
||||
dfT['date'] = pd.to_datetime(dfT['date']).dt.strftime('%Y-%m-%d')
|
||||
dfCovT['date'] = pd.to_datetime(dfCovT['date']).dt.strftime('%Y-%m-%d')
|
||||
|
||||
#%%
|
||||
dfT = dfT.groupby('date').count().reset_index()
|
||||
dfCovT = dfCovT.groupby('date').count().reset_index()
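# after the groupby, dfT and dfCovT hold one row per day with that day's number
# of tweets in 'count'; these daily counts feed the timeline plot below.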
|
||||
|
||||
#%%
|
||||
import matplotlib.dates as mdates
|
||||
# n of tweets overall
|
||||
my_dpi=300
|
||||
plt.figure(figsize=(1000/my_dpi, 1500/my_dpi), dpi=my_dpi)
|
||||
plt.style.use('seaborn-darkgrid')
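# note: matplotlib >= 3.6 deprecates the bare seaborn style names; if this line
# warns or fails, 'seaborn-v0_8-darkgrid' is the renamed equivalent.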
|
||||
fig, ax = plt.subplots(figsize=(8, 6))
|
||||
ax.plot(dfCovT['date'], dfCovT['count'], marker='', color='tab:blue', linewidth=1, alpha=0.4)
|
||||
ax.plot(dfT['date'], dfT['count'], marker='', color='tab:blue', linewidth=1, alpha=1)
|
||||
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
|
||||
ax.xaxis.set_minor_locator(mdates.MonthLocator())
|
||||
fig.autofmt_xdate()
|
||||
fig.savefig(TwCovTimelinePath)
|
||||
|
||||
|
||||
# %%
|
0
data/IN/.gitkeep
Normal file
23
data/IN/counterKeywords.txt
Normal file
@ -0,0 +1,23 @@
|
||||
opioid
|
||||
gun violence
|
||||
gun-violence
|
||||
CHD
|
||||
Coronary heart disease
|
||||
addiction
|
||||
tobacco
|
||||
vaping
|
||||
e-cigarette
|
||||
shooting
|
||||
indigenous women
|
||||
overdose
|
||||
meth
|
||||
cocaine
|
||||
separated children
|
||||
separating children
|
||||
separating families
|
||||
Muslim travel ban
|
||||
flu-season
|
||||
flu season
|
||||
Soleimani
|
||||
Muslim Ban
|
||||
USMCA trade deal
|
23
data/IN/counterKeywordsFinal.txt
Normal file
@ -0,0 +1,23 @@
|
||||
meth
|
||||
gun violence
|
||||
flu season
|
||||
vaping
|
||||
chd
|
||||
addiction
|
||||
indigenous women
|
||||
separating children
|
||||
tobacco
|
||||
e-cigarette
|
||||
muslim ban
|
||||
soleimani
|
||||
cocaine
|
||||
separating families
|
||||
muslim travel ban
|
||||
usmca trade deal
|
||||
shooting
|
||||
overdose
|
||||
separated children
|
||||
coronary heart disease
|
||||
gun-violence
|
||||
opioid
|
||||
flu-season
|
190
data/IN/keywords.txt
Normal file
@ -0,0 +1,190 @@
|
||||
plandemic
|
||||
scamdemic
|
||||
wuhan flu
|
||||
wuhanflu
|
||||
corona
|
||||
coronavirusoutbreak
|
||||
pandemic
|
||||
epidemic
|
||||
vax
|
||||
antivax
|
||||
antivaxxers
|
||||
wearamask
|
||||
masksoff
|
||||
cdc
|
||||
ncov
|
||||
sars-cov-2
|
||||
socialdistancing
|
||||
wear a mask
|
||||
lockdown
|
||||
covd
|
||||
coronavirus
|
||||
koronavirus
|
||||
corona
|
||||
cdc
|
||||
wuhancoronavirus
|
||||
wuhanlockdown
|
||||
ncov
|
||||
wuhan
|
||||
n95
|
||||
kungflu
|
||||
epidemic
|
||||
outbreak
|
||||
sinophobia
|
||||
covid-19
|
||||
corona virus
|
||||
covid
|
||||
covid19
|
||||
sars-cov-2
|
||||
covidー19
|
||||
covd
|
||||
pandemic
|
||||
coronapocalypse
|
||||
canceleverything
|
||||
coronials
|
||||
socialdistancingnow
|
||||
social distancing
|
||||
socialdistancing
|
||||
panicbuy
|
||||
panic buy
|
||||
panicbuying
|
||||
panic buying
|
||||
14dayquarantine
|
||||
duringmy14dayquarantine
|
||||
panic shop
|
||||
panic shopping
|
||||
panicshop
|
||||
inmyquarantinesurvivalkit
|
||||
panic-buy
|
||||
panic-shop
|
||||
coronakindness
|
||||
quarantinelife
|
||||
chinese virus
|
||||
chinesevirus
|
||||
stayhomechallenge
|
||||
stay home challenge
|
||||
sflockdown
|
||||
dontbeaspreader
|
||||
lockdown
|
||||
lock down
|
||||
shelteringinplace
|
||||
sheltering in place
|
||||
staysafestayhome
|
||||
stay safe stay home
|
||||
trumppandemic
|
||||
trump pandemic
|
||||
flattenthecurve
|
||||
flatten the curve
|
||||
china virus
|
||||
chinavirus
|
||||
quarentinelife
|
||||
ppeshortage
|
||||
saferathome
|
||||
stayathome
|
||||
stay at home
|
||||
stay home
|
||||
stayhome
|
||||
getmeppe
|
||||
covidiot
|
||||
epitwitter
|
||||
pandemie
|
||||
wear a mask
|
||||
wearamask
|
||||
kung flu
|
||||
covididiot
|
||||
covid__19
|
||||
omicron
|
||||
variant
|
||||
vaccine
|
||||
travel ban
|
||||
corona
|
||||
corona
|
||||
coronavirus
|
||||
coronavirus
|
||||
covid
|
||||
covid
|
||||
covid19
|
||||
covid19
|
||||
covid-19
|
||||
covid-19
|
||||
sarscov2
|
||||
sarscov2
|
||||
sars cov2
|
||||
sars cov 2
|
||||
covid_19
|
||||
covid_19
|
||||
ncov
|
||||
ncov
|
||||
ncov2019
|
||||
ncov2019
|
||||
2019-ncov
|
||||
2019-ncov
|
||||
pandemic
|
||||
pandemic 2019ncov
|
||||
2019ncov
|
||||
quarantine
|
||||
quarantine
|
||||
flatten the curve
|
||||
flattening the curve
|
||||
flatteningthecurve
|
||||
flattenthecurve
|
||||
hand sanitizer
|
||||
handsanitizer
|
||||
lockdown
|
||||
lockdown
|
||||
social distancing
|
||||
socialdistancing
|
||||
work from home
|
||||
workfromhome
|
||||
working from home
|
||||
workingfromhome
|
||||
n95
|
||||
n95
|
||||
covidiots
|
||||
covidiots
|
||||
herd immunity
|
||||
herdimmunity
|
||||
pneumonia
|
||||
pneumonia
|
||||
chinese virus
|
||||
chinesevirus
|
||||
wuhan virus
|
||||
wuhanvirus
|
||||
kung flu
|
||||
kungflu
|
||||
wearamask
|
||||
wearamask
|
||||
wear a mask
|
||||
vaccine
|
||||
vaccines
|
||||
vaccine
|
||||
vaccines
|
||||
corona vaccine
|
||||
corona vaccines
|
||||
coronavaccine
|
||||
coronavaccines
|
||||
face shield
|
||||
faceshield
|
||||
face shields
|
||||
faceshields
|
||||
health worker
|
||||
healthworker
|
||||
health workers
|
||||
healthworkers
|
||||
stayhomestaysafe
|
||||
coronaupdate
|
||||
frontlineheroes
|
||||
coronawarriors
|
||||
homeschool
|
||||
homeschooling
|
||||
hometasking
|
||||
masks4all
|
||||
wfh
|
||||
wash ur hands
|
||||
wash your hands
|
||||
washurhands
|
||||
washyourhands
|
||||
stayathome
|
||||
stayhome
|
||||
selfisolating
|
||||
self isolating
|
20
data/IN/own_keywords.txt
Normal file
@ -0,0 +1,20 @@
|
||||
plandemic
|
||||
scamdemic
|
||||
wuhan flu
|
||||
wuhanflu
|
||||
corona
|
||||
coronavirusoutbreak
|
||||
pandemic
|
||||
epidemic
|
||||
vax
|
||||
antivax
|
||||
antivaxxers
|
||||
wearamask
|
||||
masksoff
|
||||
cdc
|
||||
ncov
|
||||
sars-cov-2
|
||||
socialdistancing
|
||||
wear a mask
|
||||
lockdown
|
||||
covd
|
50
data/IN/pretest-tweets_fake.txt
Normal file
@ -0,0 +1,50 @@
|
||||
1486474031419297799
|
||||
1504880316506263552
|
||||
1264663210197745665
|
||||
1479500294887256069
|
||||
1320058585590734852
|
||||
1539003407096336388
|
||||
1481704942574395392
|
||||
1572014646374154240
|
||||
1524764580806811649
|
||||
1592940763515858944
|
||||
1554529221594292224
|
||||
1479488991347023876
|
||||
1481715928492609541
|
||||
1476722414100914179
|
||||
1478478958740086790
|
||||
1459285859358982148
|
||||
1475620600228028432
|
||||
1479459200229117955
|
||||
1448386057339297797
|
||||
1468993886316077063
|
||||
1448369102318362625
|
||||
1444354461799956482
|
||||
1431340411193331715
|
||||
1583474056011010048
|
||||
1450479481278406658
|
||||
1396992539010469894
|
||||
1396992534623174658
|
||||
1417920232333656076
|
||||
1439553348122861568
|
||||
1598398871990079489
|
||||
1502768541979881479
|
||||
1337604370981134336
|
||||
1417797808707473410
|
||||
1601693432292192256
|
||||
1598145048989704192
|
||||
1599906362380591110
|
||||
1325851780496961538
|
||||
1468908159330885632
|
||||
1468332389923311616
|
||||
1339703372505624577
|
||||
1468633243654451200
|
||||
1488290848907444240
|
||||
1491146722625880064
|
||||
1481766558313730053
|
||||
1503078235373985795
|
||||
1485398845718773762
|
||||
1371501907483754497
|
||||
1494398809245376513
|
||||
1436328255959801865
|
||||
1482862501461209089
|
50
data/IN/pretest-tweets_not_fake.txt
Normal file
@ -0,0 +1,50 @@
|
||||
1258402212327436288
|
||||
1489758168750174209
|
||||
1303698927766646785
|
||||
1257681474670809090
|
||||
1340109389672411136
|
||||
1303698924444803072
|
||||
1303698926902665218
|
||||
1337595387796983809
|
||||
1344441446515019777
|
||||
1385680800218324992
|
||||
1590129838261956608
|
||||
1303698928609697796
|
||||
1348715183502454793
|
||||
1340418291274289153
|
||||
1421228572732280835
|
||||
1456349962942533637
|
||||
1603457599877308416
|
||||
1278354646885687296
|
||||
1340418294579421188
|
||||
1365866032792039425
|
||||
1472722005657112578
|
||||
1381021635772350464
|
||||
1337598897217220609
|
||||
1354797645261398016
|
||||
1266806429282963456
|
||||
1429847265242460161
|
||||
1234272677633953792
|
||||
1301581247932772352
|
||||
1424832183148204043
|
||||
1339255967809212416
|
||||
1284831896988454912
|
||||
1463528081214394377
|
||||
1453679912938885122
|
||||
1583474059148337152
|
||||
1519791965113622528
|
||||
1470775155110682628
|
||||
1464615554103357450
|
||||
1337595385565638657
|
||||
1436055743418019840
|
||||
1572208051830104069
|
||||
1433765113891328002
|
||||
1482774656075534336
|
||||
1310288545886736384
|
||||
1353845938566156289
|
||||
1396992537202659329
|
||||
1455712525362810883
|
||||
1340384267327647747
|
||||
1338588364459618305
|
||||
1376696928692412419
|
||||
1340386565399429123
|
@ -1,112 +1,111 @@
|
||||
name,id,state,state_short,party,class,ideology,start_serving,end_serving,time_in_office,not_in_office,last_congress,vote_share,next_closest_share,election_year,twitter_url,twitter_handle,alt_account,alt_handle,date_of_birth,female, ethnicity,edu_level,edu_information,occup_level,website_url,bioguide_link,Comments_1,Comments_2
|
||||
"Alexander, Andrew L., Jr.",1,Tennessee,TN,0,2,0.681815808318192,01/07/2003,01/03/2021,18.0027397260274,1,116,61.9,31.8,2014,https://twitter.com/SenAlexander,SenAlexander,https://twitter.com/LamarAlexander ,LamarAlexander ,07/03/1940,0,White,8,J.D.; New York Univeristy; 1965,2,N/A,https://bioguide.congress.gov/search/bio/A000360,,
|
||||
"Enzi, Mike",2,Wyoming,WY,0,2,0.719285383539398,01/03/1997,01/03/2021,24,1,116,72.3,17.6,2014,https://twitter.com/senatorenzi?lang=zh-Hant ,SenatorEnzi,N/A,N/A,02/01/1944,0,White,7,M.B.A.; Retail Marketing; Denver University; 1968,4,N/A,https://bioguide.congress.gov/search/bio/E000285,,
|
||||
name,id,state,state_short,party,class,ideology,start_serving,end_serving,time_in_office,not_in_office,last_congress,vote_share,next_closest_share,election_year,twitter_url,twitter_handle,alt_account,alt_handle,date_of_birth,female,ethnicity,edu_level,edu_information,occup_level,website_url,bioguide_link,Comments_1,Comments_2
|
||||
"Alexander, Andrew L., Jr.",1,Tennessee,TN,0,2,0.681815808318192,01/07/2003,01/03/2021,18.0027397260274,1,116,61.9,31.8,2014,https://twitter.com/SenAlexander,SenAlexander,https://twitter.com/LamarAlexander,LamarAlexander,07/03/1940,0,White,8,J.D.; New York Univeristy; 1965,2,N/A,https://bioguide.congress.gov/search/bio/A000360,,
|
||||
"Enzi, Mike",2,Wyoming,WY,0,2,0.719285383539398,01/03/1997,01/03/2021,24,1,116,72.3,17.6,2014,https://twitter.com/senatorenzi,senatorenzi,N/A,N/A,02/01/1944,0,White,7,M.B.A.; Retail Marketing; Denver University; 1968,4,N/A,https://bioguide.congress.gov/search/bio/E000285,,
|
||||
"Gardner, Cory",3,Colorado,CO,0,2,0.719285383539398,01/06/2015,01/03/2021,5.9972602739726,1,116,48.5,46,2014,https://twitter.com/CoryGardner,CoryGardner,https://twitter.com/corygardner,corygardner,08/22/1974,0,White,8,"J.D.; University of Colorado, Boulder; 2001",2,N/A,https://bioguide.congress.gov/search/bio/G000562,,
|
||||
"Harris, Kamala",4,California ,CA,1,3,0.0213759569468058,01/03/2017,01/18/2021,4.04383561643836,1,116,62.4,37.6,2016,https://twitter.com/VP,VP,https://twitter.com/KamalaHarris,KamalaHarris,10/20/1964,1,African-American; Asian-American,8,J.D.; University of California; 1989,2,N/A,https://bioguide.congress.gov/search/bio/H001075,(became VP on jan 20 2021),
|
||||
"Isakson, John",5,Georgia,GA,0,3,*,01/03/2005,12/31/2019,14,1,116,55,40.8,2016,https://twitter.com/SenatorIsakson ,SenatorIsakson,N/A,N/A,12/28/1944,0,White,6,"University of Georgia, Athens; 1966",1,N/A,https://bioguide.congress.gov/search/bio/I000055,(died in 2019),
|
||||
"Jones, Gordon Douglas",6,Alabama,AL,1,2,0.632885678298333,01/03/2018,01/03/2021,3.0027397260274,1,116,49.9,48.4,2017,https://twitter.com/DougJones,DougJones,N/A,N/A,05/04/1954,0,White,8,"J.D.; Samford University, Cumberland School of Law; 1979",2,N/A,https://bioguide.congress.gov/search/bio/J000300/,special election to replace Jeff Sessions,
|
||||
"Loeffler, Kelly",7,Georgia,GA,0,2,0.904293903291947,01/06/2020,01/20/2021,1.04109589041096,1,116,N/A,N/A,*,https://twitter.com/KLoeffler,KLoeffler,https://twitter.com/senatorloeffler ,senatorloeffler ,11/27/1970,1,White,7,M.B.A.; Internationla Finance and Marketing; DePaul University Chicago; 1999,1,N/A,https://bioguide.congress.gov/search/bio/L000594,Appointed in 2019 after the resignation of Johnny Isakson but lost the 2020 election,
|
||||
"McSally, Martha",8,Arizona,AZ,0,2,*,01/03/2015,01/03/2019,1,1,116,N/A,N/A,*,https://twitter.com/MarthaMcSallyAZ,MarthaMcSallyAZ,https://twitter.com/marthamcsally,marthamcsally,03/22/1966,1,White,7,M.P.P.; John F. Kennedy School of Government,3,N/A,https://bioguide.congress.gov/search/bio/M001197,(left office Dec 2 2020),appointed in 2018 after death of John McCain but lot 2020 election
|
||||
"Perdue, David",9,Georgia,GA,0,2,0.914979462126755,01/06/2015,01/03/2021,5.9972602739726,1,116,53,45.1,2014,https://twitter.com/DavidPerdueGA,DavidPerdueGA,https://twitter.com/sendavidperdue,sendavidperdue,12/10/1949,0,White,7,M.S.; Georgia Institute of Technology; 1976,1,N/A,https://bioguide.congress.gov/search/bio/P000612,,
|
||||
"Roberts, Charles Patrick",10,Kansas,KS,0,2,0.822995787870405,01/07/1997,01/03/2021,24.0054794520548,1,116,53.3,42.5,2014,https://twitter.com/SenPatRoberts,SenPatRoberts,https://twitter.com/PatRoberts,PatRoberts,04/20/1936,0,White,6,"B.A.; Kansas State university, Manhattan; 1958",7,N/A,https://bioguide.congress.gov/search/bio/R000307,,
|
||||
"Udall, Tom",11,New Mexico,NM,1,2,0.259828450248573,01/06/2009,01/03/2021,12,1,116,55.4,44.6,2014,https://twitter.com/SenatorTomUdall,SenatorTomUdall,https://twitter.com/tomudall,tomudall,05/18/1948,0,White,8,"J.D.; University of New Mexico School of Law, Albuquerque, N.M.; 1977",2,N/A,https://bioguide.congress.gov/search/bio/U000039,,
|
||||
"Baldwin, Tammy",12,Wisconsin,WI,1,1,0.176999238019796,01/03/2013,12/31/2022,9.9972602739726,0,117,55.4,44.6,2018,https://twitter.com/SenatorBaldwin,SenatorBaldwin,https://twitter.com/tammybaldwin,tammybaldwin,02/11/1962,1,White,8,"J.D.; University of Wisconsin, Madison; 1989",2,https://www.baldwin.senate.gov/,https://bioguide.congress.gov/search/bio/B001230,,
|
||||
"Barrasso, John",13,Wyoming,WY,0,1,0.817902617377421,06/22/2007,12/31/2022,15.5369863013699,0,117,67.1,30.1,2018,https://twitter.com/SenJohnBarrasso,SenJohnBarrasso,https://twitter.com/barrassoforwyo,barrassoforwyo,07/21/1952,0,White,7,M.D.; Georgetown University School of Medicine; 1978,6,https://www.barrasso.senate.gov/,https://bioguide.congress.gov/search/bio/B001261,,
|
||||
"Bennet, Michael F.",14,Colorado,CO,1,3,0.248044568735702,01/21/2009,12/31/2022,13.9506849315069,0,117,49.1,45.4,2016,https://twitter.com/SenatorBennet,SenatorBennet,https://twitter.com/michaelbennet,michaelbennet,11/28/1964,0,White,8,J.D.; Yale Law School; 1993,2,https://www.bennet.senate.gov/,https://bioguide.congress.gov/search/bio/B001267,,
|
||||
"Blackburn, Marsha",15,Tennessee,TN,0,1,0.93228239890635,01/03/2019,12/31/2022,3.99452054794521,0,117,54.7,43.9,2018,https://twitter.com/MarshaBlackburn,MarshaBlackburn,N/A,N/A,06/06/1952,1,White,6,"B.S.; Home Economics; Mississippi State University, Starkville; 1973",1,https://www.blackburn.senate.gov/,https://bioguide.congress.gov/search/bio/B001243,,
|
||||
"Blumenthal, Richard",16,Connecticut,CT,1,3,0.0310655954121906,01/03/2010,12/31/2022,13,0,117,62.9,34.9,2016,https://twitter.com/SenBlumenthal,SenBlumenthal,N/A,N/A,02/13/1946,0,White,8,J.D.; Yale University; 1973,2,https://www.blumenthal.senate.gov/,https://bioguide.congress.gov/search/bio/B001277,,
|
||||
"Blunt, Roy",17,Missouri,MO,0,3,0.584409139223541,01/03/2011,12/31/2022,12,1,117,49.4,46.2,2016,https://twitter.com/RoyBlunt,RoyBlunt,N/A,N/A,01/10/1950,0,White,7,"M.A.; Missouri State University ,Springfield; 1972",5,N/A,https://bioguide.congress.gov/search/bio/B000575,,
|
||||
"Booker, Cory A.",18,New Jersey,NJ,1,2,0.0455802980872292,10/31/2013,12/31/2022,12,0,117,57.2,40.9,2020,https://twitter.com/senbooker,senbooker,https://twitter.com/CoryBooker,CoryBooker,04/27/1969,0,African-American; Asian-American,8,J.D.; Yale Law School; 1997,2,https://www.booker.senate.gov/,https://bioguide.congress.gov/search/bio/B001288,,
|
||||
"Boozman, John",19,Arkansas,AR,0,3,0.768699282926499,01/05/2011,12/31/2022,11.9945205479452,0,117,59.8,36.2,2016,https://twitter.com/JohnBoozman,JohnBoozman,N/A,N/A,12/10/1950,0,White,6,Southern College of Optometry; 1977,6,https://www.boozman.senate.gov/,https://bioguide.congress.gov/search/bio/B001236,,
|
||||
"Braun, Michael",20,Indiana,IN,0,1,0.98106874319906,01/03/2019,12/31/2022,3.99452054794521,0,117,50.9,45,2018,https://twitter.com/SenatorBraun,SenatorBraun,N/A,N/A,03/24/1954,0,White,7,M.B.A.; Harvard Business School; 1978,1,https://www.braun.senate.gov/,https://bioguide.congress.gov/search/bio/B001310,,
|
||||
"Brown, Sherrod",21,Ohio,OH,1,1,0.0923940264109351,01/04/2007,12/31/2022,16,0,117,53.4,46.6,2018,https://twitter.com/SenSherrodBrown,SenSherrodBrown,https://twitter.com/SherrodBrown,SherrodBrown,11/09/1952,0,White,7,M.a.; Education; Ohio State University; 1981,5,https://www.brown.senate.gov/,https://bioguide.congress.gov/search/bio/B000944,,
|
||||
"Burr, Richard",22,North Carolina,NC,0,3,0.605472891780936,01/03/2001,12/31/2022,22.0054794520548,1,117,51.1,45.3,2016,https://twitter.com/SenatorBurr,SenatorBurr,N/A,N/A,11/30/1955,0,White,6,B.A.; Communications; Wake Forest University; 1978,1,N/A,https://bioguide.congress.gov/search/bio/B001135,,
|
||||
"Cantwell, Maria",23,Washington,WA,1,1,0.216591445478212,01/03/2001,12/31/2022,22.0054794520548,0,117,58.4,41.6,2018,https://twitter.com/SenatorCantwell,SenatorCantwell,N/A,N/A,10/13/1958,1,White,6,B.A.; Public Administration; Miami University of Ohio; 1980,1,https://www.cantwell.senate.gov/,https://bioguide.congress.gov/search/bio/C000127,,
|
||||
"Capito, Shelley Moore",24,West Virginia,WV,0,2,0.61478303011512,01/06/2015,12/31/2022,7.98904109589041,0,117,70.3,27,2020,https://twitter.com/SenCapito,SenCapito,N/A,N/A,11/26/1953,1,White,7,M. Ed.; University of Virginia; 1976,5,https://www.capito.senate.gov/,https://bioguide.congress.gov/search/bio/C001047,,
|
||||
"Cardin, Benjamin L.",25,Maryland,MD,1,1,0.1994990268606,01/04/2007,12/31/2022,16,0,117,64.9,30.3,2018,https://twitter.com/SenatorCardin,SenatorCardin,N/A,N/A,10/05/1943,0,White,8,J.D.; University of Maryland; 1967,2,https://www.cardin.senate.gov/,https://bioguide.congress.gov/search/bio/C000141,,
|
||||
"Carper, Thomas R.",26,Delaware,DE,1,1,0.309479384969288,01/03/2001,12/31/2022,22.0054794520548,0,117,60,37.8,2018,https://twitter.com/SenatorCarper,SenatorCarper,N/A,N/A,01/23/1947,0,White,7,M.B.A.; University of Delaware; 1975,3,https://www.carper.senate.gov/,https://bioguide.congress.gov/search/bio/C000174,,
|
||||
"Casey, Robert P., Jr.",27,Pennsylvania,PA,1,1,0.171897216341815,01/04/2007,12/31/2022,16,0,117,55.7,42.6,2018,https://twitter.com/SenBobCasey,SenBobCasey,https://twitter.com/Bob_Casey,Bob_Casey,04/13/1960,0,White,8,J.D.; Catholic University of America; 1988,2,https://www.casey.senate.gov/,https://bioguide.congress.gov/search/bio/C001070,,
|
||||
"Cassidy, Bill",28,Louisiana,LA,0,2,0.682348710788942,01/06/2015,12/31/2022,7.98904109589041,0,117,59.3,19,2020,https://twitter.com/SenBillCassidy,SenBillCassidy,https://twitter.com/BillCassidy,BillCassidy,09/28/1957,0,White,7,M.D.; Louisiana State University; 1979,6,https://www.cassidy.senate.gov/,https://bioguide.congress.gov/search/bio/C001075,,
|
||||
"Collins, Susan M.",29,Maine,ME,0,2,0.448622425849401,01/07/1997,12/31/2022,25.9972602739726,0,117,51,42.4,2020,https://twitter.com/SenatorCollins,SenatorCollins,N/A,N/A,12/07/1952,1,White,6,Bachelor in Government; St. Lawrence University; 1975,0,https://www.collins.senate.gov/,https://bioguide.congress.gov/search/bio/C001035,,
|
||||
"Coons, Christopher A.",30,Delaware,DE,1,2,0.338422715351401,11/15/2010,12/31/2022,12.1342465753425,0,117,59.4,37.9,2020,https://twitter.com/ChrisCoons,ChrisCoons,N/A,N/A,09/09/1963,0,White,8,J.D.; Yale Law School; 1992,2,https://www.coons.senate.gov/,https://bioguide.congress.gov/search/bio/C001088,,
|
||||
"Cornyn, John",31,Texas,TX,0,2,0.772226738391321,11/30/2002,12/31/2022,20.0986301369863,0,117,53.5,43.9,2020,https://twitter.com/JohnCornyn,JohnCornyn,N/A,N/A,02/02/1952,0,White,8,J.D.; St. Mary<72>s School of Law; 1977,2,https://www.cornyn.senate.gov/,https://bioguide.congress.gov/search/bio/C001056,,
|
||||
"Cortez Masto, Catherine",32,Nevada,NV,1,3,0.236574567369409,01/03/2017,12/31/2022,5.99452054794521,0,117,47.1,44.7,2016,https://twitter.com/SenCortezMasto,SenCortezMasto,https://twitter.com/CortezMasto,CortezMasto,03/29/1964,1,Hispanic; White,8,J.D.; Gonzaga University School of Law; 1990,2,https://www.cortezmasto.senate.gov/,https://bioguide.congress.gov/search/bio/C001113,,
|
||||
"Cotton, Tom",33,Arkansas,AR,0,2,0.876390364042756,01/06/2015,12/31/2022,7.98904109589041,0,117,66.5,33.5,2020,https://twitter.com/SenTomCotton,SenTomCotton,https://twitter.com/TomCottonAR,TomCottonAR,05/13/1977,0,White,8,J.D.; Harvard University; 2002,2,https://www.cotton.senate.gov/,https://bioguide.congress.gov/search/bio/C001095,,
|
||||
"Cramer, Kevin",34,North Dakota,ND,0,1,0.910896298032277,01/03/2019,12/31/2022,3.99452054794521,0,117,55.5,44.5,2018,https://twitter.com/SenKevinCramer,SenKevinCramer,https://twitter.com/kevincramer,kevincramer,01/21/1961,0,White,7,M.A.; Management; University o fMary; 2003,0,https://www.cramer.senate.gov/,https://bioguide.congress.gov/search/bio/C001096,,
|
||||
"Crapo, Michael",35,Idaho,ID,0,3,0.823331951918519,01/06/1999,12/31/2022,24,0,117,66.1,27.8,2016,https://twitter.com/MikeCrapo,MikeCrapo,N/A,N/A,05/20/1951,0,White,8,J.D.; Harvard University; 1977,2,https://www.crapo.senate.gov/,https://bioguide.congress.gov/search/bio/C000880,,
|
||||
"Cruz, Ted",36,Texas,TX,0,1,0.944056385174951,01/03/2013,12/31/2022,9.9972602739726,0,117,50.9,48.3,2018,https://twitter.com/SenTedCruz,SenTedCruz,https://twitter.com/tedcruz,tedcruz,12/22/1970,0,Hispanic; White,8,J.D.; Harvard University; 1995,2,https://www.cruz.senate.gov/,https://bioguide.congress.gov/search/bio/C001098,,
|
||||
"Daines, Steve",37,Montana,MT,0,2,0.859322244752884,01/06/2015,12/31/2022,7.98904109589041,0,117,55,45,2020,https://twitter.com/SteveDaines,SteveDaines,N/A,N/A,08/20/1962,0,White,6,B.S.; Chemical Engineering; Montana State University; 1984,1,https://www.daines.senate.gov/,https://bioguide.congress.gov/search/bio/D000618,,
|
||||
"Duckworth, Tammy",38,Illinois,IL,1,3,0.0944404184553066,01/03/2017,12/31/2022,5.99452054794521,0,117,54.4,40.2,2016,https://twitter.com/SenDuckworth,SenDuckworth,https://twitter.com/tammyduckworth,tammyduckworth,03/12/1968,1,Asian; White,8,PhD in human services; Capella University School of Public Service Leadership; 2015,3,https://www.duckworth.senate.gov/,https://bioguide.congress.gov/search/bio/D000622,,
|
||||
"Durbin, Richard J.",39,Illinois,IL,1,2,0.0855733771029607,01/07/1997,12/31/2022,25.9972602739726,0,117,54.9,38.9,2020,https://twitter.com/SenatorDurbin,SenatorDurbin,https://twitter.com/DickDurbin,DickDurbin,11/21/1944,0,White,8,J.D.; Georgetown University; 1969,2,https://www.durbin.senate.gov/,https://bioguide.congress.gov/search/bio/D000563,,
|
||||
"Ernst, Joni",40,Iowa,IA,0,2,0.826265400967212,01/06/2015,12/31/2022,7.98904109589041,0,117,51.8,45.2,2020,https://twitter.com/SenJoniErnst,SenJoniErnst,https://twitter.com/joniernst,joniernst,07/01/1970,1,White,7,M.P.A.; Columbus State University; 1995,3,https://www.ernst.senate.gov/,https://bioguide.congress.gov/search/bio/E000295,,
|
||||
"Feinstein, Dianne",41,California,CA,1,1,0.150865658191444,11/10/1992,12/31/2022,30.158904109589,0,117,54.2,45.8,2018,https://twitter.com/SenFeinstein,SenFeinstein,https://twitter.com/DianneFeinstein,DianneFeinstein,06/22/1933,1,White,6,B.A.; History; Stanford University; 1955,0,https://www.feinstein.senate.gov/public/,https://bioguide.congress.gov/search/bio/F000062,,
|
||||
"Fischer, Debra",42,Nebraska,NE,0,1,0.688576408222131,01/03/2013,12/31/2022,9.9972602739726,0,117,57.7,38.6,2018,https://twitter.com/SenatorFischer,SenatorFischer,N/A,N/A,03/01/1951,1,White,6,B.S.; Education; University of Nebraska; 1988,0,https://www.fischer.senate.gov/,https://bioguide.congress.gov/search/bio/F000463,,
|
||||
"Gillibrand, Kirsten E.",43,New York,NY,1,1,0.12072202063417,01/27/2009,12/31/2022,13.9342465753425,0,117,67,33,2018,https://twitter.com/SenGillibrand,SenGillibrand,https://twitter.com/gillibrandny,gillibrandny,12/09/1966,1,White,8,J.D.; University of California; 1991,2,https://www.gillibrand.senate.gov/,https://bioguide.congress.gov/search/bio/G000555,,
|
||||
"Graham, Lindsey",44,South Carolina,SC,0,2,0.619070797359753,01/07/2003,12/31/2022,19.9945205479452,0,117,54.5,44.2,2020,https://twitter.com/LindseyGrahamSC,LindseyGrahamSC,https://twitter.com/grahamblog,grahamblog,07/09/1955,0,White,8,J.D.; University of South Carolina; 1981,2,https://www.lgraham.senate.gov/,https://bioguide.congress.gov/search/bio/G000359 ,,
|
||||
"Grassley, Chuck",45,Iowa,IA,0,3,0.670073592619545,01/05/1981,12/31/2022,42.013698630137,0,117,60.2,35.7,2016,https://twitter.com/ChuckGrassley,ChuckGrassley,N/A,N/A,09/17/1933,0,White,7,M.A.; Political Science; University of Northern Iowa; 1956,0,https://www.grassley.senate.gov/,https://bioguide.congress.gov/search/bio/G000386,,
|
||||
"Hagerty, Bill",46,Tennessee,TN,0,2,0.857410027434407,01/03/2021,12/31/2022,1.99178082191781,0,117,62.2,35.2,2020,https://twitter.com/SenatorHagerty,SenatorHagerty,https://twitter.com/billhagertytn,billhagertytn,08/14/1959,0,White,8,J.D.; Vanderbilt Law School; 1984,0,https://www.hagerty.senate.gov/,https://bioguide.congress.gov/search/bio/H000601,,
|
||||
"Hassan, Margaret Wood",47,New Hampshire,NH,1,3,0.43611907238278,01/03/2017,12/31/2022,5.99452054794521,0,117,48,47.9,2016,https://twitter.com/SenatorHassan,SenatorHassan,https://twitter.com/Maggie_Hassan,Maggie_Hassan,02/27/1958,1,White,8,J.D.; Northeastern University School of law; 1985,11,https://www.hassan.senate.gov/,https://bioguide.congress.gov/search/bio/H001076,,
|
||||
"Hawley, Josh",48,Missouri,MO,0,1,0.864366195602263,01/03/2019,12/31/2022,3.99452054794521,0,117,51.4,45.6,2018,https://twitter.com/HawleyMO,HawleyMO,N/A,N/A,12/31/1979,0,White,8,J.D.; Yale Law School; 2006,2,https://www.hawley.senate.gov/,https://bioguide.congress.gov/search/bio/H001089,,
|
||||
"Heinrich, Martin",49,New Mexico,NM,1,1,0.2007037353465,01/03/2013,12/31/2022,9.9972602739726,0,117,54.1,30.5,2018,https://twitter.com/MartinHeinrich,MartinHeinrich,N/A,N/A,10/17/1971,0,White,6,B.S.; Mechanical Engineering; University of Missouri; 1995,12,https://www.heinrich.senate.gov/,https://bioguide.congress.gov/search/bio/H001046,,
|
||||
"Hickenlooper, John W.",50,Colorado,CO,1,2,0.335030323955882,01/03/2021,12/31/2022,1.99178082191781,0,117,53.5,44.2,2020,https://twitter.com/SenatorHick,SenatorHick,https://twitter.com/hickenlooper,hickenlooper,02/07/1952,0,White,7,M.A.; Geology; Wesleyan University; 1980,0,https://www.hickenlooper.senate.gov/,https://bioguide.congress.gov/search/bio/H000273,,
|
||||
"Hirono, Mazie K.",51,Hawaii,HI,1,1,0.0715447123166643,01/03/2013,12/31/2022,9.9972602739726,0,117,71.2,28.8,2018,https://twitter.com/maziehirono,maziehirono,https://twitter.com/mazieforhawaii,mazieforhawaii,11/03/1947,1,Asian,8,J.D.; Georgetown University; 1978,0,https://www.hirono.senate.gov/,https://bioguide.congress.gov/search/bio/H001042,,
|
||||
"Hoeven, John",52,North Dakota,ND,0,3,0.815683863264003,01/05/2011,12/31/2022,11.9945205479452,0,117,78.6,17,2016,https://twitter.com/SenJohnHoeven,SenJohnHoeven,N/A,N/A,03/13/1957,0,White,7,M.B.A.; Northwestern University; 1981,12,https://www.hoeven.senate.gov/,https://bioguide.congress.gov/search/bio/H001061,,
|
||||
"Hyde-Smith, Cindy",53,Mississippi,MS,0,2,0.868059764299163,04/09/2018,12/31/2022,4.73150684931507,0,117,54.1,44.1,2020,https://twitter.com/SenHydeSmith,SenHydeSmith,https://twitter.com/cindyhydesmith,cindyhydesmith,05/10/1959,1,White,6,"B.A.; Criminal justice, political science; University of Southern Mississippi; 1981",0,https://www.hydesmith.senate.gov/,https://bioguide.congress.gov/search/bio/H001079 ,,
|
||||
"Inhofe, James",54,Oklahoma,OK,0,2,0.880238318204784,11/17/1994,12/31/2022,28.1397260273973,1,117,62.9,32.8,2020,https://twitter.com/JimInhofe,JimInhofe,N/A,N/A,11/17/1934,0,White,6,B.A.; Economics; University of Tulsa; 1973,0,N/A,https://bioguide.congress.gov/search/bio/I000024 ,,
|
||||
"Johnson, Ron",55,Wisconsin,WI,0,3,0.743401705863958,01/05/2011,12/31/2022,11.9945205479452,0,117,50.2,46.8,2016,https://twitter.com/SenRonJohnson,SenRonJohnson,https://twitter.com/ronjohnsonwi,ronjohnsonwi,04/08/1955,0,White,6,B.S.; Business and Accounting; University of Minnesota; 1977,4,https://www.ronjohnson.senate.gov/,https://bioguide.congress.gov/search/bio/J000293,,
|
||||
"Kaine, Tim",56,Virginia,VA,1,1,0.203600708089391,01/03/2013,12/31/2022,9.9972602739726,0,117,57.1,41.1,2018,https://twitter.com/timkaine,timkaine,N/A,N/A,02/26/1958,0,White,8,J.D.; Harvard University; 1983,11,https://www.kaine.senate.gov/,https://bioguide.congress.gov/search/bio/K000384,,
|
||||
"Kelly, Mark",57,Arizona,AZ,1,3,0.399793347847799,12/02/2020,12/31/2022,2.07945205479452,0,117,51.2,48.8,2020,https://twitter.com/SenMarkKelly,SenMarkKelly,https://twitter.com/CaptMarkKelly,CaptMarkKelly,02/21/1964,0,White,7,M.S.; Aeronautical Engineering; U.S. Naval Postgraduate School,3,https://www.kelly.senate.gov/,https://bioguide.congress.gov/search/bio/K000377,,
|
||||
"Kennedy, John Neely",58,Louisiana,LA,0,3,0.785684351248518,01/03/2017,12/31/2022,5.99452054794521,0,117,60.7,39.3,2016,https://twitter.com/SenJohnKennedy,SenJohnKennedy,https://twitter.com/JohnKennedyLA,JohnKennedyLA,11/21/1951,0,White,8,J.D.; University of Virginia School of LAw; 1977,11,https://www.kennedy.senate.gov/,https://bioguide.congress.gov/search/bio/K000393,,
|
||||
"King, Angus S., Jr.",59,Maine,ME,2,1,0.346033257048853,01/03/2013,12/31/2022,9.9972602739726,0,117,54.3,35.2,2018,https://twitter.com/SenAngusKing,SenAngusKing,N/A,N/A,03/31/1944,0,White,8,J.D.; University of Virginia; 1969,2,https://www.king.senate.gov/,https://bioguide.congress.gov/search/bio/K000383 ,,
|
||||
"Klobuchar, Amy",60,Minnesota,MN,1,1,0.130504324943533,01/04/2007,12/31/2022,16,0,117,60.3,36.2,2018,https://twitter.com/SenAmyKlobuchar,SenAmyKlobuchar,https://twitter.com/amyklobuchar,amyklobuchar,05/25/1960,1,White,8,"J.D.; University of Chicago, 1985",2,https://www.klobuchar.senate.gov/,https://bioguide.congress.gov/search/bio/K000367 ,,
|
||||
"Lankford, James",61,Oklahoma,OK,0,3,0.89992933687588,01/03/2015,12/31/2022,7.9972602739726,0,117,67.7,24.6,2016,https://twitter.com/SenatorLankford,SenatorLankford,https://twitter.com/jameslankford,jameslankford,03/04/1968,0,White,7,M.Div.; Southwestern Theological Baptist Seminary; 1994,5,https://www.lankford.senate.gov/,https://bioguide.congress.gov/search/bio/L000575,,
|
||||
"Leahy, Patrick",62,Vermont,VT,1,3,0.144121081911654,01/14/1975,12/31/2022,47.9945205479452,1,117,61.3,33,2016,https://twitter.com/SenatorLeahy,SenatorLeahy,N/A,N/A,03/31/1940,0,White,8,J.D.; Georgetown University; 1964,2,N/A,https://bioguide.congress.gov/search/bio/L000174,,
|
||||
"Lee, Mike",63,Utah,UT,0,3,0.753748787807473,01/05/2011,12/31/2022,11.9945205479452,0,117,68,27.4,2016,https://twitter.com/SenMikeLee,SenMikeLee,https://twitter.com/BasedMikeLee,BasedMikeLee,06/04/1971,0,White,8,J.D.; Brigham Young university; 1997,2,https://www.lee.senate.gov/,https://bioguide.congress.gov/search/bio/L000577,,
|
||||
"Luj<EFBFBD>n, Ben Ray",64,New Mexico,NM,1,2,0.174860888138848,01/03/2021,12/31/2022,1.99178082191781,0,117,51.7,45.6,2020,https://twitter.com/SenatorLujan,SenatorLujan,https://twitter.com/benraylujan,benraylujan,06/07/1972,0,Hispanic,6,B.B.A.; New Mexico Highlands University; 2007,0,https://www.lujan.senate.gov/,https://bioguide.congress.gov/search/bio/L000570 ,,
|
||||
"Lummis, Cynthia M.",65,Wyoming,WY,0,2,0.893292958108508,01/03/2021,12/31/2022,1.99178082191781,0,117,73.1,26.9,2020,https://twitter.com/SenLummis,SenLummis,https://twitter.com/CynthiaMLummis,CynthiaMLummis,09/10/1954,1,White,8,"J.D.; University of Wyoming College of Law, Laramie, Wyo.; 1985",11,https://www.lummis.senate.gov/,https://bioguide.congress.gov/search/bio/L000571 ,,
|
||||
"Manchin, Joe, III",66,West Virginia,WV,1,1,0.446686774398077,11/15/2010,12/31/2022,12.1342465753425,0,117,49.6,46.3,2018,https://twitter.com/Sen_JoeManchin,Sen_JoeManchin,https://twitter.com/JoeManchinWV,JoeManchinWV,08/24/1947,0,White,6,B.A.; Business Administration; West Virginia University; 1970,12,https://www.manchin.senate.gov/,https://bioguide.congress.gov/search/bio/M001183 ,,
|
||||
"Markey, Edward J.",67,Massachusetts,MA,1,2,0.0139659683705929,07/16/2013,12/31/2022,9.46575342465753,0,117,66.2,33,2020,https://twitter.com/SenMarkey,SenMarkey,https://twitter.com/edmarkey,edmarkey,07/11/1946,0,White,8,J.D.; Boston College Law School; 1972,11,https://www.markey.senate.gov/,https://bioguide.congress.gov/search/bio/M000133,,
|
||||
"Marshall, Roger",68,Kansas,KS,0,2,0.882124792228652,01/03/2021,12/31/2022,1.99178082191781,0,117,53.2,41.8,2020,https://twitter.com/SenatorMarshall,SenatorMarshall,https://twitter.com/RogerMarshallMD,RogerMarshallMD,08/09/1960,0,White,7,M.D.; University of Kansas School of Medicine; 1987,6,https://www.marshall.senate.gov/,https://bioguide.congress.gov/search/bio/M001198,,
|
||||
"McConnell, Mitch",69,Kentucky,KY,0,2,0.599687533584357,01/03/1985,12/31/2022,38.0164383561644,0,117,57.8,38.2,2020,https://twitter.com/LeaderMcConnell,LeaderMcConnell,N/A,N/A,02/20/1942,0,White,8,J.D.; Kentucky Law School; 1967,11,https://www.mcconnell.senate.gov/,https://bioguide.congress.gov/search/bio/M000355,,
|
||||
"Menendez, Robert",70,New Jersey,NJ,1,1,0.191515157461704,01/18/2006,12/31/2022,16.9616438356164,0,117,54,42.8,2018,https://twitter.com/SenatorMenendez,SenatorMenendez,N/A,N/A,01/01/1954,0,Hispanic,8,J.D.; Rutgers university of Law; 1979,11,https://www.menendez.senate.gov/,https://bioguide.congress.gov/search/bio/M000639,,
|
||||
"Merkley, Jeff",71,Oregon,OR,1,2,0.0355414098997263,01/06/2009,12/31/2022,13.9917808219178,0,117,56.9,39.3,2020,https://twitter.com/SenJeffMerkley,SenJeffMerkley,https://twitter.com/jeffmerkley,jeffmerkley,10/24/1956,0,White,7,M.P.A.; Princeton University; 1982,0,https://www.merkley.senate.gov/,https://bioguide.congress.gov/search/bio/M001176,,
|
||||
"Moran, Jerry",72,Kansas,KS,0,3,0.716270292467902,01/05/2011,12/31/2022,11.9945205479452,0,117,62.4,32.1,2016,https://twitter.com/JerryMoran,JerryMoran,N/A,N/A,05/29/1954,0,White,8,J.D.; Kansas University School of Law; 1981,11,https://www.moran.senate.gov/public/,https://bioguide.congress.gov/search/bio/M000934 ,,
|
||||
"Murkowski, Lisa",73,Alaska,AK,0,3,0.473296745648617,12/20/2002,12/31/2022,20.0438356164384,0,117,44.3,29.5,2016,https://twitter.com/lisamurkowski,lisamurkowski,https://twitter.com/lisaforsenate,lisaforsenate,05/22/1957,1,White,8,J.D.; Willamette College of Law; 1985,2,https://www.murkowski.senate.gov/,https://bioguide.congress.gov/search/bio/M001153,,
|
||||
"Murphy, Christopher",74,Connecticut,CT,1,1,0.152635018959264,01/03/2013,12/31/2022,9.9972602739726,0,117,59.5,39.4,2018,https://twitter.com/ChrisMurphyCT,ChrisMurphyCT,N/A,N/A,08/03/1973,0,White,8,J.D.; University of Connecticut; 2002,11,https://www.murphy.senate.gov/,https://bioguide.congress.gov/search/bio/M001169,,
|
||||
"Murray, Patty",75,Washington,WA,1,3,0.142703588817088,01/05/1993,12/31/2022,30.0054794520548,0,117,59.1,40.9,2016,https://twitter.com/PattyMurray,PattyMurray,https://twitter.com/murraycampaign,murraycampaign,10/11/1950,1,White,6,B.A.; Physical Education; Washington State University; 1972,5,https://www.murray.senate.gov/,https://bioguide.congress.gov/search/bio/M001111,,
|
||||
"Ossoff, Jon",76,Georgia,GA,1,2,0.303405364928085,01/20/2021,12/31/2022,1.94520547945205,0,117,50.6,49.4,2020,https://twitter.com/SenOssoff,SenOssoff,https://twitter.com/ossoff,ossoff,02/16/1987,0,White,7,M.S.; International Politicla Economy; London School of Economics; 2013,7,https://www.ossoff.senate.gov/,https://bioguide.congress.gov/search/bio/O000174,,
|
||||
"Padilla, Alex",77,California,CA,1,3,0.0200324383981554,01/20/2021,12/31/2022,1.94520547945205,0,117,N/A,N/A,*,https://twitter.com/SenAlexPadilla,SenAlexPadilla,https://twitter.com/AlexPadilla4CA,AlexPadilla4CA,03/22/1973,0,Hispanic,6,B.S.; Mechanical Engineering; MIT; 1994,9,https://www.padilla.senate.gov/,https://bioguide.congress.gov/search/bio/P000145,appointed in 2020 to replace Kamala Harris ,
|
||||
"Paul, Rand",78,Kentucky,KY,0,3,0.684883322748808,01/05/2011,12/31/2022,11.9945205479452,0,117,57.3,42.7,2016,https://twitter.com/senrandpaul,senrandpaul,https://twitter.com/RandPaul,RandPaul,01/07/1963,0,White,7,M.D.; Duke University; 1988,6,https://www.paul.senate.gov/,https://bioguide.congress.gov/search/bio/P000603,,
|
||||
"Peters, Gary C.",79,Michigan,MI,1,2,0.355796587683312,01/06/2015,12/31/2022,7.98904109589041,0,117,49.9,48.2,2020,https://twitter.com/SenGaryPeters,SenGaryPeters,https://twitter.com/garypeters,garypeters,12/01/1958,0,White,8,J.D.; Wayne State University; 1989,2,https://www.peters.senate.gov/,https://bioguide.congress.gov/search/bio/P000595,,
|
||||
"Portman, Robert",80,Ohio,OH,0,3,0.548120690430407,01/05/2011,12/31/2022,11.9945205479452,1,117,58.3,36.9,2016,https://twitter.com/senrobportman,senrobportman,N/A,N/A,12/19/1955,0,White,8,J.D.; University of Michigan; 1985,2,N/A,https://bioguide.congress.gov/search/bio/P000449,,
|
||||
"Reed, John F.",81,Rhode Island,RI,1,2,0.145861826443275,01/07/1997,12/31/2022,25.9972602739726,0,117,66.6,33.4,2020,https://twitter.com/SenJackReed,SenJackReed,N/A,N/A,11/12/1949,0,White,8,J.D.; Harvard University; 1982,2,https://www.reed.senate.gov/,https://bioguide.congress.gov/search/bio/R000122,,
|
||||
"Risch, James E.",82,Idaho,ID,0,2,0.82910906209038,01/06/2009,12/31/2022,13.9917808219178,0,117,62.6,33.2,2020,https://twitter.com/SenatorRisch,SenatorRisch,N/A,N/A,05/03/1943,0,White,8,J.D.; University of Idaho; 1968,2,https://www.risch.senate.gov/,https://bioguide.congress.gov/search/bio/R000584,,
|
||||
"Romney, Mitt",83,Utah,UT,0,1,0.596688837978771,01/03/2019,12/31/2022,3.99452054794521,0,117,62.6,30.9,2018,https://twitter.com/SenatorRomney,SenatorRomney,https://twitter.com/mittromney,mittromney,03/12/1947,0,White,7,M.B.A.; Harvard Business School; 1975,1,https://www.romney.senate.gov/,https://bioguide.congress.gov/search/bio/R000615,,
|
||||
"Rosen, Jacky",84,Nevada,NV,1,1,0.308548351377894,01/03/2019,12/31/2022,3.99452054794521,0,117,50.4,45.4,2018,https://twitter.com/SenJackyRosen,SenJackyRosen,https://twitter.com/RosenforNevada,RosenforNevada,08/02/1957,1,White,6,B.A.; Psychology; University of Minnesota; 1979,1,https://www.rosen.senate.gov/,https://bioguide.congress.gov/search/bio/R000608,,
|
||||
"Rounds, Mike",85,South Dakota,SD,0,2,0.784008560585577,01/06/2015,12/31/2022,7.98904109589041,0,117,65.7,34.3,2020,https://twitter.com/SenatorRounds,SenatorRounds,N/A,N/A,10/24/1954,0,White,6,B.S.; Political Science; South Dakota State University; 1977,1,https://www.rounds.senate.gov/,https://bioguide.congress.gov/search/bio/R000605,,
|
||||
"Rubio, Marco",86,Florida,FL,0,3,0.831181764071725,01/05/2011,12/31/2022,11.9945205479452,0,117,52,44.3,2016,https://twitter.com/senmarcorubio,senmarcorubio,https://twitter.com/marcorubio,marcorubio,05/28/1971,0,Hispanic,8,J.D.; University of Miami; 1996,2,https://www.rubio.senate.gov/,https://bioguide.congress.gov/search/bio/R000595,,
|
||||
"Sanders, Bernard",87,Vermont,VT,2,1,0,01/04/2007,12/31/2022,16,0,117,67.4,27.5,2018,https://twitter.com/SenSanders,SenSanders,https://twitter.com/BernieSanders,BernieSanders,09/08/1941,0,White,6,B.A.; Political Science; University of Chicago; 1964,0,https://www.sanders.senate.gov/,https://bioguide.congress.gov/search/bio/S000033,,
|
||||
"Sasse, Benjamin",88,Nebraska,NE,0,2,0.684229649213868,01/06/2015,12/31/2022,7.98904109589041,1,117,62.7,24.4,2020,https://twitter.com/sensasse,sensasse,https://twitter.com/BenSasse,BenSasse,02/22/1972,0,White,8,PhD in History; Yale University; 2004,5,N/A,https://bioguide.congress.gov/search/bio/S001197,,
|
||||
"Schatz, Brian",89,Hawaii ,HI,1,3,0.213250458593456,12/27/2012,12/31/2022,10.0164383561644,0,117,73.6,22.2,2016,https://twitter.com/brianschatz,brianschatz,https://twitter.com/SenBrianSchatz,SenBrianSchatz,10/20/1972,0,White,6,B.A.; Philosophy; Pomona College; 1994,5,https://www.schatz.senate.gov/,https://bioguide.congress.gov/search/bio/S001194,,
|
||||
"Schumer, Charles E.",90,New York,NY,1,3,0.239789022209428,01/06/1999,12/31/2022,24,0,117,70.4,27.4,2016,https://twitter.com/SenSchumer,SenSchumer,https://twitter.com/chuckschumer,chuckschumer,11/23/1950,0,White,8,J.D.; Harvard University; 1974,2,https://www.schumer.senate.gov/,https://bioguide.congress.gov/search/bio/S000148 ,,
|
||||
"Scott, Rick",91,Florida,FL,0,1,1,01/08/2019,12/31/2022,3.98082191780822,0,117,50.1,49.9,2018,https://twitter.com/SenRickScott,SenRickScott,https://twitter.com/scottforflorida,scottforflorida,12/01/1952,0,White,8,J.D.; Southern Methodist University; 1978,2,https://www.rickscott.senate.gov/,https://bioguide.congress.gov/search/bio/S001217,,
|
||||
"Scott, Tim",92,South Carolina,SC,0,3,0.781356077518849,01/03/2013,12/31/2022,9.9972602739726,0,117,60.6,37,2016,https://twitter.com/SenatorTimScott,SenatorTimScott,https://twitter.com/votetimscott,votetimscott,09/19/1965,0,African-American,6,B.S.; Political Science; Charleston Southern University; 1988 ,1,https://www.scott.senate.gov/,https://bioguide.congress.gov/search/bio/S001184,,
|
||||
"Shaheen, Jeanne",93,New Hampshire,NH,1,2,0.2925665319541,01/06/2009,12/31/2022,13.9917808219178,0,117,56.6,41,2020,https://twitter.com/SenatorShaheen,SenatorShaheen,https://twitter.com/JeanneShaheen,JeanneShaheen,01/28/1947,1,White,7,M.S.S.; University of Mississippi; 1973,5,https://www.shaheen.senate.gov/,https://bioguide.congress.gov/search/bio/S001181,,
|
||||
"Shelby, Richard",94,Alabama,AL,0,3,0.577739000839365,01/06/1987,12/31/2022,36.0082191780822,1,117,64.2,35.8,2016,https://twitter.com/SenShelby,SenShelby,N/A,N/A,05/06/1934,0,White,6,LL.B.; University of Alabama; 1963,2,N/A,https://bioguide.congress.gov/search/bio/S000320,,
|
||||
"Sinema, Kyrsten",95,Arizona,AZ,2,1,0.500967034663567,01/03/2019,12/31/2022,3.99452054794521,0,117,50,47.6,2018,https://twitter.com/SenatorSinema,SenatorSinema,https://twitter.com/kyrstensinema,kyrstensinema,07/12/1976,1,White,8,PhD in Justice Studies; Arizona State University; 2012,2,https://www.sinema.senate.gov/,https://bioguide.congress.gov/search/bio/S001191,,
|
||||
"Smith, Tina",96,Minnesota,MN,1,2,0.0756533259297989,01/03/2018,12/31/2022,4.99452054794521,0,117,48.8,43.5,2020,https://twitter.com/SenTinaSmith,SenTinaSmith,https://twitter.com/TinaSmithMN,TinaSmithMN,03/04/1958,1,White,7,M.B.A. Dartmouth College; 1984,1,https://www.smith.senate.gov/,https://bioguide.congress.gov/search/bio/S001203,,
|
||||
"Stabenow, Debbie",97,Michigan,MI,1,1,0.221949395648287,01/03/2001,12/31/2022,22.0054794520548,0,117,52.3,45.8,2018,https://twitter.com/SenStabenow,SenStabenow,https://twitter.com/stabenow,stabenow,04/29/1950,1,White,7,M.S.W.; Michigan State University; 1975,5,https://www.stabenow.senate.gov/,https://bioguide.congress.gov/search/bio/S000770,,
|
||||
"Sullivan, Dan",98,Alaska,AK,0,2,0.652100683642255,01/06/2015,12/31/2022,7.98904109589041,0,117,53.9,41.2,2020,https://twitter.com/SenDanSullivan,SenDanSullivan,N/A,N/A,11/13/1964,0,White,8,J.D.; Georgetown University; 1993,2,https://www.sullivan.senate.gov/,https://bioguide.congress.gov/search/bio/S001198,,
|
||||
"Tester, Jon",99,Montana,MT,1,1,0.377646486433112,01/04/2007,12/31/2022,16,0,117,50.3,46.8,2018,https://twitter.com/SenatorTester,SenatorTester,https://twitter.com/jontester,jontester,08/21/1956,0,White,6,B.A.; Music; University of Providence; 1978,10,https://www.tester.senate.gov/,https://bioguide.congress.gov/search/bio/T000464 ,,
|
||||
"Thune, John",100,South Dakota,SD,0,3,0.795060855902239,01/04/2005,12/31/2022,18,0,117,71.8,28.2,2016,https://twitter.com/SenJohnThune,SenJohnThune,https://twitter.com/johnthune,johnthune,01/07/1961,0,White,7,M.B.A.; University of South Dakota; 1984,1,https://www.thune.senate.gov/,https://bioguide.congress.gov/search/bio/T000250 ,,
|
||||
"Tillis, Thom",101,North Carolina,NC,0,2,0.819146177750934,01/06/2015,12/31/2022,7.98904109589041,0,117,48.7,46.9,2020,https://twitter.com/SenThomTillis,SenThomTillis,https://twitter.com/ThomTillis,ThomTillis,08/30/1960,0,White,6,B.S.; Technology Management; University of Maryland; 1996,1,https://www.tillis.senate.gov/,https://bioguide.congress.gov/search/bio/T000476 ,,
|
||||
"Toomey, Patrick",102,Pennsylvania,PA,0,3,0.607637714921737,01/05/2011,12/31/2022,11.9945205479452,1,117,48.9,47.2,2016,https://twitter.com/SenToomey,SenToomey,https://twitter.com/pattoomey,pattoomey,11/17/1961,0,White,6,A.B.; Government; Harvard College; 1984,1,N/A,https://bioguide.congress.gov/search/bio/T000461 ,,
|
||||
"Tuberville, Tommy",103,Alabama,AL,0,2,0.808701355452043,01/03/2021,12/31/2022,1.99178082191781,0,117,60.1,39.7,2020,https://twitter.com/SenTuberville,SenTuberville,https://twitter.com/TTuberville,TTuberville,09/18/1954,0,White,6,"B.S., physical education, Southern Arkansas University, 1976",5,https://www.tuberville.senate.gov/,https://bioguide.congress.gov/search/bio/T000278 ,,
|
||||
"Van Hollen, Chris",104,Maryland,MD,1,3,0.117646768842011,01/03/2017,12/31/2022,5.99452054794521,0,117,60.4,36.4,2016,https://twitter.com/ChrisVanHollen,ChrisVanHollen,N/A,N/A,01/10/1959,0,White,8,J.D.; Georgetown university; 1990,2,https://www.vanhollen.senate.gov/,https://bioguide.congress.gov/search/bio/V000128,,
|
||||
"Warner, Mark R.",105,Virginia,VA,1,2,0.33022168507113,01/06/2009,12/31/2022,13.9917808219178,0,117,56,44,2020,https://twitter.com/SenatorWarner,SenatorWarner,https://twitter.com/MarkWarner,MarkWarner,12/15/1954,0,White,8,J.D.; Harvard Law School; 1980,1,https://www.warner.senate.gov/,https://bioguide.congress.gov/search/bio/W000805 ,,
|
||||
"Warnock, Raphael G.",106,Georgia,GA,1,3,0.464158242867696,01/20/2021,12/31/2022,1.94520547945205,0,117,51,49,2020,https://twitter.com/SenatorWarnock,SenatorWarnock,https://twitter.com/ReverendWarnock,ReverendWarnock,07/23/1969,0,African-American,8,PhD in Philosophy; Union Theological Seminary; ,8,https://www.warnock.senate.gov/,https://bioguide.congress.gov/search/bio/W000790,,
|
||||
"Warren, Elizabeth",107,Massachusetts,MA,1,1,0.0583875007437665,01/03/2013,12/31/2022,9.9972602739726,0,117,60.4,36.2,2018,https://twitter.com/SenWarren,SenWarren,https://twitter.com/ewarren,ewarren,06/22/1949,1,White,8,J.D.; Rutgers University; 1976,2,https://www.warren.senate.gov/,https://bioguide.congress.gov/search/bio/W000817 ,,
|
||||
"Whitehouse, Sheldon",108,Rhode Island,RI,1,1,0.124737669119195,01/04/2007,12/31/2022,16,0,117,61.6,38.4,2018,https://twitter.com/SenWhitehouse,SenWhitehouse,N/A,N/A,10/20/1955,0,White,8,J.D.; University of Virginia; 1982,2,https://www.whitehouse.senate.gov/,https://bioguide.congress.gov/search/bio/W000802,,
|
||||
"Wicker, Roger F.",109,Mississippi,MS,0,1,0.763788502839721,12/31/2007,12/31/2022,15.0109589041096,0,117,58.5,39.5,2018,https://twitter.com/SenatorWicker,SenatorWicker,https://twitter.com/RogerWicker,RogerWicker,07/05/1951,0,White,8,J.D.; University of Mississippi; 1975,2,https://www.wicker.senate.gov/,https://bioguide.congress.gov/search/bio/W000437,,
|
||||
"Wyden, Ron",110,Oregon,OR,1,3,0.0591413132623803,02/05/1996,12/31/2022,26.9205479452055,0,117,56.7,33.6,2016,https://twitter.com/RonWyden,RonWyden,N/A,N/A,05/03/1949,0,White,8,J.D.; University of Oregon; 1974,2,https://www.wyden.senate.gov/,https://bioguide.congress.gov/search/bio/W000779,,
|
||||
"Young, Todd",111,Indiana,IN,0,3,0.677696674158218,01/05/2011,12/31/2022,11.9945205479452,1,117,52.1,42.4,2016,https://twitter.com/SenToddYoung,SenToddYoung,https://twitter.com/ToddYoungIN,ToddYoungIN,08/24/1972,0,White,8,J.D.; Robert H. McKinney; 2006,2,https://www.young.senate.gov/,https://bioguide.congress.gov/search/bio/Y000064,,
|
||||
"Jones, Gordon Douglas",5,Alabama,AL,1,2,0.632885678298333,01/03/2018,01/03/2021,3.0027397260274,1,116,49.9,48.4,2017,https://twitter.com/DougJones,DougJones,N/A,N/A,05/04/1954,0,White,8,"J.D.; Samford University, Cumberland School of Law; 1979",2,N/A,https://bioguide.congress.gov/search/bio/J000300/,special election to replace Jeff Sessions,
|
||||
"Loeffler, Kelly",6,Georgia,GA,0,2,0.904293903291947,01/06/2020,01/20/2021,1.04109589041096,1,116,N/A,N/A,*,https://twitter.com/KLoeffler,KLoeffler,https://twitter.com/senatorloeffler,senatorloeffler,11/27/1970,1,White,7,M.B.A.; Internationla Finance and Marketing; DePaul University Chicago; 1999,1,N/A,https://bioguide.congress.gov/search/bio/L000594,Appointed in 2019 after the resignation of Johnny Isakson but lost the 2020 election,
|
||||
"McSally, Martha",7,Arizona,AZ,0,2,*,01/03/2015,01/03/2019,1,1,116,N/A,N/A,*,https://twitter.com/MarthaMcSallyAZ,MarthaMcSallyAZ,https://twitter.com/marthamcsally,marthamcsally,03/22/1966,1,White,7,M.P.P.; John F. Kennedy School of Government,3,N/A,https://bioguide.congress.gov/search/bio/M001197,(left office Dec 2 2020),appointed in 2018 after death of John McCain but lot 2020 election
|
||||
"Perdue, David",8,Georgia,GA,0,2,0.914979462126755,01/06/2015,01/03/2021,5.9972602739726,1,116,53,45.1,2014,https://twitter.com/DavidPerdueGA,DavidPerdueGA,https://twitter.com/sendavidperdue,sendavidperdue,12/10/1949,0,White,7,M.S.; Georgia Institute of Technology; 1976,1,N/A,https://bioguide.congress.gov/search/bio/P000612,,
|
||||
"Roberts, Charles Patrick",9,Kansas,KS,0,2,0.822995787870405,01/07/1997,01/03/2021,24.0054794520548,1,116,53.3,42.5,2014,https://twitter.com/SenPatRoberts,SenPatRoberts,https://twitter.com/PatRoberts,PatRoberts,04/20/1936,0,White,6,"B.A.; Kansas State university, Manhattan; 1958",7,N/A,https://bioguide.congress.gov/search/bio/R000307,,
|
||||
"Udall, Tom",10,New Mexico,NM,1,2,0.259828450248573,01/06/2009,01/03/2021,12,1,116,55.4,44.6,2014,https://twitter.com/SenatorTomUdall,SenatorTomUdall,https://twitter.com/tomudall,tomudall,05/18/1948,0,White,8,"J.D.; University of New Mexico School of Law, Albuquerque, N.M.; 1977",2,N/A,https://bioguide.congress.gov/search/bio/U000039,,
|
||||
"Baldwin, Tammy",11,Wisconsin,WI,1,1,0.176999238019796,01/03/2013,12/31/2022,9.9972602739726,0,117,55.4,44.6,2018,https://twitter.com/SenatorBaldwin,SenatorBaldwin,https://twitter.com/tammybaldwin,tammybaldwin,02/11/1962,1,White,8,"J.D.; University of Wisconsin, Madison; 1989",2,https://www.baldwin.senate.gov/,https://bioguide.congress.gov/search/bio/B001230,,
|
||||
"Barrasso, John",12,Wyoming,WY,0,1,0.817902617377421,06/22/2007,12/31/2022,15.5369863013699,0,117,67.1,30.1,2018,https://twitter.com/SenJohnBarrasso,SenJohnBarrasso,https://twitter.com/barrassoforwyo,barrassoforwyo,07/21/1952,0,White,7,M.D.; Georgetown University School of Medicine; 1978,6,https://www.barrasso.senate.gov/,https://bioguide.congress.gov/search/bio/B001261,,
|
||||
"Bennet, Michael F.",13,Colorado,CO,1,3,0.248044568735702,01/21/2009,12/31/2022,13.9506849315069,0,117,49.1,45.4,2016,https://twitter.com/SenatorBennet,SenatorBennet,https://twitter.com/michaelbennet,michaelbennet,11/28/1964,0,White,8,J.D.; Yale Law School; 1993,2,https://www.bennet.senate.gov/,https://bioguide.congress.gov/search/bio/B001267,,
|
||||
"Blackburn, Marsha",14,Tennessee,TN,0,1,0.93228239890635,01/03/2019,12/31/2022,3.99452054794521,0,117,54.7,43.9,2018,https://twitter.com/MarshaBlackburn,MarshaBlackburn,N/A,N/A,06/06/1952,1,White,6,"B.S.; Home Economics; Mississippi State University, Starkville; 1973",1,https://www.blackburn.senate.gov/,https://bioguide.congress.gov/search/bio/B001243,,
|
||||
"Blumenthal, Richard",15,Connecticut,CT,1,3,0.0310655954121906,01/03/2010,12/31/2022,13,0,117,62.9,34.9,2016,https://twitter.com/SenBlumenthal,SenBlumenthal,N/A,N/A,02/13/1946,0,White,8,J.D.; Yale University; 1973,2,https://www.blumenthal.senate.gov/,https://bioguide.congress.gov/search/bio/B001277,,
|
||||
"Blunt, Roy",16,Missouri,MO,0,3,0.584409139223541,01/03/2011,12/31/2022,12,1,117,49.4,46.2,2016,https://twitter.com/RoyBlunt,RoyBlunt,N/A,N/A,01/10/1950,0,White,7,"M.A.; Missouri State University ,Springfield; 1972",5,N/A,https://bioguide.congress.gov/search/bio/B000575,,
|
||||
"Booker, Cory A.",17,New Jersey,NJ,1,2,0.0455802980872292,10/31/2013,12/31/2022,12,0,117,57.2,40.9,2020,https://twitter.com/senbooker,senbooker,https://twitter.com/CoryBooker,CoryBooker,04/27/1969,0,African-American; Asian-American,8,J.D.; Yale Law School; 1997,2,https://www.booker.senate.gov/,https://bioguide.congress.gov/search/bio/B001288,,
|
||||
"Boozman, John",18,Arkansas,AR,0,3,0.768699282926499,01/05/2011,12/31/2022,11.9945205479452,0,117,59.8,36.2,2016,https://twitter.com/JohnBoozman,JohnBoozman,N/A,N/A,12/10/1950,0,White,6,Southern College of Optometry; 1977,6,https://www.boozman.senate.gov/,https://bioguide.congress.gov/search/bio/B001236,,
|
||||
"Braun, Michael",19,Indiana,IN,0,1,0.98106874319906,01/03/2019,12/31/2022,3.99452054794521,0,117,50.9,45,2018,https://twitter.com/SenatorBraun,SenatorBraun,N/A,N/A,03/24/1954,0,White,7,M.B.A.; Harvard Business School; 1978,1,https://www.braun.senate.gov/,https://bioguide.congress.gov/search/bio/B001310,,
|
||||
"Brown, Sherrod",20,Ohio,OH,1,1,0.0923940264109351,01/04/2007,12/31/2022,16,0,117,53.4,46.6,2018,https://twitter.com/SenSherrodBrown,SenSherrodBrown,https://twitter.com/SherrodBrown,SherrodBrown,11/09/1952,0,White,7,M.a.; Education; Ohio State University; 1981,5,https://www.brown.senate.gov/,https://bioguide.congress.gov/search/bio/B000944,,
|
||||
"Burr, Richard",21,North Carolina,NC,0,3,0.605472891780936,01/03/2001,12/31/2022,22.0054794520548,1,117,51.1,45.3,2016,https://twitter.com/SenatorBurr,SenatorBurr,N/A,N/A,11/30/1955,0,White,6,B.A.; Communications; Wake Forest University; 1978,1,N/A,https://bioguide.congress.gov/search/bio/B001135,,
|
||||
"Cantwell, Maria",22,Washington,WA,1,1,0.216591445478212,01/03/2001,12/31/2022,22.0054794520548,0,117,58.4,41.6,2018,https://twitter.com/SenatorCantwell,SenatorCantwell,N/A,N/A,10/13/1958,1,White,6,B.A.; Public Administration; Miami University of Ohio; 1980,1,https://www.cantwell.senate.gov/,https://bioguide.congress.gov/search/bio/C000127,,
|
||||
"Capito, Shelley Moore",23,West Virginia,WV,0,2,0.61478303011512,01/06/2015,12/31/2022,7.98904109589041,0,117,70.3,27,2020,https://twitter.com/SenCapito,SenCapito,N/A,N/A,11/26/1953,1,White,7,M. Ed.; University of Virginia; 1976,5,https://www.capito.senate.gov/,https://bioguide.congress.gov/search/bio/C001047,,
|
||||
"Cardin, Benjamin L.",24,Maryland,MD,1,1,0.1994990268606,01/04/2007,12/31/2022,16,0,117,64.9,30.3,2018,https://twitter.com/SenatorCardin,SenatorCardin,N/A,N/A,10/05/1943,0,White,8,J.D.; University of Maryland; 1967,2,https://www.cardin.senate.gov/,https://bioguide.congress.gov/search/bio/C000141,,
|
||||
"Carper, Thomas R.",25,Delaware,DE,1,1,0.309479384969288,01/03/2001,12/31/2022,22.0054794520548,0,117,60,37.8,2018,https://twitter.com/SenatorCarper,SenatorCarper,N/A,N/A,01/23/1947,0,White,7,M.B.A.; University of Delaware; 1975,3,https://www.carper.senate.gov/,https://bioguide.congress.gov/search/bio/C000174,,
|
||||
"Casey, Robert P., Jr.",26,Pennsylvania,PA,1,1,0.171897216341815,01/04/2007,12/31/2022,16,0,117,55.7,42.6,2018,https://twitter.com/SenBobCasey,SenBobCasey,https://twitter.com/Bob_Casey,Bob_Casey,04/13/1960,0,White,8,J.D.; Catholic University of America; 1988,2,https://www.casey.senate.gov/,https://bioguide.congress.gov/search/bio/C001070,,
|
||||
"Cassidy, Bill",27,Louisiana,LA,0,2,0.682348710788942,01/06/2015,12/31/2022,7.98904109589041,0,117,59.3,19,2020,https://twitter.com/SenBillCassidy,SenBillCassidy,https://twitter.com/BillCassidy,BillCassidy,09/28/1957,0,White,7,M.D.; Louisiana State University; 1979,6,https://www.cassidy.senate.gov/,https://bioguide.congress.gov/search/bio/C001075,,
|
||||
"Collins, Susan M.",28,Maine,ME,0,2,0.448622425849401,01/07/1997,12/31/2022,25.9972602739726,0,117,51,42.4,2020,https://twitter.com/SenatorCollins,SenatorCollins,N/A,N/A,12/07/1952,1,White,6,Bachelor in Government; St. Lawrence University; 1975,0,https://www.collins.senate.gov/,https://bioguide.congress.gov/search/bio/C001035,,
|
||||
"Coons, Christopher A.",29,Delaware,DE,1,2,0.338422715351401,11/15/2010,12/31/2022,12.1342465753425,0,117,59.4,37.9,2020,https://twitter.com/ChrisCoons,ChrisCoons,N/A,N/A,09/09/1963,0,White,8,J.D.; Yale Law School; 1992,2,https://www.coons.senate.gov/,https://bioguide.congress.gov/search/bio/C001088,,
|
||||
"Cornyn, John",30,Texas,TX,0,2,0.772226738391321,11/30/2002,12/31/2022,20.0986301369863,0,117,53.5,43.9,2020,https://twitter.com/JohnCornyn,JohnCornyn,N/A,N/A,02/02/1952,0,White,8,J.D.; St. Mary’s School of Law; 1977,2,https://www.cornyn.senate.gov/,https://bioguide.congress.gov/search/bio/C001056,,
|
||||
"Cortez Masto, Catherine",31,Nevada,NV,1,3,0.236574567369409,01/03/2017,12/31/2022,5.99452054794521,0,117,47.1,44.7,2016,https://twitter.com/SenCortezMasto,SenCortezMasto,https://twitter.com/CortezMasto,CortezMasto,03/29/1964,1,Hispanic; White,8,J.D.; Gonzaga University School of Law; 1990,2,https://www.cortezmasto.senate.gov/,https://bioguide.congress.gov/search/bio/C001113,,
|
||||
"Cotton, Tom",32,Arkansas,AR,0,2,0.876390364042756,01/06/2015,12/31/2022,7.98904109589041,0,117,66.5,33.5,2020,https://twitter.com/SenTomCotton,SenTomCotton,https://twitter.com/TomCottonAR,TomCottonAR,05/13/1977,0,White,8,J.D.; Harvard University; 2002,2,https://www.cotton.senate.gov/,https://bioguide.congress.gov/search/bio/C001095,,
|
||||
"Cramer, Kevin",33,North Dakota,ND,0,1,0.910896298032277,01/03/2019,12/31/2022,3.99452054794521,0,117,55.5,44.5,2018,https://twitter.com/SenKevinCramer,SenKevinCramer,https://twitter.com/kevincramer,kevincramer,01/21/1961,0,White,7,M.A.; Management; University o fMary; 2003,0,https://www.cramer.senate.gov/,https://bioguide.congress.gov/search/bio/C001096,,
|
||||
"Crapo, Michael",34,Idaho,ID,0,3,0.823331951918519,01/06/1999,12/31/2022,24,0,117,66.1,27.8,2016,https://twitter.com/MikeCrapo,MikeCrapo,N/A,N/A,05/20/1951,0,White,8,J.D.; Harvard University; 1977,2,https://www.crapo.senate.gov/,https://bioguide.congress.gov/search/bio/C000880,,
|
||||
"Cruz, Ted",35,Texas,TX,0,1,0.944056385174951,01/03/2013,12/31/2022,9.9972602739726,0,117,50.9,48.3,2018,https://twitter.com/SenTedCruz,SenTedCruz,https://twitter.com/tedcruz,tedcruz,12/22/1970,0,Hispanic; White,8,J.D.; Harvard University; 1995,2,https://www.cruz.senate.gov/,https://bioguide.congress.gov/search/bio/C001098,,
|
||||
"Daines, Steve",36,Montana,MT,0,2,0.859322244752884,01/06/2015,12/31/2022,7.98904109589041,0,117,55,45,2020,https://twitter.com/SteveDaines,SteveDaines,N/A,N/A,08/20/1962,0,White,6,B.S.; Chemical Engineering; Montana State University; 1984,1,https://www.daines.senate.gov/,https://bioguide.congress.gov/search/bio/D000618,,
|
||||
"Duckworth, Tammy",37,Illinois,IL,1,3,0.0944404184553066,01/03/2017,12/31/2022,5.99452054794521,0,117,54.4,40.2,2016,https://twitter.com/SenDuckworth,SenDuckworth,https://twitter.com/tammyduckworth,tammyduckworth,03/12/1968,1,Asian; White,8,PhD in human services; Capella University School of Public Service Leadership; 2015,3,https://www.duckworth.senate.gov/,https://bioguide.congress.gov/search/bio/D000622,,
|
||||
"Durbin, Richard J.",38,Illinois,IL,1,2,0.0855733771029607,01/07/1997,12/31/2022,25.9972602739726,0,117,54.9,38.9,2020,https://twitter.com/SenatorDurbin,SenatorDurbin,https://twitter.com/DickDurbin,DickDurbin,11/21/1944,0,White,8,J.D.; Georgetown University; 1969,2,https://www.durbin.senate.gov/,https://bioguide.congress.gov/search/bio/D000563,,
|
||||
"Ernst, Joni",39,Iowa,IA,0,2,0.826265400967212,01/06/2015,12/31/2022,7.98904109589041,0,117,51.8,45.2,2020,https://twitter.com/SenJoniErnst,SenJoniErnst,https://twitter.com/joniernst,joniernst,07/01/1970,1,White,7,M.P.A.; Columbus State University; 1995,3,https://www.ernst.senate.gov/,https://bioguide.congress.gov/search/bio/E000295,,
|
||||
"Feinstein, Dianne",40,California,CA,1,1,0.150865658191444,11/10/1992,12/31/2022,30.158904109589,0,117,54.2,45.8,2018,https://twitter.com/SenFeinstein,SenFeinstein,https://twitter.com/DianneFeinstein,DianneFeinstein,06/22/1933,1,White,6,B.A.; History; Stanford University; 1955,0,https://www.feinstein.senate.gov/public/,https://bioguide.congress.gov/search/bio/F000062,,
|
||||
"Fischer, Debra",41,Nebraska,NE,0,1,0.688576408222131,01/03/2013,12/31/2022,9.9972602739726,0,117,57.7,38.6,2018,https://twitter.com/SenatorFischer,SenatorFischer,N/A,N/A,03/01/1951,1,White,6,B.S.; Education; University of Nebraska; 1988,0,https://www.fischer.senate.gov/,https://bioguide.congress.gov/search/bio/F000463,,
|
||||
"Gillibrand, Kirsten E.",42,New York,NY,1,1,0.12072202063417,01/27/2009,12/31/2022,13.9342465753425,0,117,67,33,2018,https://twitter.com/SenGillibrand,SenGillibrand,https://twitter.com/gillibrandny,gillibrandny,12/09/1966,1,White,8,J.D.; University of California; 1991,2,https://www.gillibrand.senate.gov/,https://bioguide.congress.gov/search/bio/G000555,,
|
||||
"Graham, Lindsey",43,South Carolina,SC,0,2,0.619070797359753,01/07/2003,12/31/2022,19.9945205479452,0,117,54.5,44.2,2020,https://twitter.com/LindseyGrahamSC,LindseyGrahamSC,https://twitter.com/grahamblog,grahamblog,07/09/1955,0,White,8,J.D.; University of South Carolina; 1981,2,https://www.lgraham.senate.gov/,https://bioguide.congress.gov/search/bio/G000359 ,,
|
||||
"Grassley, Chuck",44,Iowa,IA,0,3,0.670073592619545,01/05/1981,12/31/2022,42.013698630137,0,117,60.2,35.7,2016,https://twitter.com/ChuckGrassley,ChuckGrassley,N/A,N/A,09/17/1933,0,White,7,M.A.; Political Science; University of Northern Iowa; 1956,0,https://www.grassley.senate.gov/,https://bioguide.congress.gov/search/bio/G000386,,
|
||||
"Hagerty, Bill",45,Tennessee,TN,0,2,0.857410027434407,01/03/2021,12/31/2022,1.99178082191781,0,117,62.2,35.2,2020,https://twitter.com/SenatorHagerty,SenatorHagerty,https://twitter.com/billhagertytn,billhagertytn,08/14/1959,0,White,8,J.D.; Vanderbilt Law School; 1984,0,https://www.hagerty.senate.gov/,https://bioguide.congress.gov/search/bio/H000601,,
|
||||
"Hassan, Margaret Wood",46,New Hampshire,NH,1,3,0.43611907238278,01/03/2017,12/31/2022,5.99452054794521,0,117,48,47.9,2016,https://twitter.com/SenatorHassan,SenatorHassan,https://twitter.com/Maggie_Hassan,Maggie_Hassan,02/27/1958,1,White,8,J.D.; Northeastern University School of law; 1985,11,https://www.hassan.senate.gov/,https://bioguide.congress.gov/search/bio/H001076,,
|
||||
"Hawley, Josh",47,Missouri,MO,0,1,0.864366195602263,01/03/2019,12/31/2022,3.99452054794521,0,117,51.4,45.6,2018,https://twitter.com/HawleyMO,HawleyMO,N/A,N/A,12/31/1979,0,White,8,J.D.; Yale Law School; 2006,2,https://www.hawley.senate.gov/,https://bioguide.congress.gov/search/bio/H001089,,
|
||||
"Heinrich, Martin",48,New Mexico,NM,1,1,0.2007037353465,01/03/2013,12/31/2022,9.9972602739726,0,117,54.1,30.5,2018,https://twitter.com/MartinHeinrich,MartinHeinrich,https://twitter.com/senatorheinrich,senatorheinrich,10/17/1971,0,White,6,B.S.; Mechanical Engineering; University of Missouri; 1995,12,https://www.heinrich.senate.gov/,https://bioguide.congress.gov/search/bio/H001046,,
|
||||
"Hickenlooper, John W.",49,Colorado,CO,1,2,0.335030323955882,01/03/2021,12/31/2022,1.99178082191781,0,117,53.5,44.2,2020,https://twitter.com/SenatorHick,SenatorHick,https://twitter.com/hickenlooper,hickenlooper,02/07/1952,0,White,7,M.A.; Geology; Wesleyan University; 1980,0,https://www.hickenlooper.senate.gov/,https://bioguide.congress.gov/search/bio/H000273,,
|
||||
"Hirono, Mazie K.",50,Hawaii,HI,1,1,0.0715447123166643,01/03/2013,12/31/2022,9.9972602739726,0,117,71.2,28.8,2018,https://twitter.com/maziehirono,maziehirono,https://twitter.com/mazieforhawaii,mazieforhawaii,11/03/1947,1,Asian,8,J.D.; Georgetown University; 1978,0,https://www.hirono.senate.gov/,https://bioguide.congress.gov/search/bio/H001042,,
|
||||
"Hoeven, John",51,North Dakota,ND,0,3,0.815683863264003,01/05/2011,12/31/2022,11.9945205479452,0,117,78.6,17,2016,https://twitter.com/SenJohnHoeven,SenJohnHoeven,N/A,N/A,03/13/1957,0,White,7,M.B.A.; Northwestern University; 1981,12,https://www.hoeven.senate.gov/,https://bioguide.congress.gov/search/bio/H001061,,
|
||||
"Hyde-Smith, Cindy",52,Mississippi,MS,0,2,0.868059764299163,04/09/2018,12/31/2022,4.73150684931507,0,117,54.1,44.1,2020,https://twitter.com/SenHydeSmith,SenHydeSmith,https://twitter.com/cindyhydesmith,cindyhydesmith,05/10/1959,1,White,6,"B.A.; Criminal justice, political science; University of Southern Mississippi; 1981",0,https://www.hydesmith.senate.gov/,https://bioguide.congress.gov/search/bio/H001079 ,,
|
||||
"Inhofe, James",53,Oklahoma,OK,0,2,0.880238318204784,11/17/1994,12/31/2022,28.1397260273973,1,117,62.9,32.8,2020,https://twitter.com/JimInhofe,JimInhofe,N/A,N/A,11/17/1934,0,White,6,B.A.; Economics; University of Tulsa; 1973,0,N/A,https://bioguide.congress.gov/search/bio/I000024 ,,
|
||||
"Johnson, Ron",54,Wisconsin,WI,0,3,0.743401705863958,01/05/2011,12/31/2022,11.9945205479452,0,117,50.2,46.8,2016,https://twitter.com/SenRonJohnson,SenRonJohnson,https://twitter.com/ronjohnsonwi,ronjohnsonwi,04/08/1955,0,White,6,B.S.; Business and Accounting; University of Minnesota; 1977,4,https://www.ronjohnson.senate.gov/,https://bioguide.congress.gov/search/bio/J000293,,
|
||||
"Kaine, Tim",55,Virginia,VA,1,1,0.203600708089391,01/03/2013,12/31/2022,9.9972602739726,0,117,57.1,41.1,2018,https://twitter.com/timkaine,timkaine,N/A,N/A,02/26/1958,0,White,8,J.D.; Harvard University; 1983,11,https://www.kaine.senate.gov/,https://bioguide.congress.gov/search/bio/K000384,,
|
||||
"Kelly, Mark",56,Arizona,AZ,1,3,0.399793347847799,12/02/2020,12/31/2022,2.07945205479452,0,117,51.2,48.8,2020,https://twitter.com/SenMarkKelly,SenMarkKelly,https://twitter.com/CaptMarkKelly,CaptMarkKelly,02/21/1964,0,White,7,M.S.; Aeronautical Engineering; U.S. Naval Postgraduate School,3,https://www.kelly.senate.gov/,https://bioguide.congress.gov/search/bio/K000377,,
|
||||
"Kennedy, John Neely",57,Louisiana,LA,0,3,0.785684351248518,01/03/2017,12/31/2022,5.99452054794521,0,117,60.7,39.3,2016,https://twitter.com/SenJohnKennedy,SenJohnKennedy,https://twitter.com/JohnKennedyLA,JohnKennedyLA,11/21/1951,0,White,8,J.D.; University of Virginia School of LAw; 1977,11,https://www.kennedy.senate.gov/,https://bioguide.congress.gov/search/bio/K000393,,
|
||||
"King, Angus S., Jr.",58,Maine,ME,2,1,0.346033257048853,01/03/2013,12/31/2022,9.9972602739726,0,117,54.3,35.2,2018,https://twitter.com/SenAngusKing,SenAngusKing,N/A,N/A,03/31/1944,0,White,8,J.D.; University of Virginia; 1969,2,https://www.king.senate.gov/,https://bioguide.congress.gov/search/bio/K000383 ,,
|
||||
"Klobuchar, Amy",59,Minnesota,MN,1,1,0.130504324943533,01/04/2007,12/31/2022,16,0,117,60.3,36.2,2018,https://twitter.com/SenAmyKlobuchar,SenAmyKlobuchar,https://twitter.com/amyklobuchar,amyklobuchar,05/25/1960,1,White,8,"J.D.; University of Chicago, 1985",2,https://www.klobuchar.senate.gov/,https://bioguide.congress.gov/search/bio/K000367 ,,
|
||||
"Lankford, James",60,Oklahoma,OK,0,3,0.89992933687588,01/03/2015,12/31/2022,7.9972602739726,0,117,67.7,24.6,2016,https://twitter.com/SenatorLankford,SenatorLankford,https://twitter.com/jameslankford,jameslankford,03/04/1968,0,White,7,M.Div.; Southwestern Theological Baptist Seminary; 1994,5,https://www.lankford.senate.gov/,https://bioguide.congress.gov/search/bio/L000575,,
|
||||
"Leahy, Patrick",61,Vermont,VT,1,3,0.144121081911654,01/14/1975,12/31/2022,47.9945205479452,1,117,61.3,33,2016,https://twitter.com/SenatorLeahy,SenatorLeahy,N/A,N/A,03/31/1940,0,White,8,J.D.; Georgetown University; 1964,2,N/A,https://bioguide.congress.gov/search/bio/L000174,,
|
||||
"Lee, Mike",62,Utah,UT,0,3,0.753748787807473,01/05/2011,12/31/2022,11.9945205479452,0,117,68,27.4,2016,https://twitter.com/SenMikeLee,SenMikeLee,https://twitter.com/BasedMikeLee,BasedMikeLee,06/04/1971,0,White,8,J.D.; Brigham Young university; 1997,2,https://www.lee.senate.gov/,https://bioguide.congress.gov/search/bio/L000577,,
|
||||
"Luján, Ben Ray",63,New Mexico,NM,1,2,0.174860888138848,01/03/2021,12/31/2022,1.99178082191781,0,117,51.7,45.6,2020,https://twitter.com/SenatorLujan,SenatorLujan,https://twitter.com/benraylujan,benraylujan,06/07/1972,0,Hispanic,6,B.B.A.; New Mexico Highlands University; 2007,0,https://www.lujan.senate.gov/,https://bioguide.congress.gov/search/bio/L000570 ,,
|
||||
"Lummis, Cynthia M.",64,Wyoming,WY,0,2,0.893292958108508,01/03/2021,12/31/2022,1.99178082191781,0,117,73.1,26.9,2020,https://twitter.com/SenLummis,SenLummis,https://twitter.com/CynthiaMLummis,CynthiaMLummis,09/10/1954,1,White,8,"J.D.; University of Wyoming College of Law, Laramie, Wyo.; 1985",11,https://www.lummis.senate.gov/,https://bioguide.congress.gov/search/bio/L000571 ,,
|
||||
"Manchin, Joe, III",65,West Virginia,WV,1,1,0.446686774398077,11/15/2010,12/31/2022,12.1342465753425,0,117,49.6,46.3,2018,https://twitter.com/Sen_JoeManchin,Sen_JoeManchin,https://twitter.com/JoeManchinWV,JoeManchinWV,08/24/1947,0,White,6,B.A.; Business Administration; West Virginia University; 1970,12,https://www.manchin.senate.gov/,https://bioguide.congress.gov/search/bio/M001183 ,,
|
||||
"Markey, Edward J.",66,Massachusetts,MA,1,2,0.0139659683705929,07/16/2013,12/31/2022,9.46575342465753,0,117,66.2,33,2020,https://twitter.com/SenMarkey,SenMarkey,https://twitter.com/edmarkey,edmarkey,07/11/1946,0,White,8,J.D.; Boston College Law School; 1972,11,https://www.markey.senate.gov/,https://bioguide.congress.gov/search/bio/M000133,,
|
||||
"Marshall, Roger",67,Kansas,KS,0,2,0.882124792228652,01/03/2021,12/31/2022,1.99178082191781,0,117,53.2,41.8,2020,https://twitter.com/SenatorMarshall,SenatorMarshall,https://twitter.com/RogerMarshallMD,RogerMarshallMD,08/09/1960,0,White,7,M.D.; University of Kansas School of Medicine; 1987,6,https://www.marshall.senate.gov/,https://bioguide.congress.gov/search/bio/M001198,,
|
||||
"McConnell, Mitch",68,Kentucky,KY,0,2,0.599687533584357,01/03/1985,12/31/2022,38.0164383561644,0,117,57.8,38.2,2020,https://twitter.com/LeaderMcConnell,LeaderMcConnell,N/A,N/A,02/20/1942,0,White,8,J.D.; Kentucky Law School; 1967,11,https://www.mcconnell.senate.gov/,https://bioguide.congress.gov/search/bio/M000355,,
|
||||
"Menendez, Robert",69,New Jersey,NJ,1,1,0.191515157461704,01/18/2006,12/31/2022,16.9616438356164,0,117,54,42.8,2018,https://twitter.com/SenatorMenendez,SenatorMenendez,N/A,N/A,01/01/1954,0,Hispanic,8,J.D.; Rutgers university of Law; 1979,11,https://www.menendez.senate.gov/,https://bioguide.congress.gov/search/bio/M000639,,
|
||||
"Merkley, Jeff",70,Oregon,OR,1,2,0.0355414098997263,01/06/2009,12/31/2022,13.9917808219178,0,117,56.9,39.3,2020,https://twitter.com/SenJeffMerkley,SenJeffMerkley,https://twitter.com/jeffmerkley,jeffmerkley,10/24/1956,0,White,7,M.P.A.; Princeton University; 1982,0,https://www.merkley.senate.gov/,https://bioguide.congress.gov/search/bio/M001176,,
|
||||
"Moran, Jerry",71,Kansas,KS,0,3,0.716270292467902,01/05/2011,12/31/2022,11.9945205479452,0,117,62.4,32.1,2016,https://twitter.com/JerryMoran,JerryMoran,N/A,N/A,05/29/1954,0,White,8,J.D.; Kansas University School of Law; 1981,11,https://www.moran.senate.gov/public/,https://bioguide.congress.gov/search/bio/M000934 ,,
|
||||
"Murkowski, Lisa",72,Alaska,AK,0,3,0.473296745648617,12/20/2002,12/31/2022,20.0438356164384,0,117,44.3,29.5,2016,https://twitter.com/lisamurkowski,lisamurkowski,https://twitter.com/lisaforsenate,lisaforsenate,05/22/1957,1,White,8,J.D.; Willamette College of Law; 1985,2,https://www.murkowski.senate.gov/,https://bioguide.congress.gov/search/bio/M001153,,
|
||||
"Murphy, Christopher",73,Connecticut,CT,1,1,0.152635018959264,01/03/2013,12/31/2022,9.9972602739726,0,117,59.5,39.4,2018,https://twitter.com/ChrisMurphyCT,ChrisMurphyCT,N/A,N/A,08/03/1973,0,White,8,J.D.; University of Connecticut; 2002,11,https://www.murphy.senate.gov/,https://bioguide.congress.gov/search/bio/M001169,,
|
||||
"Murray, Patty",74,Washington,WA,1,3,0.142703588817088,01/05/1993,12/31/2022,30.0054794520548,0,117,59.1,40.9,2016,https://twitter.com/PattyMurray,PattyMurray,https://twitter.com/murraycampaign,murraycampaign,10/11/1950,1,White,6,B.A.; Physical Education; Washington State University; 1972,5,https://www.murray.senate.gov/,https://bioguide.congress.gov/search/bio/M001111,,
|
||||
"Ossoff, Jon",75,Georgia,GA,1,2,0.303405364928085,01/20/2021,12/31/2022,1.94520547945205,0,117,50.6,49.4,2020,https://twitter.com/SenOssoff,SenOssoff,https://twitter.com/ossoff,ossoff,02/16/1987,0,White,7,M.S.; International Politicla Economy; London School of Economics; 2013,7,https://www.ossoff.senate.gov/,https://bioguide.congress.gov/search/bio/O000174,,
|
||||
"Padilla, Alex",76,California,CA,1,3,0.0200324383981554,01/20/2021,12/31/2022,1.94520547945205,0,117,N/A,N/A,*,https://twitter.com/SenAlexPadilla,SenAlexPadilla,https://twitter.com/AlexPadilla4CA,AlexPadilla4CA,03/22/1973,0,Hispanic,6,B.S.; Mechanical Engineering; MIT; 1994,9,https://www.padilla.senate.gov/,https://bioguide.congress.gov/search/bio/P000145,appointed in 2020 to replace Kamala Harris ,
|
||||
"Paul, Rand",77,Kentucky,KY,0,3,0.684883322748808,01/05/2011,12/31/2022,11.9945205479452,0,117,57.3,42.7,2016,https://twitter.com/senrandpaul,senrandpaul,https://twitter.com/RandPaul,RandPaul,01/07/1963,0,White,7,M.D.; Duke University; 1988,6,https://www.paul.senate.gov/,https://bioguide.congress.gov/search/bio/P000603,,
|
||||
"Peters, Gary C.",78,Michigan,MI,1,2,0.355796587683312,01/06/2015,12/31/2022,7.98904109589041,0,117,49.9,48.2,2020,https://twitter.com/SenGaryPeters,SenGaryPeters,https://twitter.com/garypeters,garypeters,12/01/1958,0,White,8,J.D.; Wayne State University; 1989,2,https://www.peters.senate.gov/,https://bioguide.congress.gov/search/bio/P000595,,
|
||||
"Portman, Robert",79,Ohio,OH,0,3,0.548120690430407,01/05/2011,12/31/2022,11.9945205479452,1,117,58.3,36.9,2016,https://twitter.com/senrobportman,senrobportman,N/A,N/A,12/19/1955,0,White,8,J.D.; University of Michigan; 1985,2,N/A,https://bioguide.congress.gov/search/bio/P000449,,
|
||||
"Reed, John F.",80,Rhode Island,RI,1,2,0.145861826443275,01/07/1997,12/31/2022,25.9972602739726,0,117,66.6,33.4,2020,https://twitter.com/SenJackReed,SenJackReed,N/A,N/A,11/12/1949,0,White,8,J.D.; Harvard University; 1982,2,https://www.reed.senate.gov/,https://bioguide.congress.gov/search/bio/R000122,,
|
||||
"Risch, James E.",81,Idaho,ID,0,2,0.82910906209038,01/06/2009,12/31/2022,13.9917808219178,0,117,62.6,33.2,2020,https://twitter.com/SenatorRisch,SenatorRisch,N/A,N/A,05/03/1943,0,White,8,J.D.; University of Idaho; 1968,2,https://www.risch.senate.gov/,https://bioguide.congress.gov/search/bio/R000584,,
|
||||
"Romney, Mitt",82,Utah,UT,0,1,0.596688837978771,01/03/2019,12/31/2022,3.99452054794521,0,117,62.6,30.9,2018,https://twitter.com/SenatorRomney,SenatorRomney,https://twitter.com/mittromney,mittromney,03/12/1947,0,White,7,M.B.A.; Harvard Business School; 1975,1,https://www.romney.senate.gov/,https://bioguide.congress.gov/search/bio/R000615,,
|
||||
"Rosen, Jacky",83,Nevada,NV,1,1,0.308548351377894,01/03/2019,12/31/2022,3.99452054794521,0,117,50.4,45.4,2018,https://twitter.com/SenJackyRosen,SenJackyRosen,https://twitter.com/RosenforNevada,RosenforNevada,08/02/1957,1,White,6,B.A.; Psychology; University of Minnesota; 1979,1,https://www.rosen.senate.gov/,https://bioguide.congress.gov/search/bio/R000608,,
|
||||
"Rounds, Mike",84,South Dakota,SD,0,2,0.784008560585577,01/06/2015,12/31/2022,7.98904109589041,0,117,65.7,34.3,2020,https://twitter.com/SenatorRounds,SenatorRounds,N/A,N/A,10/24/1954,0,White,6,B.S.; Political Science; South Dakota State University; 1977,1,https://www.rounds.senate.gov/,https://bioguide.congress.gov/search/bio/R000605,,
|
||||
"Rubio, Marco",85,Florida,FL,0,3,0.831181764071725,01/05/2011,12/31/2022,11.9945205479452,0,117,52,44.3,2016,https://twitter.com/senmarcorubio,senmarcorubio,https://twitter.com/marcorubio,marcorubio,05/28/1971,0,Hispanic,8,J.D.; University of Miami; 1996,2,https://www.rubio.senate.gov/,https://bioguide.congress.gov/search/bio/R000595,,
|
||||
"Sanders, Bernard",86,Vermont,VT,2,1,0,01/04/2007,12/31/2022,16,0,117,67.4,27.5,2018,https://twitter.com/SenSanders,SenSanders,https://twitter.com/BernieSanders,BernieSanders,09/08/1941,0,White,6,B.A.; Political Science; University of Chicago; 1964,0,https://www.sanders.senate.gov/,https://bioguide.congress.gov/search/bio/S000033,,
|
||||
"Sasse, Benjamin",87,Nebraska,NE,0,2,0.684229649213868,01/06/2015,12/31/2022,7.98904109589041,1,117,62.7,24.4,2020,https://twitter.com/sensasse,sensasse,https://twitter.com/BenSasse,BenSasse,02/22/1972,0,White,8,PhD in History; Yale University; 2004,5,N/A,https://bioguide.congress.gov/search/bio/S001197,,
|
||||
"Schatz, Brian",88,Hawaii ,HI,1,3,0.213250458593456,12/27/2012,12/31/2022,10.0164383561644,0,117,73.6,22.2,2016,https://twitter.com/brianschatz,brianschatz,https://twitter.com/SenBrianSchatz,SenBrianSchatz,10/20/1972,0,White,6,B.A.; Philosophy; Pomona College; 1994,5,https://www.schatz.senate.gov/,https://bioguide.congress.gov/search/bio/S001194,,
|
||||
"Schumer, Charles E.",89,New York,NY,1,3,0.239789022209428,01/06/1999,12/31/2022,24,0,117,70.4,27.4,2016,https://twitter.com/SenSchumer,SenSchumer,https://twitter.com/chuckschumer,chuckschumer,11/23/1950,0,White,8,J.D.; Harvard University; 1974,2,https://www.schumer.senate.gov/,https://bioguide.congress.gov/search/bio/S000148 ,,
|
||||
"Scott, Rick",90,Florida,FL,0,1,1,01/08/2019,12/31/2022,3.98082191780822,0,117,50.1,49.9,2018,https://twitter.com/SenRickScott,SenRickScott,https://twitter.com/scottforflorida,scottforflorida,12/01/1952,0,White,8,J.D.; Southern Methodist University; 1978,2,https://www.rickscott.senate.gov/,https://bioguide.congress.gov/search/bio/S001217,,
|
||||
"Scott, Tim",91,South Carolina,SC,0,3,0.781356077518849,01/03/2013,12/31/2022,9.9972602739726,0,117,60.6,37,2016,https://twitter.com/SenatorTimScott,SenatorTimScott,https://twitter.com/votetimscott,votetimscott,09/19/1965,0,African-American,6,B.S.; Political Science; Charleston Southern University; 1988 ,1,https://www.scott.senate.gov/,https://bioguide.congress.gov/search/bio/S001184,,
|
||||
"Shaheen, Jeanne",92,New Hampshire,NH,1,2,0.2925665319541,01/06/2009,12/31/2022,13.9917808219178,0,117,56.6,41,2020,https://twitter.com/SenatorShaheen,SenatorShaheen,https://twitter.com/JeanneShaheen,JeanneShaheen,01/28/1947,1,White,7,M.S.S.; University of Mississippi; 1973,5,https://www.shaheen.senate.gov/,https://bioguide.congress.gov/search/bio/S001181,,
|
||||
"Shelby, Richard",93,Alabama,AL,0,3,0.577739000839365,01/06/1987,12/31/2022,36.0082191780822,1,117,64.2,35.8,2016,https://twitter.com/SenShelby,SenShelby,N/A,N/A,05/06/1934,0,White,6,LL.B.; University of Alabama; 1963,2,N/A,https://bioguide.congress.gov/search/bio/S000320,,
|
||||
"Sinema, Kyrsten",94,Arizona,AZ,2,1,0.500967034663567,01/03/2019,12/31/2022,3.99452054794521,0,117,50,47.6,2018,https://twitter.com/SenatorSinema,SenatorSinema,https://twitter.com/kyrstensinema,kyrstensinema,07/12/1976,1,White,8,PhD in Justice Studies; Arizona State University; 2012,2,https://www.sinema.senate.gov/,https://bioguide.congress.gov/search/bio/S001191,,
|
||||
"Smith, Tina",95,Minnesota,MN,1,2,0.0756533259297989,01/03/2018,12/31/2022,4.99452054794521,0,117,48.8,43.5,2020,https://twitter.com/SenTinaSmith,SenTinaSmith,https://twitter.com/TinaSmithMN,TinaSmithMN,03/04/1958,1,White,7,M.B.A. Dartmouth College; 1984,1,https://www.smith.senate.gov/,https://bioguide.congress.gov/search/bio/S001203,,
|
||||
"Stabenow, Debbie",96,Michigan,MI,1,1,0.221949395648287,01/03/2001,12/31/2022,22.0054794520548,0,117,52.3,45.8,2018,https://twitter.com/SenStabenow,SenStabenow,https://twitter.com/stabenow,stabenow,04/29/1950,1,White,7,M.S.W.; Michigan State University; 1975,5,https://www.stabenow.senate.gov/,https://bioguide.congress.gov/search/bio/S000770,,
|
||||
"Sullivan, Dan",97,Alaska,AK,0,2,0.652100683642255,01/06/2015,12/31/2022,7.98904109589041,0,117,53.9,41.2,2020,https://twitter.com/SenDanSullivan,SenDanSullivan,N/A,N/A,11/13/1964,0,White,8,J.D.; Georgetown University; 1993,2,https://www.sullivan.senate.gov/,https://bioguide.congress.gov/search/bio/S001198,,
|
||||
"Tester, Jon",98,Montana,MT,1,1,0.377646486433112,01/04/2007,12/31/2022,16,0,117,50.3,46.8,2018,https://twitter.com/SenatorTester,SenatorTester,https://twitter.com/jontester,jontester,08/21/1956,0,White,6,B.A.; Music; University of Providence; 1978,10,https://www.tester.senate.gov/,https://bioguide.congress.gov/search/bio/T000464 ,,
|
||||
"Thune, John",99,South Dakota,SD,0,3,0.795060855902239,01/04/2005,12/31/2022,18,0,117,71.8,28.2,2016,https://twitter.com/SenJohnThune,SenJohnThune,https://twitter.com/johnthune,johnthune,01/07/1961,0,White,7,M.B.A.; University of South Dakota; 1984,1,https://www.thune.senate.gov/,https://bioguide.congress.gov/search/bio/T000250 ,,
|
||||
"Tillis, Thom",100,North Carolina,NC,0,2,0.819146177750934,01/06/2015,12/31/2022,7.98904109589041,0,117,48.7,46.9,2020,https://twitter.com/SenThomTillis,SenThomTillis,https://twitter.com/ThomTillis,ThomTillis,08/30/1960,0,White,6,B.S.; Technology Management; University of Maryland; 1996,1,https://www.tillis.senate.gov/,https://bioguide.congress.gov/search/bio/T000476 ,,
|
||||
"Toomey, Patrick",101,Pennsylvania,PA,0,3,0.607637714921737,01/05/2011,12/31/2022,11.9945205479452,1,117,48.9,47.2,2016,https://twitter.com/SenToomey,SenToomey,https://twitter.com/pattoomey,pattoomey,11/17/1961,0,White,6,A.B.; Government; Harvard College; 1984,1,N/A,https://bioguide.congress.gov/search/bio/T000461 ,,
|
||||
"Tuberville, Tommy",102,Alabama,AL,0,2,0.808701355452043,01/03/2021,12/31/2022,1.99178082191781,0,117,60.1,39.7,2020,https://twitter.com/SenTuberville,SenTuberville,https://twitter.com/TTuberville,TTuberville,09/18/1954,0,White,6,"B.S., physical education, Southern Arkansas University, 1976",5,https://www.tuberville.senate.gov/,https://bioguide.congress.gov/search/bio/T000278 ,,
|
||||
"Van Hollen, Chris",103,Maryland,MD,1,3,0.117646768842011,01/03/2017,12/31/2022,5.99452054794521,0,117,60.4,36.4,2016,https://twitter.com/ChrisVanHollen,ChrisVanHollen,N/A,N/A,01/10/1959,0,White,8,J.D.; Georgetown university; 1990,2,https://www.vanhollen.senate.gov/,https://bioguide.congress.gov/search/bio/V000128,,
|
||||
"Warner, Mark R.",104,Virginia,VA,1,2,0.33022168507113,01/06/2009,12/31/2022,13.9917808219178,0,117,56,44,2020,https://twitter.com/SenatorWarner,SenatorWarner,https://twitter.com/MarkWarner,MarkWarner,12/15/1954,0,White,8,J.D.; Harvard Law School; 1980,1,https://www.warner.senate.gov/,https://bioguide.congress.gov/search/bio/W000805 ,,
|
||||
"Warnock, Raphael G.",105,Georgia,GA,1,3,0.464158242867696,01/20/2021,12/31/2022,1.94520547945205,0,117,51,49,2020,https://twitter.com/SenatorWarnock,SenatorWarnock,https://twitter.com/ReverendWarnock,ReverendWarnock,07/23/1969,0,African-American,8,PhD in Philosophy; Union Theological Seminary; ,8,https://www.warnock.senate.gov/,https://bioguide.congress.gov/search/bio/W000790,,
|
||||
"Warren, Elizabeth",106,Massachusetts,MA,1,1,0.0583875007437665,01/03/2013,12/31/2022,9.9972602739726,0,117,60.4,36.2,2018,https://twitter.com/SenWarren,SenWarren,https://twitter.com/ewarren,ewarren,06/22/1949,1,White,8,J.D.; Rutgers University; 1976,2,https://www.warren.senate.gov/,https://bioguide.congress.gov/search/bio/W000817 ,,
|
||||
"Whitehouse, Sheldon",107,Rhode Island,RI,1,1,0.124737669119195,01/04/2007,12/31/2022,16,0,117,61.6,38.4,2018,https://twitter.com/SenWhitehouse,SenWhitehouse,N/A,N/A,10/20/1955,0,White,8,J.D.; University of Virginia; 1982,2,https://www.whitehouse.senate.gov/,https://bioguide.congress.gov/search/bio/W000802,,
|
||||
"Wicker, Roger F.",108,Mississippi,MS,0,1,0.763788502839721,12/31/2007,12/31/2022,15.0109589041096,0,117,58.5,39.5,2018,https://twitter.com/SenatorWicker,SenatorWicker,https://twitter.com/RogerWicker,RogerWicker,07/05/1951,0,White,8,J.D.; University of Mississippi; 1975,2,https://www.wicker.senate.gov/,https://bioguide.congress.gov/search/bio/W000437,,
|
||||
"Wyden, Ron",109,Oregon,OR,1,3,0.0591413132623803,02/05/1996,12/31/2022,26.9205479452055,0,117,56.7,33.6,2016,https://twitter.com/RonWyden,RonWyden,N/A,N/A,05/03/1949,0,White,8,J.D.; University of Oregon; 1974,2,https://www.wyden.senate.gov/,https://bioguide.congress.gov/search/bio/W000779,,
|
||||
"Young, Todd",110,Indiana,IN,0,3,0.677696674158218,01/05/2011,12/31/2022,11.9945205479452,1,117,52.1,42.4,2016,https://twitter.com/SenToddYoung,SenToddYoung,https://twitter.com/ToddYoungIN,ToddYoungIN,08/24/1972,0,White,8,J.D.; Robert H. McKinney; 2006,2,https://www.young.senate.gov/,https://bioguide.congress.gov/search/bio/Y000064,,
8
data/OUT/.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
/ALL-SENATORS-TWEETS.csv
/Pretest-Prep.csv
/Pretest-Results.csv
/Pretest-SENATORS-TWEETS.csv
/SenatorsTweets-Final.csv
/SenatorsTweets-OnlyCov.csv
/Tweets-Classified-Prep.csv
/Tweets-Stub.csv
0
data/OUT/.gitkeep
Normal file
3
data/OUT/graphs/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
/Timeline.png
/Wordcloud-All.png
/Wordcloud-Cov.png
89
funs/CleanTweets.py
Normal file
@ -0,0 +1,89 @@
import re
import string


def preprocess_roberta(text): # https://huggingface.co/cardiffnlp/twitter-roberta-base-sep2022
    preprocessed_text = []
    for t in text.split():
        if len(t) > 1:
            t = '@user' if t[0] == '@' and t.count('@') == 1 else t
            t = 'http' if t.startswith('http') else t
        preprocessed_text.append(t)
    return ' '.join(preprocessed_text)


def remove_URL(text):
    try:
        url = re.compile(r'https?://\S+|www\.\S+')
    except: print(text)
    return url.sub(r'', text)


def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', text)


def remove_punct(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)


def remove_nonascii(text):
    return re.sub(r'[^\x00-\x7F]+', '', text)


def remove_spec(text):
    text = re.sub(r'&amp;?', r'and', text)
    text = re.sub(r'&lt;', r'<', text)
    return re.sub(r'&gt;', r'>', text)


def remove_spaces(text):  # also new line chars and to lower case
    text = re.sub(r'&lt;', r'<', text)
    text = " ".join(text.splitlines())  # remove newline characters
    text = text.lower()
    text = text.strip()
    return re.sub(r'\s{2,}', ' ', text)


def remove_retw(text):
    text = re.sub(r'(RT|rt)[ ]*@[ ]*[\S]+', '', text)
    return re.sub(r'@[\S]+', '', text)


def preprocess_text(text):
    text = remove_URL(text)
    text = remove_emoji(text)
    text = remove_html(text)
    text = remove_punct(text)
    text = remove_nonascii(text)
    text = remove_spec(text)
    text = remove_spaces(text)
    text = remove_retw(text)
    return text


def preprocess_text_series(series):
    series = series.apply(remove_URL)
    series = series.apply(remove_emoji)
    series = series.apply(remove_html)
    series = series.apply(remove_punct)
    series = series.apply(remove_nonascii)
    series = series.apply(remove_spec)
    series = series.apply(remove_spaces)
    series = series.apply(remove_retw)
    return series


# Check all functions:
input_text = """
Check out this amazing website: https://www.example.com! 😃
<html>This is an HTML tag.</html>
RT @user123: Just received a package from @companyXYZ. It's awesome! 📦
This is a test text with lots of punctuations!!! Can't wait to see more...
"""
processed_text = preprocess_text(input_text)
# print(processed_text)
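A minimal usage sketch (not part of the diff) showing how the helpers above could be applied to a tweet column; the sample tweets are made up, and the sys.path setup mirrors the scripts further down.

import sys
sys.path.insert(1, "/home/michael/Documents/PS/Data/collectTweets/funs")  # path as used in the scripts below
import pandas as pd
import CleanTweets

# two made-up example tweets
df = pd.DataFrame({"rawContent": ["RT @someone: Masks work! https://example.com",
                                  "Vaccines &amp; boosters are available <b>now</b>"]})
df["cleanContent"] = CleanTweets.preprocess_text_series(df["rawContent"])
print(df["cleanContent"].tolist())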
@ -63,4 +63,55 @@ def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix,
    # save short csv
    tweet_df.to_csv(csv_path, encoding='utf-8')
    # sleep 1 second to not get blocked because of excessive requests
    time.sleep(0.5)
    time.sleep(0.5)


def getHandles(di):
    """grabs accounts from senators-raw.csv

    Args:
        di (str): path to senators-raw.csv

    Returns:
        list: list containing str of senator account handles
    """
    accounts = pd.read_csv(f"{di}senators-raw.csv")["twitter_handle"].tolist()
    alt_accounts = pd.read_csv(f"{di}senators-raw.csv")["alt_handle"].tolist()
    alt_accounts = [x for x in alt_accounts if str(x) != 'nan']  # remove empty alt_accounts fields
    accounts.extend(alt_accounts)
    return accounts


def printHandles(accounts):
    """returns string with all accounts in a readable way.

    Args:
        accounts (list): list of str with handles

    Returns:
        str: containing text that can be written to txtfile
    """
    txt = ["Accounts to be scraped:\n"]
    for i, acc in enumerate(accounts):  # print 5 accounts per line
        txt.append(f"{acc:^17}")  # twitter handle max length = 15 chars
        if i % 5 == 4:
            txt.append(" \n")
    txt.append(f"\n{i} accounts in total.")
    return ''.join(txt)


def scrapeUsers(handle, userDFColumns, maxTweets=1):
    currentTime = datetime.now()
    userList = []
    print(f'{currentTime:<30} Fetching: {handle:>15}')
    query = f'from:{handle}'

    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        if i > maxTweets:
            break
        # Get user data and append to singleUserList
        userList = []
        for col in userDFColumns:
            singleUser = eval(f'tweet.user.{col}')
            userList.append(singleUser)

    # Create dataframe using userList and userDFColumns
    #df = pd.DataFrame(userList, columns=userDFColumns)
    return userList
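A hypothetical usage sketch for the two helpers above (not part of the diff); it assumes these functions are importable and that data/IN/senators-raw.csv provides the twitter_handle and alt_handle columns named in the docstrings.

accounts = getHandles("data/IN/")   # list of senator handles plus non-empty alt handles
print(printHandles(accounts))       # readable, five-per-line overview of what will be scraped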
0
log/.gitkeep
Normal file
@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.39025546515679493,0.40877932761593355,0.9103260869565217,0:10:21,0:00:40
2,0.3057803610952067,0.3502063500978377,0.9103260869565217,0:10:53,0:00:43
3,0.17910970049364833,0.27903796154904464,0.9375,0:10:30,0:00:38
4,0.09279396105943587,0.41342766528301267,0.904891304347826,0:11:03,0:00:43
5,0.06132459050129317,0.4468563502887264,0.9239130434782609,0:12:07,0:00:44
6,0.04195396880810895,0.4350045176675928,0.9266304347826086,0:11:21,0:00:40
@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.6699380816093513,0.6216431430407933,0.6964285714285714,0:01:03,0:00:02
2,0.6649796058024678,0.621175297669002,0.6964285714285714,0:01:03,0:00:01
3,0.642247314964022,0.6377243144171578,0.6964285714285714,0:01:05,0:00:02
4,0.6300328698541436,0.6038827853543418,0.6964285714285714,0:01:04,0:00:02
5,0.544977219509227,0.6619421115943364,0.625,0:01:02,0:00:02
6,0.3951783587357828,0.48477122613361906,0.7857142857142857,0:01:05,0:00:01
@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.5610552686641376,0.4569096086310089,0.9116022099447514,0:37:20,0:00:31
2,0.43647773836513126,0.5441495520680196,0.9005524861878453,0:36:14,0:00:30
3,0.288773139899344,0.43471020716692715,0.9392265193370166,0:36:10,0:00:29
4,0.19330878817686287,0.4555162174395349,0.9281767955801105,0:36:17,0:00:30
5,0.09109889855869348,0.5060150003684702,0.9281767955801105,0:36:13,0:00:30
6,0.05734757932275739,0.6043995772428771,0.9226519337016574,0:36:11,0:00:31
@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.21681843259712502,0.0005426188472483773,1.0,0:01:13,0:00:02
2,0.00016121647037353423,0.0002873415878639207,1.0,0:01:12,0:00:02
3,6.752021149355535e-05,0.00024319994372490328,1.0,0:01:12,0:00:02
4,4.7950222591787355e-05,0.00022139604243420763,1.0,0:01:13,0:00:02
5,3.99839740138679e-05,0.00021302999493855168,1.0,0:01:11,0:00:02
6,3.5356899656214995e-05,0.00020912183117616223,1.0,0:01:13,0:00:02
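The four hunks above are per-epoch fine-tuning logs (training loss, validation loss, validation accuracy, timings). A hedged sketch of how one such log could be visualized follows; the file name "log/training_stats.csv" is an assumption, since the rendering does not show which log file each hunk belongs to.

import pandas as pd
import matplotlib.pyplot as plt

stats = pd.read_csv("log/training_stats.csv")  # assumed file name for one of the logs above
plt.plot(stats["epoch"], stats["Training Loss"], label="training loss")
plt.plot(stats["epoch"], stats["Valid. Loss"], label="validation loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss-curve.png")  # output name is made up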
135
preTestClassification.py
Normal file
@ -0,0 +1,135 @@
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset


#%%
# prepare
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"

# Name of new datafile generated
senCSVc = "Tweets-Stub.csv"

# Name of pretest files
preTestIDsFake = "pretest-tweets_fake.txt"
preTestIDsNot = "pretest-tweets_not_fake.txt"

# Name of pretest datafile
senCSVPretest = "Pretest.csv"
senCSVPretestPrep = "Pretest-Prep.csv"
senCSVPretestResult = "Pretest-Results.csv"

# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc
senCSVcPretestPath = wd + ud + senCSVPretest
senCSVcPretestPrepPath = wd + ud + senCSVPretestPrep
senCSVcPretestResultPath = wd + ud + senCSVPretestResult
preTestIDsFakePath = wd + di + preTestIDsFake
preTestIDsNotPath = wd + di + preTestIDsNot

import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets

# List of IDs to select
# Read the IDs from a file
preTestIDsFakeL = []
preTestIDsNotL = []
with open(preTestIDsFakePath, "r") as file:
    lines = file.readlines()
    for line in lines:
        tid = line.strip()  # Remove the newline character
        preTestIDsFakeL.append(tid)
with open(preTestIDsNotPath, "r") as file:
    lines = file.readlines()
    for line in lines:
        tid = line.strip()  # Remove the newline character
        preTestIDsNotL.append(tid)

# Select rows based on the IDs
df = pd.read_csv(senCSVPath, dtype=(object))
#%%
# Create pretest dataframe
dfPreTest = df[df['id'].isin(preTestIDsFakeL)].copy()
dfPreTest['fake'] = True
dfPreTest = pd.concat([dfPreTest, df[df['id'].isin(preTestIDsNotL)]], ignore_index=True)
dfPreTest['fake'] = dfPreTest['fake'].fillna(False)

#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")

# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert

dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(CleanTweets.preprocess_text)

#%%
timeStart = datetime.now()  # start counting execution time

max_length = 128
dfPreTest['input_ids'] = dfPreTest['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()

# %%
dfPreTest.to_csv(senCSVcPretestPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])


#%%
dataset = load_dataset("csv", data_files=senCSVcPretestPrepPath)

# %%
results = pipe(KeyDataset(dataset, "text"))
# %%
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
#    print(out)

#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    output_labels.append(out['label'])
    output_score.append(out['score'])
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
# Exactly the same output as before, but the content are passed
# as batches to the model
# %%
dfPreTest['output_label'] = output_labels
dfPreTest['output_score'] = output_score

timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
timePerTweet = timeTotal / 96

print(f"Total classification execution time: {timeTotal} seconds")
print(f"Time per tweet classification: {timePerTweet}")
print(f"Estimated time for full classification of tweets: {timePerTweet*50183}")

# %%
dfPreTest.to_csv(senCSVcPretestResultPath, encoding='utf-8')

# %%
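Since the pretest frame carries a hand-coded boolean `fake` flag, a natural follow-up is to compare it with the model output. The sketch below is not part of the diff; the string labels emitted by the model, and therefore the mapping to booleans, are an assumption.

# Hypothetical accuracy check on the pretest sample; label strings are assumed, not taken from the model card.
label_map = {"fake": True, "real": False}
dfPreTest["pred_fake"] = dfPreTest["output_label"].str.lower().map(label_map)
print("Pretest accuracy:", (dfPreTest["pred_fake"] == dfPreTest["fake"]).mean())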
55
profiler.py
Normal file
@ -0,0 +1,55 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 8 14:49:02 2023

@author: michael
"""

import pandas as pd
import pandas_profiling as pp
import numpy

###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"

# Name of file that all senator data will be written to
senDataset = "senators-raw.csv"

# Name of new datafile generated
senCSVc = "SenatorsTweets-Final"
senCSVcCov = "SenatorsTweets-OnlyCov"

# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc + ".csv"
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
senSAVcPath = wd + ud + senCSV + ".sav"
senDTAcPath = wd + ud + senCSV + ".dta"
senDatasetPath = wd + di + senDataset

# forming dataframe and printing
df = pd.read_csv(senCSVPath, dtype=(object))

# forming ProfileReport and save
# as output.html file
profileAll = pp.ProfileReport(df, minimal=True)
profileAll.to_file("data/OUT/profiles/AllTweets.html")

df = pd.read_csv(senCSVcCovPath, dtype=(object))

profileAll = pp.ProfileReport(df, minimal=True)
profileAll.to_file("data/OUT/profiles/CovTweets.html")
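Note that the pandas-profiling package has since been renamed to ydata-profiling; on environments where the old import is unavailable, a drop-in sketch for the report generation above could look like this (not part of the diff).

from ydata_profiling import ProfileReport
profileAll = ProfileReport(df, minimal=True)        # same minimal report as above
profileAll.to_file("data/OUT/profiles/AllTweets.html")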
35
repairmystupidity.py
Normal file
@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 14 20:47:22 2023

@author: michael
"""
import pandas as pd

wd = "/home/michael/Documents/PS/Data/collectTweets/"

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

falsch = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct.csv"
richtig = wd + ud + "SenatorsTweets-Training.csv"
correct = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct2.csv"

# Name of new datafile generated
senCSVprep = "SenatorsTweets-Training_WORKING-COPY-prepared"

# don't change this one
falsch = pd.read_csv(falsch, dtype=(object), sep=";")
richtig = pd.read_csv(richtig, dtype=(object))

df = pd.merge(falsch,richtig[['tid','rawContent', 'date']],on='tid', how='left')
df.drop(columns=['rawContent_x', 'date_x'], inplace=True)
df.rename(columns={'tid_y':'tid', 'rawContent_y':'rawContent', 'date_y':'date'}, inplace=True)
df = df[['tid','date','topicCovid','fake','rawContent','Unnamed: 6']]
df.rename(columns={'Unnamed: 6':'comment'}, inplace=True)

df.to_csv(correct, encoding='utf-8', sep=";")
613
trainFake.py
Normal file
@ -0,0 +1,613 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat Aug 12 12:25:18 2023
|
||||
|
||||
@author: michael
|
||||
"""
|
||||
#from datasets import load_dataset
|
||||
#from transformers import Trainer
|
||||
#from transformers import AutoModelForSequenceClassification
|
||||
from transformers import AutoTokenizer
|
||||
import torch
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split # pip install scikit-learn
|
||||
|
||||
import pandas as pd
|
||||
|
||||
## Uses snippets from this guide:
|
||||
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
|
||||
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
import sys
|
||||
funs = wd+"funs"
|
||||
sys.path.insert(1, funs)
|
||||
import CleanTweets
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Training CSV dataset
|
||||
twtCSV = "SenatorsTweets-Training_WORKING-COPY-correct2"
|
||||
twtCSVtrainCovClass = "SenatorsTweets-train-CovClassification"
|
||||
twtCSVtrainFakeClass = "SenatorsTweets-train-FakeClassification"
|
||||
statsTrainingTopicClass = "statsTopicClassification-"
|
||||
|
||||
# don't change this one
|
||||
twtCSVPath = wd + ud + twtCSV + ".csv"
|
||||
twtCSVtrainCovClassPath = wd + ud + twtCSVtrainCovClass + ".csv"
|
||||
twtCSVtrainFakeClassPath = wd + ud + twtCSVtrainFakeClass + ".csv"
|
||||
|
||||
statsTrainingTopicClassPath = wd + ud + statsTrainingTopicClass
|
||||
|
||||
twtCSVtrainCovClassPathTrain = wd + ud + twtCSVtrainCovClass + "TRAIN.csv"
|
||||
twtCSVtrainFakeClassPathTrain = wd + ud + twtCSVtrainFakeClass + "TRAIN.csv"
|
||||
twtTSVtrainCovClassPathTrain = wd + ud + "cov-train.tsv"
|
||||
twtTSVtrainFakeClassPathTrain = wd + ud + "fake-train.tsv"
|
||||
|
||||
twtTSVtrainCovClassPathEval = wd + ud + "cov-eval.tsv"
|
||||
twtTSVtrainFakeClassPathEval = wd + ud + "fake-eval.tsv"
|
||||
|
||||
seed = 12355
|
||||
|
||||
# Model paths
|
||||
modCovClassPath = wd + "models/CovClass/"
|
||||
modFakeClassPath = wd + "models/FakeClass/"
|
||||
|
||||
model_name = 'digitalepidemiologylab/covid-twitter-bert-v2' # accuracy 69
|
||||
#model_name = 'justinqbui/bertweet-covid19-base-uncased-pretraining-covid-vaccine-tweets' #48
|
||||
#model_name = "cardiffnlp/tweet-topic-latest-multi"
|
||||
model_name = "bvrau/covid-twitter-bert-v2-struth"
|
||||
#model_name = "cardiffnlp/roberta-base-tweet-topic-single-all"
|
||||
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
|
||||
|
||||
# More models for fake detection:
|
||||
# https://huggingface.co/justinqbui/bertweet-covid-vaccine-tweets-finetuned
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
|
||||
max_length = 64 # max token sentence length
|
||||
|
||||
#%%
|
||||
# Create training and testing dataset
|
||||
dfTest = pd.read_csv(twtCSVPath, dtype=(object), delimiter=";")
|
||||
|
||||
#dfTest = dfTest[:-900] # remove last 800 rows
|
||||
#dfTest = dfTest.iloc[:,:-3] # remove last 800 rows
|
||||
|
||||
dfTest['text'] = dfTest['rawContent'].apply(CleanTweets.preprocess_roberta)
|
||||
|
||||
dfTest.drop(columns=['rawContent'], inplace=True)
|
||||
|
||||
# Only keep tweets that are longer than 3 words
|
||||
dfTest['tweet_proc_length'] = [len(text.split(' ')) for text in dfTest['text']]
|
||||
dfTest['tweet_proc_length'].value_counts()
|
||||
dfTest = dfTest[dfTest['tweet_proc_length']>3]
|
||||
dfTest = dfTest.drop_duplicates(subset=['text'])
|
||||
dfTest = dfTest.drop(columns=['date', 'Unnamed: 0'])
|
||||
|
||||
# Create datasets for each classification
|
||||
dfCovClass = dfTest
|
||||
dfFakeClass = dfTest
|
||||
dfCovClass = dfCovClass.drop(columns=['fake']) # fake column not neeeded in covid topic classification data
|
||||
dfFakeClass = dfFakeClass[dfFakeClass['topicCovid']=='True'].drop(columns=['topicCovid']) # topicCovid column not neeeded in covid topic classification data
|
||||
|
||||
#type_map = {'Covid tweet': 'covid tweets', 'Noncovid tweet': 'noncovid tweet'}
|
||||
dfCovClass.rename(index = str, columns={'topicCovid': 'labels', 'tid': 'id'}, inplace = True)
|
||||
dfCovClass.labels = dfCovClass.labels.replace({"True": 'Covid', "False": 'NonCovid'})
|
||||
|
||||
#type_map = {'fake news tweet': 'fake news tweet', 'non-fake-news-tweet': 'non-fake-news-tweet'}
|
||||
dfFakeClass.rename(index = str, columns={'fake': 'labels', 'tid': 'id'}, inplace = True)
|
||||
|
||||
#%%
|
||||
# Tokenize tweets
|
||||
dfCovClass = dfCovClass[dfCovClass['labels'].notna()]
|
||||
dfFakeClass['labels'].replace({'Check': '','check': '', 'FALSE':''}, inplace=True)
|
||||
dfFakeClass = dfFakeClass[dfFakeClass['labels'].notna()]
|
||||
dfCovClass['input_ids'] = dfCovClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
|
||||
dfFakeClass['input_ids'] = dfFakeClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
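# Note (hedged): these precomputed 'input_ids' columns are not read again below;
# the PandasDataset defined later re-tokenizes each row's 'text' on the fly, so the
# columns mainly serve as a quick sanity check that the tokenizer handles every tweet.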
|
||||
|
||||
def encode_labels(label):
|
||||
if label == 'Covid':
|
||||
return 1
|
||||
elif label == 'NonCovid':
|
||||
return 0
|
||||
elif label == 'False':
|
||||
return 1
|
||||
elif label == 'True':
|
||||
return 0
|
||||
return 0
|
||||
dfCovClass['labels_encoded'] = dfCovClass['labels'].apply(encode_labels)
|
||||
dfFakeClass['labels_encoded'] = dfFakeClass['labels'].apply(encode_labels)
|
||||
dfFakeClass = dfFakeClass[dfFakeClass['labels']!=""]
|
||||
#dfFakeClass = dfFakeClass[(dfFakeClass['labels']=="Fake") | (dfFakeClass['labels']=="True")]
|
||||
|
||||
# get n of classes
|
||||
print("# of Non-Covid tweets (coded 0):")
|
||||
print(dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
|
||||
# 62 non-covid tweets, disproportionate sample for training has to be 124 tweets
|
||||
|
||||
print("# of Fake-news tweets (coded 1):")
|
||||
print(dfFakeClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
|
||||
|
||||
# create disproportionate sample - 50/50 of both
|
||||
#dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique()
|
||||
#dfCovClass = dfCovClass.groupby('labels_encoded', group_keys=False).apply(lambda x: x.sample(164, random_state=seed))
|
||||
# After a lot of tests, it seems that a sample in which non-fake-news tweets are overrepresented leads to better results.
# Because of this, plus performance limitations and time constraints, group 1 (covid topic) will be overrepresented (twice as many), which still doesn't reflect the real proportions of ~10/1.
# (A hedged sketch of building such an oversampled frame follows the commented-out block below.)
|
||||
|
||||
'''dfCovClassa = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(1).sample(frac=1, replace=True).reset_index()
|
||||
dfCovClassb = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(0).sample(frac=1, replace=True).reset_index()
|
||||
dfCovClassab= pd.concat([dfCovClassa,dfCovClassb])
|
||||
dfCovClassab.reset_index(inplace=True)
|
||||
dfCovClass_train, dfCovClass_test = train_test_split(dfCovClassab, test_size=0.1, random_state=seed, stratify=dfCovClassab['labels_encoded'])
|
||||
'''
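# Hedged sketch (an assumption, not part of the pipeline as actually run): one way to build
# an oversampled training frame in which one class has roughly twice as many rows as the other,
# as discussed above. grpSmall, grpLarge and dfFakeOversampled are new, hypothetical names;
# everything else reuses this script's objects, and the exact 2:1 ratio is illustrative.
'''
grpSmall = dfFakeClass[dfFakeClass['labels_encoded'] == 1]
grpLarge = dfFakeClass[dfFakeClass['labels_encoded'] == 0].sample(n=2 * len(grpSmall), replace=True, random_state=seed)
dfFakeOversampled = pd.concat([grpLarge, grpSmall]).sample(frac=1, random_state=seed).reset_index(drop=True)
'''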
|
||||
|
||||
# create training and validation samples
|
||||
dfFakeClass_train, dfFakeClass_test = train_test_split(dfFakeClass, test_size=0.1, random_state=seed, stratify=dfFakeClass['labels_encoded'])
|
||||
|
||||
# reset index and drop unnecessary columns
|
||||
dfFakeClass_train.reset_index(drop=True, inplace=True)
|
||||
dfFakeClass_train.drop(inplace=True, columns=['tweet_proc_length'])
|
||||
dfFakeClass_train.groupby('labels_encoded', group_keys=False)['id'].nunique()
|
||||
|
||||
dfFakeClass_test.reset_index(drop=True, inplace=True)
|
||||
dfFakeClass_test.drop(inplace=True, columns=['tweet_proc_length'])
|
||||
dfFakeClass_test.groupby('labels_encoded', group_keys=False)['id'].nunique()
|
||||
|
||||
# save dfs as csvs and tsvs, for training and validation
|
||||
# covid classification datafiles
|
||||
# rows 0-41 = noncovid, 42-81 covid, therefore:
|
||||
#dfCovClass = dfCovClass.drop(columns=['tweet_proc_length'])
|
||||
#dfCovClass.reset_index(inplace=True, drop=True)
|
||||
#dfCovClass.loc[np.r_[0:31, 42:71], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPathTrain, encoding='utf-8', sep=";")
|
||||
#dfCovClass.loc[np.r_[0:31, 42:72], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathTrain, encoding='utf-8', sep="\t")
|
||||
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPath, encoding='utf-8', sep=";")
|
||||
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathEval, encoding='utf-8', sep="\t")
|
||||
|
||||
# fake news classification datafiles
|
||||
#dfFakeClass = dfFakeClass.drop(columns=['tweet_proc_length'])
|
||||
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPathTrain, encoding='utf-8', sep=";")
|
||||
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathTrain, encoding='utf-8', sep="\t")
|
||||
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPath, encoding='utf-8', sep=";")
|
||||
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathEval, encoding='utf-8', sep="\t")
|
||||
|
||||
#%%
|
||||
# Prepare trainer
|
||||
#from transformers import TrainingArguments
|
||||
|
||||
#training_args = TrainingArguments(
|
||||
# report_to = 'wandb',
|
||||
# output_dir=wd+'results', # output directory/
|
||||
# overwrite_output_dir = True,
|
||||
# num_train_epochs=6, # total number of training epochs
|
||||
# per_device_train_batch_size=8, # batch size per device during training
|
||||
# per_device_eval_batch_size=16, # batch size for evaluation
|
||||
# learning_rate=2e-5,
|
||||
# warmup_steps=1000, # number of warmup steps for learning rate scheduler
|
||||
# weight_decay=0.01, # strength of weight decay
|
||||
# logging_dir='./logs3', # directory for storing logs
|
||||
# logging_steps=1000,
|
||||
# evaluation_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
# load_best_model_at_end=True
|
||||
#)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
from transformers import BertForSequenceClassification, AdamW#, BertConfig
|
||||
#from torch.utils.data import TensorDataset, random_split
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
||||
|
||||
"""
|
||||
train_dataset = load_dataset('csv', data_files={'train': twtCSVtrainCovClassPathTrain}, encoding = "utf-8")
|
||||
train_dataset = train_dataset['train']
|
||||
eval_dataset = load_dataset('csv', data_files={'test': twtCSVtrainCovClassPath}, encoding = "utf-8")
|
||||
eval_dataset = eval_dataset['test']
|
||||
"""
|
||||
batch_size = 1
|
||||
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
class PandasDataset(Dataset):
|
||||
def __init__(self, dataframe, tokenizer, max_length):
|
||||
self.dataframe = dataframe
|
||||
self.tokenizer = tokenizer
|
||||
self.max_length = max_length
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dataframe)
|
||||
|
||||
def __getitem__(self, index):
|
||||
row = self.dataframe.iloc[index]
|
||||
text = row['text']
|
||||
labels = row['labels_encoded']
|
||||
|
||||
encoded = self.tokenizer(text, max_length=self.max_length, padding="max_length", truncation=True)
|
||||
input_ids = torch.tensor(encoded['input_ids'])
|
||||
attention_mask = torch.tensor(encoded['attention_mask'])
|
||||
|
||||
return {
|
||||
'input_ids': input_ids,
|
||||
'attention_mask': attention_mask,
|
||||
'labels': torch.tensor(labels) # Assuming labels are already encoded
|
||||
}
|
||||
|
||||
|
||||
train_dataset = PandasDataset(dfFakeClass_train, tokenizer, max_length)
|
||||
train_dataloader = DataLoader(
|
||||
train_dataset,
|
||||
sampler=RandomSampler(train_dataset),
|
||||
batch_size=batch_size
|
||||
)
|
||||
|
||||
eval_dataset = PandasDataset(dfFakeClass_test, tokenizer, max_length)
|
||||
validation_dataloader = DataLoader(
|
||||
eval_dataset,
|
||||
sampler=SequentialSampler(eval_dataset),
|
||||
batch_size=batch_size
|
||||
)
|
||||
|
||||
for idx, batch in enumerate(train_dataloader):
|
||||
print('Batch index: ', idx)
|
||||
print('Batch size: ', batch['input_ids'].size()) # Access 'input_ids' field
|
||||
print('Batch label: ', batch['labels']) # Access 'labels' field
|
||||
break
|
||||
|
||||
model = BertForSequenceClassification.from_pretrained(
|
||||
model_name,
|
||||
num_labels = 2, # The number of output labels--2 for binary classification.
|
||||
# You can increase this for multi-class tasks.
|
||||
output_attentions = False, # Whether the model returns attentions weights.
|
||||
output_hidden_states = False, # Whether the model returns all hidden-states.
|
||||
)
|
||||
|
||||
#trainer = Trainer(
|
||||
# model=model, # the instantiated 🤗 Transformers model to be trained
|
||||
# args=training_args, # training arguments, defined above
|
||||
# train_dataset=train_dataset, # training dataset
|
||||
# eval_dataset=eval_dataset # evaluation dataset
|
||||
#)
|
||||
|
||||
|
||||
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
|
||||
# I believe the 'W' stands for 'Weight Decay fix'
|
||||
optimizer = AdamW(model.parameters(),
|
||||
lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
|
||||
eps = 1e-8 # args.adam_epsilon - default is 1e-8.
|
||||
)
|
||||
|
||||
from transformers import get_linear_schedule_with_warmup
|
||||
|
||||
# Number of training epochs. The BERT authors recommend between 2 and 4.
|
||||
# We chose to run for 6
|
||||
epochs = 6
|
||||
|
||||
# Total number of training steps is [number of batches] x [number of epochs].
|
||||
# (Note that this is not the same as the number of training samples).
|
||||
total_steps = len(train_dataloader) * epochs
|
||||
|
||||
# Create the learning rate scheduler.
|
||||
scheduler = get_linear_schedule_with_warmup(optimizer,
|
||||
num_warmup_steps = 0, # Default value in run_glue.py
|
||||
num_training_steps = total_steps)
|
||||
|
||||
# Function to calculate the accuracy of our predictions vs labels
|
||||
def flat_accuracy(preds, labels):
|
||||
pred_flat = np.argmax(preds, axis=1).flatten()
|
||||
labels_flat = labels.flatten()
|
||||
return np.sum(pred_flat == labels_flat) / len(labels_flat)
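# Illustrative check with made-up values (not from the data):
# preds = np.array([[0.1, 0.9], [2.0, -1.0]]) has row-wise argmax [1, 0], so with
# labels = np.array([1, 1]) the call flat_accuracy(preds, labels) returns 0.5.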
|
||||
|
||||
import time
|
||||
import datetime
|
||||
|
||||
def format_time(elapsed):
|
||||
'''
|
||||
Takes a time in seconds and returns a string hh:mm:ss
|
||||
'''
|
||||
# Round to the nearest second.
|
||||
elapsed_rounded = int(round((elapsed)))
|
||||
|
||||
# Format as hh:mm:ss
|
||||
return str(datetime.timedelta(seconds=elapsed_rounded))
|
||||
|
||||
import random
|
||||
|
||||
# This training code is based on the `run_glue.py` script here:
|
||||
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
|
||||
|
||||
# Set the seed value all over the place to make this reproducible.
|
||||
seed_val = 12355
|
||||
|
||||
# If there's a GPU available...
|
||||
if torch.cuda.is_available():
|
||||
|
||||
# Tell PyTorch to use the GPU.
|
||||
device = torch.device("cuda")
|
||||
|
||||
print('There are %d GPU(s) available.' % torch.cuda.device_count())
|
||||
|
||||
print('We will use the GPU:', torch.cuda.get_device_name(0))
|
||||
#model.cuda()
|
||||
# If not...
|
||||
else:
|
||||
print('No GPU available, using the CPU instead.')
|
||||
device = torch.device("cpu")
|
||||
|
||||
device = torch.device("cpu") # note: this line forces CPU for the whole run, overriding the GPU selection above
|
||||
|
||||
random.seed(seed_val)
|
||||
np.random.seed(seed_val)
|
||||
torch.manual_seed(seed_val)
|
||||
torch.cuda.manual_seed_all(seed_val)
|
||||
|
||||
#%%
|
||||
# Start training
|
||||
# We'll store a number of quantities such as training and validation loss,
|
||||
# validation accuracy, and timings.
|
||||
training_stats = []
|
||||
|
||||
# Measure the total training time for the whole run.
|
||||
total_t0 = time.time()
|
||||
|
||||
# For each epoch...
|
||||
for epoch_i in range(0, epochs):
|
||||
# ========================================
|
||||
# Training
|
||||
# ========================================
|
||||
|
||||
# Perform one full pass over the training set.
|
||||
|
||||
print("")
|
||||
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
|
||||
print('{:>5,} training batches will be processed per epoch.'.format(len(train_dataloader)))
|
||||
print('Training...')
|
||||
|
||||
# Measure how long the training epoch takes.
|
||||
t0 = time.time()
|
||||
model.to(device)
|
||||
# Reset the total loss for this epoch.
|
||||
total_train_loss = 0
|
||||
# Put the model into training mode. Don't be misled--the call to
|
||||
# `train` just changes the *mode*, it doesn't *perform* the training.
|
||||
# `dropout` and `batchnorm` layers behave differently during training
|
||||
# vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
|
||||
model.train()
|
||||
|
||||
# For each batch of training data...
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
|
||||
# Progress update every 10 batches.
|
||||
if step % 10 == 0 and not step == 0:
|
||||
# Calculate elapsed time as hh:mm:ss.
|
||||
elapsed = format_time(time.time() - t0)
|
||||
|
||||
# Report progress.
|
||||
print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
|
||||
|
||||
# Unpack this training batch from our dataloader.
|
||||
#
|
||||
# As we unpack the batch, we'll also copy each tensor to the GPU using the
|
||||
# `to` method.
|
||||
#
|
||||
# `batch` is a dict of three pytorch tensors, keyed by name:
# 'input_ids', 'attention_mask' and 'labels'
|
||||
print("Batch keys:", batch.keys())
|
||||
b_input_ids = batch['input_ids'].to(device)
|
||||
b_input_mask = batch['attention_mask'].to(device)
|
||||
b_labels = batch['labels'].to(device)
|
||||
|
||||
# Always clear any previously calculated gradients before performing a
|
||||
# backward pass. PyTorch doesn't do this automatically because
|
||||
# accumulating the gradients is "convenient while training RNNs".
|
||||
# (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
|
||||
model.zero_grad()
|
||||
|
||||
# Perform a forward pass (evaluate the model on this training batch).
|
||||
# The documentation for this `model` function is here:
|
||||
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
|
||||
# It returns different numbers of parameters depending on what arguments
|
||||
# are given and what flags are set. For our usage here, it returns
|
||||
# the loss (because we provided labels) and the "logits"--the model
|
||||
# outputs prior to activation.
|
||||
output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
|
||||
loss = output[0]
|
||||
logits = output[1]
|
||||
|
||||
# Accumulate the training loss over all of the batches so that we can
|
||||
# calculate the average loss at the end. `loss` is a Tensor containing a
|
||||
# single value; the `.item()` function just returns the Python value
|
||||
# from the tensor.
|
||||
total_train_loss += loss.item()
|
||||
|
||||
# Perform a backward pass to calculate the gradients.
|
||||
loss.backward()
|
||||
|
||||
# Clip the norm of the gradients to 1.0.
|
||||
# This is to help prevent the "exploding gradients" problem.
|
||||
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
|
||||
|
||||
# Update parameters and take a step using the computed gradient.
|
||||
# The optimizer dictates the "update rule"--how the parameters are
|
||||
# modified based on their gradients, the learning rate, etc.
|
||||
optimizer.step()
|
||||
|
||||
# Update the learning rate.
|
||||
scheduler.step()
|
||||
|
||||
# Calculate the average loss over all of the batches.
|
||||
avg_train_loss = total_train_loss / len(train_dataloader)
|
||||
|
||||
# Measure how long this epoch took.
|
||||
training_time = format_time(time.time() - t0)
|
||||
|
||||
print("")
|
||||
print(" Average training loss: {0:.2f}".format(avg_train_loss))
|
||||
print(" Training epcoh took: {:}".format(training_time))
|
||||
|
||||
# ========================================
|
||||
# Validation
|
||||
# ========================================
|
||||
# After the completion of each training epoch, measure our performance on
|
||||
# our validation set.
|
||||
|
||||
print("")
|
||||
print("Running Validation...")
|
||||
|
||||
t0 = time.time()
|
||||
|
||||
# Put the model in evaluation mode--the dropout layers behave differently
|
||||
# during evaluation.
|
||||
model.eval()
|
||||
|
||||
# Tracking variables
|
||||
total_eval_accuracy = 0
|
||||
total_eval_loss = 0
|
||||
nb_eval_steps = 0
|
||||
|
||||
# Evaluate data for one epoch
|
||||
for batch in validation_dataloader:
|
||||
|
||||
# Unpack this training batch from our dataloader.
|
||||
#
|
||||
# As we unpack the batch, we'll also copy each tensor to the GPU using
|
||||
# the `to` method.
|
||||
#
|
||||
# `batch` is a dict of three pytorch tensors, keyed by name:
# 'input_ids', 'attention_mask' and 'labels'
|
||||
b_input_ids = batch['input_ids'].to(device)
|
||||
b_input_mask = batch['attention_mask'].to(device)
|
||||
b_labels = batch['labels'].to(device)
|
||||
|
||||
# Tell pytorch not to bother with constructing the compute graph during
|
||||
# the forward pass, since this is only needed for backprop (training).
|
||||
with torch.no_grad():
|
||||
|
||||
# Forward pass, calculate logit predictions.
|
||||
# token_type_ids is the same as the "segment ids", which
|
||||
# differentiates sentence 1 and 2 in 2-sentence tasks.
|
||||
# The documentation for this `model` function is here:
|
||||
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
|
||||
# Get the "logits" output by the model. The "logits" are the output
|
||||
# values prior to applying an activation function like the softmax.
|
||||
output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
|
||||
loss = output[0]
|
||||
logits = output[1]
|
||||
|
||||
# Accumulate the validation loss.
|
||||
total_eval_loss += loss.item()
|
||||
|
||||
# Move logits and labels to CPU
|
||||
logits = logits.detach().cpu().numpy()
|
||||
label_ids = b_labels.to('cpu').numpy()
|
||||
|
||||
# Calculate the accuracy for this batch of test sentences, and
|
||||
# accumulate it over all batches.
|
||||
total_eval_accuracy += flat_accuracy(logits, label_ids)
|
||||
|
||||
|
||||
# Report the final accuracy for this validation run.
|
||||
avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
|
||||
print(" Accuracy: {0:.2f}".format(avg_val_accuracy))
|
||||
|
||||
# Calculate the average loss over all of the batches.
|
||||
avg_val_loss = total_eval_loss / len(validation_dataloader)
|
||||
|
||||
# Measure how long the validation run took.
|
||||
validation_time = format_time(time.time() - t0)
|
||||
|
||||
print(" Validation Loss: {0:.2f}".format(avg_val_loss))
|
||||
print(" Validation took: {:}".format(validation_time))
|
||||
|
||||
# Record all statistics from this epoch.
|
||||
training_stats.append(
|
||||
{
|
||||
'epoch': epoch_i + 1,
|
||||
'Training Loss': avg_train_loss,
|
||||
'Valid. Loss': avg_val_loss,
|
||||
'Valid. Accur.': avg_val_accuracy,
|
||||
'Training Time': training_time,
|
||||
'Validation Time': validation_time
|
||||
}
|
||||
)
|
||||
|
||||
print("")
|
||||
print("Training complete!")
|
||||
|
||||
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
|
||||
|
||||
params = list(model.named_parameters())
|
||||
|
||||
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
|
||||
|
||||
print('==== Embedding Layer ====\n')
|
||||
|
||||
for p in params[0:5]:
|
||||
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
|
||||
|
||||
print('\n==== First Transformer ====\n')
|
||||
|
||||
for p in params[5:21]:
|
||||
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
|
||||
|
||||
print('\n==== Output Layer ====\n')
|
||||
|
||||
for p in params[-4:]:
|
||||
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
|
||||
|
||||
|
||||
import os
|
||||
|
||||
# Saving best practices: if you use default names for the model, you can reload it using from_pretrained()
|
||||
from datetime import datetime as dt
|
||||
|
||||
fTimeFormat = "%Y-%m-%d_%H-%M-%S"
|
||||
now = dt.now().strftime(fTimeFormat)
|
||||
|
||||
output_dir = modFakeClassPath + now + "/"
|
||||
|
||||
# Create output directory if needed
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
|
||||
print("Saving model to %s" % output_dir)
|
||||
|
||||
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||
# They can then be reloaded using `from_pretrained()`
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
tokenizer.save_pretrained(output_dir)
|
||||
|
||||
# Good practice: save your training arguments together with the trained model
|
||||
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))
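# Hedged sketch (assumption): how the checkpoint written above could be reloaded later for
# inference. 'reloaded_tokenizer', 'reloaded_model', 'enc' and the example tweet text are
# hypothetical names; everything else reuses objects already defined in this script.
'''
reloaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)
reloaded_model = BertForSequenceClassification.from_pretrained(output_dir)
reloaded_model.eval()
enc = reloaded_tokenizer("example tweet text", return_tensors="pt", max_length=max_length, padding="max_length", truncation=True)
with torch.no_grad():
    predicted_label_id = reloaded_model(**enc).logits.argmax(dim=-1).item()
print("Predicted label id:", predicted_label_id)
'''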
|
||||
|
||||
import pandas as pd
|
||||
|
||||
# Display floats with two decimal places.
|
||||
pd.set_option('display.precision', 2)
|
||||
|
||||
# Create a DataFrame from our training statistics.
|
||||
df_stats = pd.DataFrame(data=training_stats)
|
||||
|
||||
# Use the 'epoch' as the row index.
|
||||
df_stats = df_stats.set_index('epoch')
|
||||
|
||||
# A hack to force the column headers to wrap.
|
||||
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])
|
||||
|
||||
|
||||
# Display the table.
|
||||
df_stats
|
||||
df_stats.to_csv(output_dir + now + ".csv")
|
607
trainTopic.py
Normal file
@ -0,0 +1,607 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat Aug 12 12:25:18 2023
|
||||
|
||||
@author: michael
|
||||
"""
|
||||
#from datasets import load_dataset
|
||||
#from transformers import Trainer
|
||||
#from transformers import AutoModelForSequenceClassification
|
||||
from transformers import AutoTokenizer
|
||||
import torch
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split # pip install scikit-learn
|
||||
|
||||
import pandas as pd
|
||||
|
||||
## Uses snippets from this guide:
|
||||
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
|
||||
|
||||
###################
|
||||
# Setup directories
|
||||
# WD Michael
|
||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||
# WD Server
|
||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
||||
|
||||
import sys
|
||||
funs = wd+"funs"
|
||||
sys.path.insert(1, funs)
|
||||
import CleanTweets
|
||||
|
||||
# datafile input directory
|
||||
di = "data/IN/"
|
||||
|
||||
# Tweet-datafile output directory
|
||||
ud = "data/OUT/"
|
||||
|
||||
# Training CSV dataset
|
||||
twtCSV = "SenatorsTweets-Training_WORKING-COPY-correct2"
|
||||
twtCSVtrainCovClass = "SenatorsTweets-train-CovClassification"
|
||||
twtCSVtrainFakeClass = "SenatorsTweets-train-FakeClassification"
|
||||
statsTrainingTopicClass = "statsTopicClassification-"
|
||||
|
||||
# don't change this one
|
||||
twtCSVPath = wd + ud + twtCSV + ".csv"
|
||||
twtCSVtrainCovClassPath = wd + ud + twtCSVtrainCovClass + ".csv"
|
||||
twtCSVtrainFakeClassPath = wd + ud + twtCSVtrainFakeClass + ".csv"
|
||||
|
||||
statsTrainingTopicClassPath = wd + ud + statsTrainingTopicClass
|
||||
|
||||
twtCSVtrainCovClassPathTrain = wd + ud + twtCSVtrainCovClass + "TRAIN.csv"
|
||||
twtCSVtrainFakeClassPathTrain = wd + ud + twtCSVtrainFakeClass + "TRAIN.csv"
|
||||
twtTSVtrainCovClassPathTrain = wd + ud + "cov-train.tsv"
|
||||
twtTSVtrainFakeClassPathTrain = wd + ud + "fake-train.tsv"
|
||||
|
||||
twtTSVtrainCovClassPathEval = wd + ud + "cov-eval.tsv"
|
||||
twtTSVtrainFakeClassPathEval = wd + ud + "fake-eval.tsv"
|
||||
|
||||
seed = 12355
|
||||
|
||||
# Model paths
|
||||
modCovClassPath = wd + "models/CovClass/"
|
||||
modFakeClassPath = wd + "models/FakeClass/"
|
||||
|
||||
model_name = "bvrau/covid-twitter-bert-v2-struth"
|
||||
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
|
||||
|
||||
# More models for fake detection:
|
||||
# https://huggingface.co/justinqbui/bertweet-covid-vaccine-tweets-finetuned
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
|
||||
max_length = 64 # max token sentence length
|
||||
|
||||
#%%
|
||||
# Create training and testing dataset
|
||||
dfTest = pd.read_csv(twtCSVPath, dtype=(object), delimiter=";")
|
||||
|
||||
#dfTest = dfTest[:-900] # remove last 800 rows
|
||||
#dfTest = dfTest.iloc[:,:-3] # remove last 800 rows
|
||||
|
||||
dfTest['text'] = dfTest['rawContent'].apply(CleanTweets.preprocess_roberta)
|
||||
|
||||
dfTest.drop(columns=['rawContent'], inplace=True)
|
||||
|
||||
# Only keep tweets that are longer than 3 words
|
||||
dfTest['tweet_proc_length'] = [len(text.split(' ')) for text in dfTest['text']]
|
||||
dfTest['tweet_proc_length'].value_counts()
|
||||
dfTest = dfTest[dfTest['tweet_proc_length']>3]
|
||||
dfTest = dfTest.drop_duplicates(subset=['text'])
|
||||
dfTest = dfTest.drop(columns=['date', 'Unnamed: 0'])
|
||||
|
||||
# Create datasets for each classification
|
||||
dfCovClass = dfTest
|
||||
dfFakeClass = dfTest
|
||||
dfCovClass = dfCovClass.drop(columns=['fake']) # fake column not needed in covid topic classification data
dfFakeClass = dfFakeClass[dfFakeClass['topicCovid']=='True'].drop(columns=['topicCovid']) # topicCovid column not needed in fake news classification data
|
||||
|
||||
#type_map = {'Covid tweet': 'covid tweets', 'Noncovid tweet': 'noncovid tweet'}
|
||||
dfCovClass.rename(index = str, columns={'topicCovid': 'labels', 'tid': 'id'}, inplace = True)
|
||||
dfCovClass.labels = dfCovClass.labels.replace({"True": 'Covid', "False": 'NonCovid'})
|
||||
|
||||
#type_map = {'fake news tweet': 'fake news tweet', 'non-fake-news-tweet': 'non-fake-news-tweet'}
|
||||
dfFakeClass.rename(index = str, columns={'fake': 'labels', 'tid': 'id'}, inplace = True)
|
||||
dfFakeClass.labels = dfFakeClass.labels.replace({"True": 'Fake', "False": 'True'})
|
||||
|
||||
#%%
|
||||
# Tokenize tweets
|
||||
dfCovClass = dfCovClass[dfCovClass['labels'].notna()]
|
||||
dfFakeClass = dfFakeClass[dfFakeClass['labels'].notna()]
|
||||
dfCovClass['input_ids'] = dfCovClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
|
||||
dfFakeClass['input_ids'] = dfFakeClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
|
||||
|
||||
def encode_labels(label):
|
||||
if label == 'Covid':
|
||||
return 1
|
||||
elif label == 'NonCovid':
|
||||
return 0
|
||||
elif label == 'Fake':
|
||||
return 1
|
||||
elif label == 'True':
|
||||
return 0
|
||||
return 0
|
||||
dfCovClass['labels_encoded'] = dfCovClass['labels'].apply(encode_labels)
|
||||
dfFakeClass['labels_encoded'] = dfFakeClass['labels'].apply(encode_labels)
|
||||
|
||||
# get n of classes
|
||||
print("# of Non-Covid tweets (coded 0):")
|
||||
print(dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
|
||||
# 62 non-covid tweets, disproportionate sample for training has to be 124 tweets
|
||||
|
||||
print("# of Fake-news tweets (coded 1):")
|
||||
print(dfFakeClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
|
||||
|
||||
# create disproportionate sample - 50/50 of both
|
||||
#dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique()
|
||||
#dfCovClass = dfCovClass.groupby('labels_encoded', group_keys=False).apply(lambda x: x.sample(164, random_state=seed))
|
||||
# After a lot of tests, it seems that a sample in which non-fake-news tweets are overrepresented leads to better results.
# Because of this, plus performance limitations and time constraints, group 1 (covid topic) will be overrepresented (twice as many), which still doesn't reflect the real proportions of ~10/1.
|
||||
|
||||
'''dfCovClassa = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(1).sample(frac=1, replace=True).reset_index()
|
||||
dfCovClassb = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(0).sample(frac=1, replace=True).reset_index()
|
||||
dfCovClassab= pd.concat([dfCovClassa,dfCovClassb])
|
||||
dfCovClassab.reset_index(inplace=True)
|
||||
dfCovClass_train, dfCovClass_test = train_test_split(dfCovClassab, test_size=0.1, random_state=seed, stratify=dfCovClassab['labels_encoded'])
|
||||
'''
|
||||
|
||||
# create training and validation samples
|
||||
dfCovClass_train, dfCovClass_test = train_test_split(dfCovClass, test_size=0.1, random_state=seed, stratify=dfCovClass['labels_encoded'])
|
||||
|
||||
# reset index and drop unnecessary columns
|
||||
dfCovClass_train.reset_index(drop=True, inplace=True)
|
||||
dfCovClass_train.drop(inplace=True, columns=['tweet_proc_length'])
|
||||
dfCovClass_train.groupby('labels_encoded', group_keys=False)['id'].nunique()
|
||||
|
||||
dfCovClass_test.reset_index(drop=True, inplace=True)
|
||||
dfCovClass_test.drop(inplace=True, columns=['tweet_proc_length'])
|
||||
dfCovClass_test.groupby('labels_encoded', group_keys=False)['id'].nunique()
|
||||
|
||||
# save dfs as csvs and tsvs, for training and validation
|
||||
# covid classification datafiles
|
||||
# rows 0-41 = noncovid, 42-81 covid, therefore:
|
||||
#dfCovClass = dfCovClass.drop(columns=['tweet_proc_length'])
|
||||
#dfCovClass.reset_index(inplace=True, drop=True)
|
||||
#dfCovClass.loc[np.r_[0:31, 42:71], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPathTrain, encoding='utf-8', sep=";")
|
||||
#dfCovClass.loc[np.r_[0:31, 42:72], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathTrain, encoding='utf-8', sep="\t")
|
||||
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPath, encoding='utf-8', sep=";")
|
||||
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathEval, encoding='utf-8', sep="\t")
|
||||
|
||||
# fake news classification datafiles
|
||||
#dfFakeClass = dfFakeClass.drop(columns=['tweet_proc_length'])
|
||||
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPathTrain, encoding='utf-8', sep=";")
|
||||
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathTrain, encoding='utf-8', sep="\t")
|
||||
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPath, encoding='utf-8', sep=";")
|
||||
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathEval, encoding='utf-8', sep="\t")
|
||||
|
||||
#%%
|
||||
# Prepare trainer
|
||||
#from transformers import TrainingArguments
|
||||
|
||||
#training_args = TrainingArguments(
|
||||
# report_to = 'wandb',
|
||||
# output_dir=wd+'results', # output directory/
|
||||
# overwrite_output_dir = True,
|
||||
# num_train_epochs=6, # total number of training epochs
|
||||
# per_device_train_batch_size=8, # batch size per device during training
|
||||
# per_device_eval_batch_size=16, # batch size for evaluation
|
||||
# learning_rate=2e-5,
|
||||
# warmup_steps=1000, # number of warmup steps for learning rate scheduler
|
||||
# weight_decay=0.01, # strength of weight decay
|
||||
# logging_dir='./logs3', # directory for storing logs
|
||||
# logging_steps=1000,
|
||||
# evaluation_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
# load_best_model_at_end=True
|
||||
#)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
from transformers import BertForSequenceClassification, AdamW#, BertConfig
|
||||
#from torch.utils.data import TensorDataset, random_split
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
||||
|
||||
"""
|
||||
train_dataset = load_dataset('csv', data_files={'train': twtCSVtrainCovClassPathTrain}, encoding = "utf-8")
|
||||
train_dataset = train_dataset['train']
|
||||
eval_dataset = load_dataset('csv', data_files={'test': twtCSVtrainCovClassPath}, encoding = "utf-8")
|
||||
eval_dataset = eval_dataset['test']
|
||||
"""
|
||||
batch_size = 1
|
||||
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
class PandasDataset(Dataset):
|
||||
def __init__(self, dataframe, tokenizer, max_length):
|
||||
self.dataframe = dataframe
|
||||
self.tokenizer = tokenizer
|
||||
self.max_length = max_length
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dataframe)
|
||||
|
||||
def __getitem__(self, index):
|
||||
row = self.dataframe.iloc[index]
|
||||
text = row['text']
|
||||
labels = row['labels_encoded']
|
||||
|
||||
encoded = self.tokenizer(text, max_length=self.max_length, padding="max_length", truncation=True)
|
||||
input_ids = torch.tensor(encoded['input_ids'])
|
||||
attention_mask = torch.tensor(encoded['attention_mask'])
|
||||
|
||||
return {
|
||||
'input_ids': input_ids,
|
||||
'attention_mask': attention_mask,
|
||||
'labels': torch.tensor(labels) # Assuming labels are already encoded
|
||||
}
|
||||
|
||||
|
||||
train_dataset = PandasDataset(dfCovClass_train, tokenizer, max_length)
|
||||
train_dataloader = DataLoader(
|
||||
train_dataset,
|
||||
sampler=RandomSampler(train_dataset),
|
||||
batch_size=batch_size
|
||||
)
|
||||
|
||||
eval_dataset = PandasDataset(dfCovClass_test, tokenizer, max_length)
|
||||
validation_dataloader = DataLoader(
|
||||
eval_dataset,
|
||||
sampler=SequentialSampler(eval_dataset),
|
||||
batch_size=batch_size
|
||||
)
|
||||
|
||||
for idx, batch in enumerate(train_dataloader):
|
||||
print('Batch index: ', idx)
|
||||
print('Batch size: ', batch['input_ids'].size()) # Access 'input_ids' field
|
||||
print('Batch label: ', batch['labels']) # Access 'labels' field
|
||||
break
|
||||
|
||||
model = BertForSequenceClassification.from_pretrained(
|
||||
model_name,
|
||||
num_labels = 2, # The number of output labels--2 for binary classification.
|
||||
# You can increase this for multi-class tasks.
|
||||
output_attentions = False, # Whether the model returns attentions weights.
|
||||
output_hidden_states = False, # Whether the model returns all hidden-states.
|
||||
)
|
||||
|
||||
#trainer = Trainer(
|
||||
# model=model, # the instantiated 🤗 Transformers model to be trained
|
||||
# args=training_args, # training arguments, defined above
|
||||
# train_dataset=train_dataset, # training dataset
|
||||
# eval_dataset=eval_dataset # evaluation dataset
|
||||
#)
|
||||
|
||||
|
||||
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
|
||||
# I believe the 'W' stands for 'Weight Decay fix'
|
||||
optimizer = AdamW(model.parameters(),
|
||||
lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
|
||||
eps = 1e-8 # args.adam_epsilon - default is 1e-8.
|
||||
)
|
||||
|
||||
from transformers import get_linear_schedule_with_warmup
|
||||
|
||||
# Number of training epochs. The BERT authors recommend between 2 and 4.
|
||||
# We chose to run for 6
|
||||
epochs = 6
|
||||
|
||||
# Total number of training steps is [number of batches] x [number of epochs].
|
||||
# (Note that this is not the same as the number of training samples).
|
||||
total_steps = len(train_dataloader) * epochs
|
||||
|
||||
# Create the learning rate scheduler.
|
||||
scheduler = get_linear_schedule_with_warmup(optimizer,
|
||||
num_warmup_steps = 0, # Default value in run_glue.py
|
||||
num_training_steps = total_steps)
|
||||
|
||||
# Function to calculate the accuracy of our predictions vs labels
|
||||
def flat_accuracy(preds, labels):
|
||||
pred_flat = np.argmax(preds, axis=1).flatten()
|
||||
labels_flat = labels.flatten()
|
||||
return np.sum(pred_flat == labels_flat) / len(labels_flat)
|
||||
|
||||
import time
|
||||
import datetime
|
||||
|
||||
def format_time(elapsed):
|
||||
'''
|
||||
Takes a time in seconds and returns a string hh:mm:ss
|
||||
'''
|
||||
# Round to the nearest second.
|
||||
elapsed_rounded = int(round((elapsed)))
|
||||
|
||||
# Format as hh:mm:ss
|
||||
return str(datetime.timedelta(seconds=elapsed_rounded))
|
||||
|
||||
import random
|
||||
|
||||
# This training code is based on the `run_glue.py` script here:
|
||||
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
|
||||
|
||||
# Set the seed value all over the place to make this reproducible.
|
||||
seed_val = 12355
|
||||
|
||||
# If there's a GPU available...
|
||||
if torch.cuda.is_available():
|
||||
|
||||
# Tell PyTorch to use the GPU.
|
||||
device = torch.device("cuda")
|
||||
|
||||
print('There are %d GPU(s) available.' % torch.cuda.device_count())
|
||||
|
||||
print('We will use the GPU:', torch.cuda.get_device_name(0))
|
||||
#model.cuda()
|
||||
# If not...
|
||||
else:
|
||||
print('No GPU available, using the CPU instead.')
|
||||
device = torch.device("cpu")
|
||||
|
||||
device = torch.device("cpu") # note: this line forces CPU for the whole run, overriding the GPU selection above
|
||||
|
||||
random.seed(seed_val)
|
||||
np.random.seed(seed_val)
|
||||
torch.manual_seed(seed_val)
|
||||
torch.cuda.manual_seed_all(seed_val)
|
||||
|
||||
#%%
|
||||
# Start training
|
||||
# We'll store a number of quantities such as training and validation loss,
|
||||
# validation accuracy, and timings.
|
||||
training_stats = []
|
||||
|
||||
# Measure the total training time for the whole run.
|
||||
total_t0 = time.time()
|
||||
|
||||
# For each epoch...
|
||||
for epoch_i in range(0, epochs):
|
||||
# ========================================
|
||||
# Training
|
||||
# ========================================
|
||||
|
||||
# Perform one full pass over the training set.
|
||||
|
||||
print("")
|
||||
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
|
||||
print('{:>5,} training batches will be processed per epoch.'.format(len(train_dataloader)))
|
||||
print('Training...')
|
||||
|
||||
# Measure how long the training epoch takes.
|
||||
t0 = time.time()
|
||||
model.to(device)
|
||||
# Reset the total loss for this epoch.
|
||||
total_train_loss = 0
|
||||
# Put the model into training mode. Don't be misled--the call to
|
||||
# `train` just changes the *mode*, it doesn't *perform* the training.
|
||||
# `dropout` and `batchnorm` layers behave differently during training
|
||||
# vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
|
||||
model.train()
|
||||
|
||||
# For each batch of training data...
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
|
||||
# Progress update every 10 batches.
|
||||
if step % 10 == 0 and not step == 0:
|
||||
# Calculate elapsed time as hh:mm:ss.
|
||||
elapsed = format_time(time.time() - t0)
|
||||
|
||||
# Report progress.
|
||||
print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
|
||||
|
||||
# Unpack this training batch from our dataloader.
|
||||
#
|
||||
# As we unpack the batch, we'll also copy each tensor to the GPU using the
|
||||
# `to` method.
|
||||
#
|
||||
# `batch` is a dict of three pytorch tensors, keyed by name:
# 'input_ids', 'attention_mask' and 'labels'
|
||||
print("Batch keys:", batch.keys())
|
||||
b_input_ids = batch['input_ids'].to(device)
|
||||
b_input_mask = batch['attention_mask'].to(device)
|
||||
b_labels = batch['labels'].to(device)
|
||||
|
||||
# Always clear any previously calculated gradients before performing a
|
||||
# backward pass. PyTorch doesn't do this automatically because
|
||||
# accumulating the gradients is "convenient while training RNNs".
|
||||
# (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
|
||||
model.zero_grad()
|
||||
|
||||
# Perform a forward pass (evaluate the model on this training batch).
|
||||
# The documentation for this `model` function is here:
|
||||
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
|
||||
# It returns different numbers of parameters depending on what arguments
|
||||
# are given and what flags are set. For our usage here, it returns
|
||||
# the loss (because we provided labels) and the "logits"--the model
|
||||
# outputs prior to activation.
|
||||
output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
|
||||
loss = output[0]
|
||||
logits = output[1]
|
||||
|
||||
# Accumulate the training loss over all of the batches so that we can
|
||||
# calculate the average loss at the end. `loss` is a Tensor containing a
|
||||
# single value; the `.item()` function just returns the Python value
|
||||
# from the tensor.
|
||||
total_train_loss += loss.item()
|
||||
|
||||
# Perform a backward pass to calculate the gradients.
|
||||
loss.backward()
|
||||
|
||||
# Clip the norm of the gradients to 1.0.
|
||||
# This is to help prevent the "exploding gradients" problem.
|
||||
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
|
||||
|
||||
# Update parameters and take a step using the computed gradient.
|
||||
# The optimizer dictates the "update rule"--how the parameters are
|
||||
# modified based on their gradients, the learning rate, etc.
|
||||
optimizer.step()
|
||||
|
||||
# Update the learning rate.
|
||||
scheduler.step()
|
||||
|
||||
# Calculate the average loss over all of the batches.
|
||||
avg_train_loss = total_train_loss / len(train_dataloader)
|
||||
|
||||
# Measure how long this epoch took.
|
||||
training_time = format_time(time.time() - t0)
|
||||
|
||||
print("")
|
||||
print(" Average training loss: {0:.2f}".format(avg_train_loss))
|
||||
print(" Training epcoh took: {:}".format(training_time))
|
||||
|
||||
# ========================================
|
||||
# Validation
|
||||
# ========================================
|
||||
# After the completion of each training epoch, measure our performance on
|
||||
# our validation set.
|
||||
|
||||
print("")
|
||||
print("Running Validation...")
|
||||
|
||||
t0 = time.time()
|
||||
|
||||
# Put the model in evaluation mode--the dropout layers behave differently
|
||||
# during evaluation.
|
||||
model.eval()
|
||||
|
||||
# Tracking variables
|
||||
total_eval_accuracy = 0
|
||||
total_eval_loss = 0
|
||||
nb_eval_steps = 0
|
||||
|
||||
# Evaluate data for one epoch
|
||||
for batch in validation_dataloader:
|
||||
|
||||
# Unpack this training batch from our dataloader.
|
||||
#
|
||||
# As we unpack the batch, we'll also copy each tensor to the GPU using
|
||||
# the `to` method.
|
||||
#
|
||||
# `batch` is a dict of three pytorch tensors, keyed by name:
# 'input_ids', 'attention_mask' and 'labels'
|
||||
b_input_ids = batch['input_ids'].to(device)
|
||||
b_input_mask = batch['attention_mask'].to(device)
|
||||
b_labels = batch['labels'].to(device)
|
||||
|
||||
# Tell pytorch not to bother with constructing the compute graph during
|
||||
# the forward pass, since this is only needed for backprop (training).
|
||||
with torch.no_grad():
|
||||
|
||||
# Forward pass, calculate logit predictions.
|
||||
# token_type_ids is the same as the "segment ids", which
|
||||
# differentiates sentence 1 and 2 in 2-sentence tasks.
|
||||
# The documentation for this `model` function is here:
|
||||
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
|
||||
# Get the "logits" output by the model. The "logits" are the output
|
||||
# values prior to applying an activation function like the softmax.
|
||||
output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
|
||||
loss = output[0]
|
||||
logits = output[1]
|
||||
|
||||
# Accumulate the validation loss.
|
||||
total_eval_loss += loss.item()
|
||||
|
||||
# Move logits and labels to CPU
|
||||
logits = logits.detach().cpu().numpy()
|
||||
label_ids = b_labels.to('cpu').numpy()
|
||||
|
||||
# Calculate the accuracy for this batch of test sentences, and
|
||||
# accumulate it over all batches.
|
||||
total_eval_accuracy += flat_accuracy(logits, label_ids)
|
||||
|
||||
|
||||
# Report the final accuracy for this validation run.
|
||||
avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
|
||||
print(" Accuracy: {0:.2f}".format(avg_val_accuracy))
|
||||
|
||||
# Calculate the average loss over all of the batches.
|
||||
avg_val_loss = total_eval_loss / len(validation_dataloader)
|
||||
|
||||
# Measure how long the validation run took.
|
||||
validation_time = format_time(time.time() - t0)
|
||||
|
||||
print(" Validation Loss: {0:.2f}".format(avg_val_loss))
|
||||
print(" Validation took: {:}".format(validation_time))
|
||||
|
||||
# Record all statistics from this epoch.
|
||||
training_stats.append(
|
||||
{
|
||||
'epoch': epoch_i + 1,
|
||||
'Training Loss': avg_train_loss,
|
||||
'Valid. Loss': avg_val_loss,
|
||||
'Valid. Accur.': avg_val_accuracy,
|
||||
'Training Time': training_time,
|
||||
'Validation Time': validation_time
|
||||
}
|
||||
)
|
||||
|
||||
print("")
|
||||
print("Training complete!")
|
||||
|
||||
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
|
||||
|
||||
params = list(model.named_parameters())
|
||||
|
||||
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
|
||||
|
||||
print('==== Embedding Layer ====\n')
|
||||
|
||||
for p in params[0:5]:
|
||||
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
|
||||
|
||||
print('\n==== First Transformer ====\n')
|
||||
|
||||
for p in params[5:21]:
|
||||
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
|
||||
|
||||
print('\n==== Output Layer ====\n')
|
||||
|
||||
for p in params[-4:]:
|
||||
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
|
||||
|
||||
|
||||
import os
|
||||
|
||||
# Saving best practices: if you use default names for the model, you can reload it using from_pretrained()
|
||||
from datetime import datetime as dt
|
||||
|
||||
fTimeFormat = "%Y-%m-%d_%H-%M-%S"
|
||||
now = dt.now().strftime(fTimeFormat)
|
||||
|
||||
output_dir = modCovClassPath + now + "/"
|
||||
|
||||
# Create output directory if needed
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
|
||||
print("Saving model to %s" % output_dir)
|
||||
|
||||
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||
# They can then be reloaded using `from_pretrained()`
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
tokenizer.save_pretrained(output_dir)
|
||||
|
||||
# Good practice: save your training arguments together with the trained model
|
||||
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
||||
|
||||
import pandas as pd
|
||||
|
||||
# Display floats with two decimal places.
|
||||
pd.set_option('display.precision', 2)
|
||||
|
||||
# Create a DataFrame from our training statistics.
|
||||
df_stats = pd.DataFrame(data=training_stats)
|
||||
|
||||
# Use the 'epoch' as the row index.
|
||||
df_stats = df_stats.set_index('epoch')
|
||||
|
||||
# A hack to force the column headers to wrap.
|
||||
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])
|
||||
|
||||
|
||||
# Display the table.
|
||||
df_stats
|
||||
df_stats.to_csv(output_dir + now + ".csv")
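# Hedged sketch (assumption): a quick loss curve from the stats just written; matplotlib is an
# extra dependency that this script does not otherwise import, and the output filename is illustrative.
'''
import matplotlib.pyplot as plt
ax = df_stats[['Training Loss', 'Valid. Loss']].plot(marker='o')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
plt.savefig(output_dir + now + "-loss.png")
'''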