From 80b63b39df0a6d582b4f360f9ba73907db4fc476 Mon Sep 17 00:00:00 2001
From: Michael Beck
Date: Wed, 30 Aug 2023 21:45:38 +0200
Subject: [PATCH] adds readme

---
 README.md | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 124 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 93e1db5..6febf71 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,127 @@
-# How to use
-
-Execute collect.py to scrape tweets and generate the ´ALL-SENATORS-TWEETS.csv´.
-
-Execute collectSenData.py to scrape senator data and generate ´ALL-SENATORS.csv´.
-
-All new files will be written to ´data/OUT/´. Necessary data has to be located in ´data/IN/´
\ No newline at end of file

# Requirements

- python 3.10+
- snscrape 0.6.2.20230321+ (see git repo in this folder)
- transformers 4.31.0
- numpy 1.23.5
- pandas 2.0.3
- scikit-learn 1.3.0
- torch 2.0.1

# About

This collection of scripts scrapes tweets posted by US senators between 2020-01-01T00:00:00Z and 2023-01-03T00:00:00Z, scrapes the senators' account data, prepares the tweets for training NLP models, and trains two models: (1) one that classifies a tweet's topic as covid or non-covid, and (2) one that classifies tweets as either "fake news" or "non-fake news".
Training only works with a prepared dataset in which the tweets are pre-classified.
More info can be found in the comments of the scripts.
Due to time constraints, most of the code is procedural and ugly, but effective.

# How to

Tested on Ubuntu 22.04.
If needed, the virtual environment can be exported and sent to you.

All files in the folder data/IN have to exist in order to execute the scripts.
Execute the scripts in the following order (a runner sketch follows the list):

01 collect.py (see the log/ folder and funs/Scrape.py for further info on scraping)
02 collectSenData.py
03 cleanTweets.py
04 preTestClassification.py
05 trainTopic.py
06 trainFake.py
07 ClassificationFake.py
08 ClassificationTopic.py
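A minimal sketch of a convenience runner for this order, assuming all scripts live in the repo root and exit non-zero on failure (the file name `run_pipeline.py` is hypothetical, not part of the repo):

```python
# run_pipeline.py -- hypothetical helper, not part of this repo.
# Runs the scripts in the documented order and aborts on the first failure.
import subprocess
import sys

SCRIPTS = [
    "collect.py",                # 01: scrape tweets
    "collectSenData.py",         # 02: scrape senator account data
    "cleanTweets.py",            # 03: curate keywords, merge and split datasets
    "preTestClassification.py",  # 04: pretest on 100 pre-classified tweets
    "trainTopic.py",             # 05: train the covid/non-covid topic model
    "trainFake.py",              # 06: train the fake/non-fake model
    "ClassificationFake.py",     # 07: classify tweets as fake/non-fake
    "ClassificationTopic.py",    # 08: classify tweet topics
]

for script in SCRIPTS:
    print(f"=== {script} ===", flush=True)
    result = subprocess.run([sys.executable, script])
    if result.returncode != 0:
        sys.exit(f"{script} failed with exit code {result.returncode}")
```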
# Files & Folders

```
├── data
│   ├── IN
│   │   ├── counterKeywordsFinal.txt
│   │   ├── counterKeywords.txt
│   │   ├── keywords-raw.txt
│   │   ├── keywords.txt
│   │   ├── own_keywords.txt
│   │   ├── pretest-tweets_fake.txt        contains tweet ids for pretest
│   │   ├── pretest-tweets_not_fake.txt    contains tweet ids for pretest
│   │   └── senators-raw.csv               senator datafile
│   └── OUT
│       ├── ALL-SENATORS-TWEETS.csv
│       ├── graphs
│       │   ├── Timeline.png
│       │   ├── Wordcloud-All.png
│       │   └── Wordcloud-Cov.png
│       ├── Pretest-Prep.csv
│       ├── Pretest-Results.csv
│       ├── Pretest-SENATORS-TWEETS.csv
│       ├── profiles                       dataset profiles
│       │   ├── AllTweets.html
│       │   └── CovTweets.html
│       ├── SenatorsTweets-Final.csv
│       ├── SenatorsTweets-OnlyCov.csv
│       ├── SenatorsTweets-train-CovClassification.csv
│       ├── SenatorsTweets-train-CovClassificationTRAIN.csv
│       ├── SenatorsTweets-train-CovClassification.tsv
│       ├── SenatorsTweets-train-FakeClassification.csv
│       ├── SenatorsTweets-train-FakeClassificationTRAIN.csv
│       ├── SenatorsTweets-train-FakeClassification.tsv
│       ├── SenatorsTweets-Training.csv
│       ├── SenatorsTweets-Training_WORKING-COPY.csv
│       ├── topClass-PRETEST-Prep.csv
│       ├── topClass-PRETEST-Results.csv
│       ├── Tweets-All-slices.zip
│       ├── Tweets-Classified-Fake-Prep.csv
│       ├── Tweets-Classified-Fake-Results.csv
│       ├── Tweets-Classified-Prep.csv
│       ├── Tweets-Classified-Topic-Prep.csv
│       ├── Tweets-Classified-Topic-Results.csv
│       └── Tweets-Stub.csv
├── funs
│   ├── CleanTweets.py                     multiple functions to clean tweet contents for NLP processing
│   ├── ClearDupes.py                      function for deleting duplicate keywords
│   ├── __init__.py
│   ├── Scrape.py                          scraper functions to be used for multiprocessing
│   └── TimeSlice.py                       slices the scraping time span into 24 slices; speeds up scraping through multiprocessing
├── log                                    logs of the scraping process
│   ├── log_2023-06-23_21-06-10_err.log
│   ├── log_2023-06-23_21-06-10.log
│   └── log_2023-06-23_21-06-10_missing.log
├── models
│   ├── CovClass                           Covid tweet classification model
│   │   └── 2023-08-15_05-56-50
│   │       ├── 2023-08-15_05-56-50.csv    training output
│   │       ├── config.json
│   │       ├── pytorch_model.bin
│   │       ├── special_tokens_map.json
│   │       ├── tokenizer_config.json
│   │       ├── tokenizer.json
│   │       └── vocab.txt
│   └── FakeClass                          Fake tweet classification model
│       └── 2023-08-15_14-35-43
│           ├── 2023-08-15_14-35-43.csv    training output
│           ├── config.json
│           ├── pytorch_model.bin
│           ├── special_tokens_map.json
│           ├── tokenizer_config.json
│           ├── tokenizer.json
│           └── vocab.txt
├── snscrape                               contains the snscrape 0.6.2.20230321+ git repo
├── ClassificationFake.py                  classifies tweets as fake or non-fake, saves:
│                                          Tweets-Classified-Fake-Prep.csv - prepared dataset
│                                          Tweets-Classified-Fake-Results.csv - Tweets-Classified-Topic-Results.csv with fake-news classification results
├── ClassificationTopic.py                 classifies tweet topics, saves:
│                                          Tweets-Classified-Topic-Prep.csv - prepared dataset
│                                          Tweets-Classified-Topic-Results.csv - SenatorsTweets-OnlyCov.csv with covid classification results
├── cleanTweets.py                         curates keyword lists, merges senator and tweet datasets, creates multiple datasets:
│                                          SenatorsTweets-Final.csv - all tweets with keyword columns
│                                          SenatorsTweets-OnlyCov.csv - only covid tweets, filtered by keyword list
│                                          SenatorsTweets-Training.csv - training dataset of ~1800 randomly selected tweets from SenatorsTweets-OnlyCov.csv
├── collect.py                             scrapes tweets, saves to ALL-SENATORS-TWEETS.csv
├── collectSenData.py                      scrapes senator account data, saves to ALL-SENATORS.csv
├── createGraphs.py                        creates wordcloud & timeline graphs
├── preTestClassification.py               pretest script that uses bvrau/covid-twitter-bert-v2-struth to analyze 100 pre-classified tweets
├── profiler.py                            creates dataset profiles
├── README.md                              this readme
├── trainFake.py                           training script for the fake tweet classification model
└── trainTopic.py                          training script for the tweet topic classification model
```
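Each model directory under `models/` is a standard Hugging Face checkpoint (`config.json`, `pytorch_model.bin`, tokenizer files), so a trained classifier can be reloaded for ad-hoc predictions. A minimal sketch, assuming the checkpoint path from the tree above and a binary label mapping of 0 = non-covid / 1 = covid (verify both against `trainTopic.py`):

```python
# classify_one.py -- hypothetical usage sketch, not part of this repo.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Topic-model checkpoint as listed under models/ above.
MODEL_DIR = "models/CovClass/2023-08-15_05-56-50"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.eval()

tweet = "Get your booster shot, it protects you and the people around you."
inputs = tokenizer(tweet, return_tensors="pt", truncation=True, max_length=512)

with torch.no_grad():
    logits = model(**inputs).logits

# Assumed binary mapping: 0 = non-covid, 1 = covid (check the training script).
pred = logits.argmax(dim=-1).item()
print("covid" if pred == 1 else "non-covid")
```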
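`funs/TimeSlice.py` speeds up scraping by cutting the 2020-01-01 to 2023-01-03 span into 24 slices that can be scraped in parallel. A minimal sketch of that idea, assuming equal-width slices (the function name and exact boundaries are assumptions; see `funs/TimeSlice.py` for the actual implementation):

```python
# Hypothetical re-creation of the time-slicing idea in funs/TimeSlice.py.
from datetime import datetime

def make_slices(start: datetime, end: datetime, n: int = 24) -> list[tuple[datetime, datetime]]:
    """Split [start, end) into n equal-width (since, until) slices."""
    step = (end - start) / n
    return [(start + i * step, start + (i + 1) * step) for i in range(n)]

slices = make_slices(datetime(2020, 1, 1), datetime(2023, 1, 3))
for since, until in slices:
    # Each slice can be handed to one worker process that scrapes
    # only the tweets posted within [since, until).
    print(since.isoformat(), "->", until.isoformat())
```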