From 1c6d9d54159f7cb1e491b72017ca1c0383f99e08 Mon Sep 17 00:00:00 2001
From: Michael Beck
Date: Wed, 30 Aug 2023 21:18:55 +0200
Subject: [PATCH] cleans and renames files

---
 Classification.py         | 113 ---------------------------------
 analyze.py                | 129 --------------------------------------
 data/IN/keywords.txt      |  53 ++++++++--------
 trainFake.py              |   6 +-
 train.py => trainTopic.py |  10 +--
 5 files changed, 29 insertions(+), 282 deletions(-)
 delete mode 100644 Classification.py
 delete mode 100644 analyze.py
 rename train.py => trainTopic.py (97%)

diff --git a/Classification.py b/Classification.py
deleted file mode 100644
index f334ce3..0000000
--- a/Classification.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import re
-import string
-import numpy as np
-import pandas as pd
-from datetime import datetime
-from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
-from datasets import load_dataset
-from transformers.pipelines.pt_utils import KeyDataset
-from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
-
-
-#%%
-# prepare & define paths
-# install xformers (pip install xformers) for better performance
-###################
-# Setup directories
-# WD Michael
-wd = "/home/michael/Documents/PS/Data/collectTweets/"
-# WD Server
-# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
-
-# datafile input directory
-di = "data/IN/"
-
-# Tweet-datafile output directory
-ud = "data/OUT/"
-
-# Name of file that all senator data will be written to
-senCSV = "SenatorsTweets-OnlyCov.csv"
-
-# Name of Classify datafile
-senCSVClassifiedPrep = "Tweets-Classified-Prep.csv"
-senCSVClassifiedResult = "Tweets-Classified-Results.csv"
-
-# don't change this one
-senCSVPath = wd + ud + senCSV
-senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
-senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
-
-#%%
-# get datafra,e
-dfClassify = pd.read_csv(senCSVPath, dtype=(object))
-
-# dataframe from csv
-dfClassify['fake'] = False
-
-
-#%%
-# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
-# HowTo:
-# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
-# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
-pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
-model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
-tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
-
-# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
-
-dfClassify['cleanContent'] = dfClassify['rawContent'].apply(remove_URL)
-dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_emoji)
-dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_html)
-dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_punct)
-dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(lambda x: x.lower())
-
-#%%
-# remove empty rows
-dfClassify.cleanContent.replace('',np.nan,inplace=True)
-dfClassify.dropna(subset=['cleanContent'], inplace=True)
-
-#%%
-timeStart = datetime.now() # start counting execution time
-
-max_length = 128
-dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
-#train.rename(columns={'target': 'labels'}, inplace=True)
-#train.head()
-
-# %%
-dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
-
-#%%
-dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
-
-# %%from datetime import datetime
-
-#from tqdm.auto import tqdm
-#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
-#    print(out)
-
-#%%
-output_labels = []
-output_score = []
-for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
-    output_labels.append(out['label'])
-    output_score.append(out['score'])
-    # [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
-    # Exactly the same output as before, but the content are passed
-    # as batches to the model
-# %%
-dfClassify['output_label'] = output_labels
-dfClassify['output_score'] = output_score
-
-timeEnd = datetime.now()
-timeTotal = timeEnd - timeStart
-timePerTweet = timeTotal / 96
-
-print(f"Total classification execution time: {timeTotal} seconds")
-print(f"Time per tweet classification: {timePerTweet}")
-
-# %%
-dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
-
-# %%
diff --git a/analyze.py b/analyze.py
deleted file mode 100644
index 5896d0c..0000000
--- a/analyze.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import re
-import string
-import numpy as np
-import pandas as pd
-from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
-from datasets import load_dataset
-from transformers.pipelines.pt_utils import KeyDataset
-from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
-
-
-#%%
-# prepare
-# install xformers (pip install xformers) for better performance
-###################
-# Setup directories
-# WD Michael
-wd = "/home/michael/Documents/PS/Data/collectTweets/"
-# WD Server
-# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
-
-# datafile input directory
-di = "data/IN/"
-
-# Tweet-datafile output directory
-ud = "data/OUT/"
-
-# Name of file that all senator data will be written to
-senCSV = "ALL-SENATORS-TWEETS.csv"
-
-# Name of new datafile generated
-senCSVc = "Tweets-Stub.csv"
-
-# Name of pretest files
-preTestIDsFake = "pretest-tweets_fake.txt"
-preTestIDsNot = "pretest-tweets_not_fake.txt"
-
-# Name of pretest datafile
-senCSVPretest = "Pretest.csv"
-senCSVPretestPrep = "Pretest-Prep.csv"
-senCSVPretestResult = "Pretest-Results.csv"
-
-
-# don't change this one
-senCSVPath = wd + ud + senCSV
-senCSVcPath = wd + ud + senCSVc
-senCSVcPretestPath = wd + ud + senCSVPretest
-senCSVcPretestPrepPath = wd + ud + senCSVPretestPrep
-senCSVcPretestResultPath = wd + ud + senCSVPretestResult
-preTestIDsFakePath = wd + di + preTestIDsFake
-preTestIDsNotPath = wd + di + preTestIDsNot
-
-# List of IDs to select
-# Read the IDs from a file
-preTestIDsFakeL = []
-preTestIDsNotL = []
-with open(preTestIDsFakePath, "r") as file:
-    lines = file.readlines()
-    for line in lines:
-        tid = line.strip() # Remove the newline character
-        preTestIDsFakeL.append(tid)
-with open(preTestIDsNotPath, "r") as file:
-    lines = file.readlines()
-    for line in lines:
-        tid = line.strip() # Remove the newline character
-        preTestIDsNotL.append(tid)
-
-# Select rows based on the IDs
-df = pd.read_csv(senCSVPath, dtype=(object))
-#%%
-# Create pretest dataframe
-dfPreTest = df[df['id'].isin(preTestIDsFakeL)].copy()
-dfPreTest['fake'] = True
-dfPreTest = pd.concat([dfPreTest, df[df['id'].isin(preTestIDsNotL)]], ignore_index=True)
-dfPreTest['fake'] = dfPreTest['fake'].fillna(False)
-
-#%%
-# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
-# HowTo:
-# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
-# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
-pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
-model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
-tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
-
-# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
-
-dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
-dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
-dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_html)
-dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_punct)
-dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(lambda x: x.lower())
-
-#%%
-max_length = 128
-dfPreTest['input_ids'] = dfPreTest['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
-#train.rename(columns={'target': 'labels'}, inplace=True)
-#train.head()
-
-# %%
-dfPreTest.to_csv(senCSVcPretestPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
-
-
-#%%
-dataset = load_dataset("csv", data_files=senCSVcPretestPrepPath)
-
-# %%
-results = pipe(KeyDataset(dataset, "text"))
-# %%
-#from tqdm.auto import tqdm
-#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
-#    print(out)
-
-#%%
-output_labels = []
-output_score = []
-for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
-    output_labels.append(out['label'])
-    output_score.append(out['score'])
-    # [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
-    # Exactly the same output as before, but the content are passed
-    # as batches to the model
-# %%
-dfPreTest['output_label'] = output_labels
-dfPreTest['output_score'] = output_score
-
-# %%
-dfPreTest.to_csv(senCSVcPretestResultPath, encoding='utf-8')
-
-# %%
diff --git a/data/IN/keywords.txt b/data/IN/keywords.txt
index c59569a..1680fe0 100644
--- a/data/IN/keywords.txt
+++ b/data/IN/keywords.txt
@@ -18,44 +18,43 @@ socialdistancing
 wear a mask
 lockdown
 covd
-Coronavirus
-Koronavirus
-Corona
-CDC
-Wuhancoronavirus
-Wuhanlockdown
-Ncov
-Wuhan
-N95
-Kungflu
-Epidemic
+coronavirus
+koronavirus
+corona
+cdc
+wuhancoronavirus
+wuhanlockdown
+ncov
+wuhan
+n95
+kungflu
+epidemic
 outbreak
-Sinophobia
-China
+sinophobia
 covid-19
 corona virus
 covid
 covid19
 sars-cov-2
-COVIDー19
-COVD
+covidー19
+covd
 pandemic
 coronapocalypse
 canceleverything
-Coronials
-SocialDistancingNow
-Social Distancing
-SocialDistancing
+coronials
+socialdistancingnow
+social distancing
+socialdistancing
 panicbuy
 panic buy
 panicbuying
 panic buying
-14DayQuarantine
-DuringMy14DayQuarantine
+14dayquarantine
+duringmy14dayquarantine
 panic shop
 panic shopping
 panicshop
-InMyQuarantineSurvivalKit
+inmyquarantinesurvivalkit
 panic-buy
 panic-shop
 coronakindness
@@ -65,7 +64,7 @@ chinesevirus
 stayhomechallenge
 stay home challenge
 sflockdown
-DontBeASpreader
+dontbeaspreader
 lockdown
 lock down
 shelteringinplace
@@ -79,13 +78,13 @@ flatten the curve
 china virus
 chinavirus
 quarentinelife
-PPEshortage
+ppeshortage
 saferathome
 stayathome
 stay at home
 stay home
 stayhome
-GetMePPE
+getmeppe
 covidiot
 epitwitter
 pandemie
@@ -93,7 +92,7 @@ wear a mask
 wearamask
 kung flu
 covididiot
-COVID__19
+covid__19
 omicron
 variant
 vaccine
@@ -139,9 +138,7 @@ work from home
 workfromhome
 working from home
 workingfromhome
-ppe
 n95
-ppe
 n95
 covidiots
 covidiots
diff --git a/trainFake.py b/trainFake.py
index 14d8f84..ec9c5b3 100644
--- a/trainFake.py
+++ b/trainFake.py
@@ -15,10 +15,8 @@ from sklearn.model_selection import train_test_split # pip install scikit-learn
 import pandas as pd
 
 
-## Follow these two guides:
-# best one https://mccormickml.com/2019/07/22/BERT-fine-tuning/
-# https://xiangyutang2.github.io/tweet-classification/
-# https://medium.com/mlearning-ai/fine-tuning-bert-for-tweets-classification-ft-hugging-face-8afebadd5dbf
+## Uses snippets from this guide:
+# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
 
 ###################
 # Setup directories
diff --git a/train.py b/trainTopic.py
similarity index 97%
rename from train.py
rename to trainTopic.py
index 5b3cff6..3bf55e5 100644
--- a/train.py
+++ b/trainTopic.py
@@ -15,10 +15,8 @@ from sklearn.model_selection import train_test_split # pip install scikit-learn
 import pandas as pd
 
 
-## Follow these two guides:
-# best one https://mccormickml.com/2019/07/22/BERT-fine-tuning/
-# https://xiangyutang2.github.io/tweet-classification/
-# https://medium.com/mlearning-ai/fine-tuning-bert-for-tweets-classification-ft-hugging-face-8afebadd5dbf
+## Uses snippets from this guide:
+# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
 
 ###################
 # Setup directories
@@ -65,11 +63,7 @@ seed = 12355
 modCovClassPath = wd + "models/CovClass/"
 modFakeClassPath = wd + "models/FakeClass/"
 
-model_name = 'digitalepidemiologylab/covid-twitter-bert-v2' # accuracy 69
-#model_name = 'justinqbui/bertweet-covid19-base-uncased-pretraining-covid-vaccine-tweets' #48
-#model_name = "cardiffnlp/tweet-topic-latest-multi"
 model_name = "bvrau/covid-twitter-bert-v2-struth"
-#model_name = "cardiffnlp/roberta-base-tweet-topic-single-all"
 model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
 
 # More models for fake detection: