From 2535683cdccf33f25f09cf14e0acaf3519e94417 Mon Sep 17 00:00:00 2001 From: Michael Beck Date: Tue, 15 Aug 2023 14:51:28 +0200 Subject: [PATCH] finishes classification scripts --- ClassificationFake.py | 19 ++++++++----------- ClassificationTopic.py | 7 ++----- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/ClassificationFake.py b/ClassificationFake.py index db99b2b..b48aa9e 100644 --- a/ClassificationFake.py +++ b/ClassificationFake.py @@ -1,12 +1,9 @@ -import re -import string import numpy as np import pandas as pd from datetime import datetime from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline from datasets import load_dataset from transformers.pipelines.pt_utils import KeyDataset -from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct #%% @@ -26,11 +23,11 @@ di = "data/IN/" ud = "data/OUT/" # Name of file that all senator data will be written to -senCSV = "SenatorsTweets-OnlyCov.csv" +senCSV = "Tweets-Classified-Topic-Results.csv" # Name of Classify datafile -senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv" -senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv" +senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv" +senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv" # don't change this one senCSVPath = wd + ud + senCSV @@ -56,9 +53,9 @@ dfClassify['fake'] = False # HowTo: # https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification # https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline -pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth") -model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth") -tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth") +pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/") +model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/") +tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/") # Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert @@ -100,8 +97,8 @@ for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, trun # Exactly the same output as before, but the content are passed # as batches to the model # %% -dfClassify['output_label'] = output_labels -dfClassify['output_score'] = output_score +dfClassify['output_label_fake'] = output_labels +dfClassify['output_score_fake'] = output_score timeEnd = datetime.now() timeTotal = timeEnd - timeStart diff --git a/ClassificationTopic.py b/ClassificationTopic.py index e796a45..4605834 100644 --- a/ClassificationTopic.py +++ b/ClassificationTopic.py @@ -1,12 +1,9 @@ -import re -import string import numpy as np import pandas as pd from datetime import datetime from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline from datasets import load_dataset from transformers.pipelines.pt_utils import KeyDataset -from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct #%% @@ -99,8 +96,8 @@ for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, trun # Exactly the same output as before, but the content are passed # as batches to the model # %% -dfClassify['output_label'] = output_labels -dfClassify['output_score'] = output_score +dfClassify['output_label_topicCov'] = output_labels +dfClassify['output_score_topicCov'] = output_score timeEnd = datetime.now() timeTotal = timeEnd - timeStart