finishes classification scripts

This commit is contained in:
Michael Beck 2023-08-15 14:51:28 +02:00
parent 8f744a08be
commit 2535683cdc
2 changed files with 10 additions and 16 deletions

View File

@ -1,12 +1,9 @@
import re
import string
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
#%% #%%
@ -26,11 +23,11 @@ di = "data/IN/"
ud = "data/OUT/" ud = "data/OUT/"
# Name of file that all senator data will be written to # Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv" senCSV = "Tweets-Classified-Topic-Results.csv"
# Name of Classify datafile # Name of Classify datafile
senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv" senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv" senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"
# don't change this one # don't change this one
senCSVPath = wd + ud + senCSV senCSVPath = wd + ud + senCSV
@ -56,9 +53,9 @@ dfClassify['fake'] = False
# HowTo: # HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification # https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline # https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth") pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth") model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth") tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert # Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
@ -100,8 +97,8 @@ for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, trun
# Exactly the same output as before, but the content is passed # Exactly the same output as before, but the content is passed
# as batches to the model # as batches to the model
# %% # %%
dfClassify['output_label'] = output_labels dfClassify['output_label_fake'] = output_labels
dfClassify['output_score'] = output_score dfClassify['output_score_fake'] = output_score
timeEnd = datetime.now() timeEnd = datetime.now()
timeTotal = timeEnd - timeStart timeTotal = timeEnd - timeStart

View File

@ -1,12 +1,9 @@
import re
import string
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
#%% #%%
@ -99,8 +96,8 @@ for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, trun
# Exactly the same output as before, but the content is passed # Exactly the same output as before, but the content is passed
# as batches to the model # as batches to the model
# %% # %%
dfClassify['output_label'] = output_labels dfClassify['output_label_topicCov'] = output_labels
dfClassify['output_score'] = output_score dfClassify['output_score_topicCov'] = output_score
timeEnd = datetime.now() timeEnd = datetime.now()
timeTotal = timeEnd - timeStart timeTotal = timeEnd - timeStart