import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset

#%%
# prepare & define paths
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv"

# Names of the classification datafiles
senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv"

# don't change these
senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult

# make the local helper module in funs/ importable
import sys
funs = wd + "funs"
sys.path.insert(1, funs)
import CleanTweets

#%%
# get dataframe
dfClassify = pd.read_csv(senCSVPath, dtype=object)  # dataframe from csv
dfClassify['fake'] = False

#%%
# Model: https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
modelPath = "/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/"
pipe = pipeline("text-classification", model=modelPath)
model = AutoModelForSequenceClassification.from_pretrained(modelPath)
tokenizer = AutoTokenizer.from_pretrained(modelPath)

# Source: https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)

#%%
# remove empty rows
dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
dfClassify.dropna(subset=['cleanContent'], inplace=True)

#%%
timeStart = datetime.now()  # start counting execution time

max_length = 128
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(
    lambda x: tokenizer(x, max_length=max_length, padding="max_length", truncation=True)['input_ids']
)
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()

# %%
# write the prepared tweets to disk so they can be re-read as a Dataset
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])

#%%
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)

# %%
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
#    print(out)

#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    output_labels.append(out['label'])
    output_score.append(out['score'])
    # e.g. {'label': 'real', 'score': 0.9998743534088135}
    # Same output as the commented single-example loop above, but the
    # contents are passed to the model in batches.

# %%
dfClassify['output_label_topicCov'] = output_labels
dfClassify['output_score_topicCov'] = output_score

timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
timePerTweet = timeTotal / len(dfClassify)  # average time per classified tweet

print(f"Total classification execution time: {timeTotal}")
print(f"Time per tweet classification: {timePerTweet}")

# %%
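#%%
# Optional sanity check (added sketch, not part of the original pipeline):
# inspect the label distribution and score spread before writing the results.
print(dfClassify['output_label_topicCov'].value_counts())
print(dfClassify['output_score_topicCov'].describe())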
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')

# %%
## corrections

def encode_labels(label):
    """Map the model's 'real'/'fake' labels to string booleans."""
    if label == 'real':
        return 'True'
    elif label == 'fake':
        return 'False'
    return 0

dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
# NOTE: this mapping is still wrong, it will be corrected in ClassificationFake.py
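#%%
# CleanTweets.preprocess_text is imported from the local funs/ directory and
# not shown in this file. For reference, a minimal sketch of what such a
# tweet-cleaning helper might look like (hypothetical; the actual
# implementation in funs/CleanTweets.py may differ):
import re

def preprocess_text_sketch(text):
    text = str(text)
    text = re.sub(r"http\S+", "", text)       # drop URLs
    text = re.sub(r"@\w+", "", text)          # drop @mentions
    text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace
    return text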