124 lines
4.0 KiB
Python
124 lines
4.0 KiB
Python
import numpy as np
|
|
import pandas as pd
|
|
from datetime import datetime
|
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
|
from datasets import load_dataset
|
|
from transformers.pipelines.pt_utils import KeyDataset
|
|
|
|
|
|
#%%
|
|
# Path configuration
# Tip: install xformers (pip install xformers) for better performance
###################
# Base working directory; every path below is built from it by plain string
# concatenation, so the trailing slash is required.
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# Sub-directories relative to wd: datafile input and tweet-datafile output
di = "data/IN/"
ud = "data/OUT/"

# File holding the topic-classification results
senCSV = "Tweets-Classified-Topic-Results.csv"

# Files used by the fake-classification step (intermediate prep file and
# final results file)
senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"

# Derived full paths -- don't change these
senCSVPath = f"{wd}{ud}{senCSV}"
senCSVcClassificationPrepPath = f"{wd}{ud}{senCSVClassifiedPrep}"
senCSVcClassificationResultPath = f"{wd}{ud}{senCSVClassifiedResult}"
|
|
|
|
# Make the project's helper modules (the "funs" directory under wd)
# importable, then load the tweet-cleaning helpers from there.
import sys

funs = f"{wd}funs"
sys.path.insert(1, funs)

import CleanTweets
|
|
|
|
|
|
#%%
|
|
# Get dataframe: read the topic-classification results, keeping every column
# as a generic object (no dtype inference).
dfClassify = pd.read_csv(filepath_or_buffer=senCSVPath, dtype=object)
|
|
def encode_labels(label):
    """Swap the string labels ``'True'`` and ``'False'``.

    Args:
        label: The value to convert; compared against the strings
            ``'True'`` and ``'False'``.

    Returns:
        ``'False'`` for ``'True'``, ``'True'`` for ``'False'``, and the
        integer ``0`` for any other value.
    """
    swaps = (('True', 'False'), ('False', 'True'))
    for known, swapped in swaps:
        if label == known:
            return swapped
    # Anything that is neither of the two label strings
    return 0
|
|
# Swap the topic labels ('True' <-> 'False') using encode_labels.
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)

# Write the swapped labels back to the file the dataframe was read from.
# senCSVPath is used instead of a second hard-coded absolute path so that
# switching `wd` (e.g. to the server directory) keeps input and output in sync.
# NOTE(review): because the input file is overwritten with swapped labels,
# running this script again swaps them back -- confirm this is intended.
# NOTE(review): the index is written as an extra column, and the file is
# read back without index_col, so each run adds another unnamed column.
dfClassify.to_csv(senCSVPath, encoding='utf-8')

# Keep only the tweets whose (swapped) topic label is 'True'. The explicit
# .copy() makes the result an independent frame, so the column assignments
# that follow do not raise SettingWithCopyWarning.
dfClassify = dfClassify[dfClassify['output_label_topicCov'] == 'True'].copy()

# Initialise the 'fake' column with False for every remaining tweet
dfClassify['fake'] = False
|
|
|
|
|
|
#%%
|
|
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline

# Directory of the fake-classification model. Built from `wd` (like every
# other path in this script) instead of repeating the absolute path three
# times, so changing `wd` also moves the model location.
modelPath = wd + "models/FakeClass/2023-08-15_14-35-43/"

# Text-classification pipeline used for the actual inference
pipe = pipeline("text-classification", model=modelPath)
# Model and tokenizer loaded separately from the same directory; the
# tokenizer is used to build the 'input_ids' column.
# NOTE(review): `model` does not appear to be used anywhere else in this
# script -- confirm before removing.
model = AutoModelForSequenceClassification.from_pretrained(modelPath)
tokenizer = AutoTokenizer.from_pretrained(modelPath)
|
|
|
|
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert

# Run every raw tweet text through the project's preprocessing helper
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)

#%%
# Remove empty rows: mark empty strings as missing, then drop those rows.
# The replaced column is assigned back explicitly. The previous chained
# `dfClassify.cleanContent.replace(..., inplace=True)` acts on an intermediate
# Series and does not update the dataframe under pandas Copy-on-Write.
dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
dfClassify.dropna(subset=['cleanContent'], inplace=True)
|
|
|
|
#%%
|
|
timeStart = datetime.now()  # start counting execution time

# Length every token-id sequence is padded to
max_length = 128


def _tokenize_to_ids(text):
    """Return the tokenizer's ``input_ids`` for *text*, padded to ``max_length``."""
    # NOTE(review): no truncation argument is passed here -- confirm whether
    # texts longer than max_length tokens should be cut.
    encoded = tokenizer(text, max_length=max_length, padding="max_length",)
    return encoded['input_ids']


dfClassify['input_ids'] = dfClassify['cleanContent'].apply(_tokenize_to_ids)
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
|
|
|
|
# %%
|
|
# Write only the columns needed for inference to the prep file ...
prepColumns = ['id', 'cleanContent']
dfClassify.to_csv(
    senCSVcClassificationPrepPath,
    columns=prepColumns,
    encoding='utf-8',
)

#%%
# ... and load that same file as a `datasets` CSV dataset.
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
|
|
|
|
# %%from datetime import datetime
|
|
|
|
#from tqdm.auto import tqdm
|
|
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
|
|
# print(out)
|
|
|
|
#%%
|
|
# Feed the 'cleanContent' column of the 'train' split through the pipeline
# in batches of 8 and collect one result dict per text.
# Each result looks like {'label': 'POSITIVE', 'score': 0.9998743534088135};
# the output is the same as for single calls, only the texts are passed to
# the model as batches.
predictions = list(
    pipe(
        KeyDataset(dataset['train'], "cleanContent"),
        batch_size=8,
        truncation="only_first",
    )
)

# Split the result dicts into two parallel lists
output_labels = [prediction['label'] for prediction in predictions]
output_score = [prediction['score'] for prediction in predictions]
|
|
# %%
|
|
# Attach the predicted label and score to each tweet (lists are assigned by
# position, so they must have one entry per dataframe row).
dfClassify['output_label_fake'] = output_labels
dfClassify['output_score_fake'] = output_score

timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
# Average over the number of tweets actually classified instead of a
# hard-coded 96; max(..., 1) avoids a division by zero on an empty run.
tweetCount = len(output_labels)
timePerTweet = timeTotal / max(tweetCount, 1)

# timeTotal is a timedelta, so convert it explicitly to match the "seconds"
# label in the message.
print(f"Total classification execution time: {timeTotal.total_seconds()} seconds")
print(f"Time per tweet classification: {timePerTweet}")

# %%
# Save the classified tweets (all columns) to the results file
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
|
|
|
|
# %%
|