diff --git a/ClassificationFake.py b/ClassificationFake.py
new file mode 100644
index 0000000..db99b2b
--- /dev/null
+++ b/ClassificationFake.py
@@ -0,0 +1,116 @@
+import numpy as np
+import pandas as pd
+from datetime import datetime
+from transformers import AutoTokenizer, pipeline
+from datasets import load_dataset
+from transformers.pipelines.pt_utils import KeyDataset
+
+
+#%%
+# prepare & define paths
+# install xformers (pip install xformers) for better performance
+###################
+# Setup directories
+# WD Michael
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# datafile input directory
+di = "data/IN/"
+
+# Tweet-datafile output directory
+ud = "data/OUT/"
+
+# Name of file that all senator data will be written to
+senCSV = "SenatorsTweets-OnlyCov.csv"
+
+# Names of the classification datafiles
+senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"
+senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"
+
+# don't change these
+senCSVPath = wd + ud + senCSV
+senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
+senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
+
+# make the local "funs" helper package importable
+import sys
+funs = wd + "funs"
+sys.path.insert(1, funs)
+import CleanTweets
+
+
+#%%
+# read dataframe from csv
+dfClassify = pd.read_csv(senCSVPath, dtype=object)
+
+# add classification flag column, default False
+dfClassify['fake'] = False
+
+
+#%%
+# Model: https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
+# HowTo:
+# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
+# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
+# The pipeline loads model and tokenizer itself; the standalone tokenizer
+# below is only used for the input_ids preview column.
+pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
+tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
+
+# Source: https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
+dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
+
+
+#%%
+# remove empty rows
+dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
+dfClassify.dropna(subset=['cleanContent'], inplace=True)
+
+#%%
+timeStart = datetime.now()  # start counting execution time
+
+max_length = 128
+dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length")['input_ids'])
+
+# %%
+# write the cleaned tweets out so they can be streamed back in as a dataset
+dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
+
+#%%
+dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
+
+# %%
+# from tqdm.auto import tqdm
+# for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
+#     print(out)
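+
+#%%
+# Quick sanity check (a minimal sketch, not required by the pipeline):
+# classify a single cleaned tweet and inspect the output before committing
+# to the full batched run. The label strings are model-specific, so inspect
+# them here rather than assuming 'POSITIVE'/'NEGATIVE'.
+print(pipe(dfClassify['cleanContent'].iloc[0]))
+# expected shape: [{'label': <label string>, 'score': <float>}]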
+
+#%%
+output_labels = []
+output_score = []
+for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
+    # out is a dict like {'label': ..., 'score': ...}; same output as
+    # single calls, but the inputs are passed to the model in batches of 8
+    output_labels.append(out['label'])
+    output_score.append(out['score'])
+
+# %%
+dfClassify['output_label'] = output_labels
+dfClassify['output_score'] = output_score
+
+timeEnd = datetime.now()
+timeTotal = timeEnd - timeStart
+timePerTweet = timeTotal / len(dfClassify)
+
+print(f"Total classification execution time: {timeTotal}")
+print(f"Time per tweet classification: {timePerTweet}")
+
+# %%
+dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
+
+# %%
diff --git a/ClassificationTopic.py b/ClassificationTopic.py
new file mode 100644
index 0000000..e796a45
--- /dev/null
+++ b/ClassificationTopic.py
@@ -0,0 +1,115 @@
+import numpy as np
+import pandas as pd
+from datetime import datetime
+from transformers import AutoTokenizer, pipeline
+from datasets import load_dataset
+from transformers.pipelines.pt_utils import KeyDataset
+
+
+#%%
+# prepare & define paths
+# install xformers (pip install xformers) for better performance
+###################
+# Setup directories
+# WD Michael
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# datafile input directory
+di = "data/IN/"
+
+# Tweet-datafile output directory
+ud = "data/OUT/"
+
+# Name of file that all senator data will be written to
+senCSV = "SenatorsTweets-OnlyCov.csv"
+
+# Names of the classification datafiles
+senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv"
+senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv"
+
+# don't change these
+senCSVPath = wd + ud + senCSV
+senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
+senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
+
+# make the local "funs" helper package importable
+import sys
+funs = wd + "funs"
+sys.path.insert(1, funs)
+import CleanTweets
+
+
+#%%
+# read dataframe from csv
+dfClassify = pd.read_csv(senCSVPath, dtype=object)
+
+# add classification flag column, default False
+dfClassify['fake'] = False
+
+
+#%%
+# Base model: https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
+# HowTo:
+# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
+# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
+# The pipeline loads the fine-tuned model and tokenizer from the local
+# checkpoint; the standalone tokenizer is only used for the input_ids preview.
+modelPath = "/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/"
+pipe = pipeline("text-classification", model=modelPath)
+tokenizer = AutoTokenizer.from_pretrained(modelPath)
+
+# Source: https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
+dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
+
+#%%
+# remove empty rows
+dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
+dfClassify.dropna(subset=['cleanContent'], inplace=True)
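+
+#%%
+# Optional diagnostic (a minimal sketch, not required by the pipeline):
+# max_length=128 below silently truncates longer tweets, so it is worth
+# knowing how many rows are affected before tokenizing.
+tokenLens = dfClassify['cleanContent'].apply(lambda x: len(tokenizer(x)['input_ids']))
+print(f"{(tokenLens > 128).sum()} of {len(tokenLens)} tweets exceed 128 tokens")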
+
+#%%
+timeStart = datetime.now()  # start counting execution time
+
+max_length = 128
+dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length")['input_ids'])
+
+# %%
+# write the cleaned tweets out so they can be streamed back in as a dataset
+dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
+
+#%%
+dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
+
+# %%
+# from tqdm.auto import tqdm
+# for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
+#     print(out)
+
+#%%
+output_labels = []
+output_score = []
+for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
+    # out is a dict like {'label': ..., 'score': ...}; same output as
+    # single calls, but the inputs are passed to the model in batches of 8
+    output_labels.append(out['label'])
+    output_score.append(out['score'])
+
+# %%
+dfClassify['output_label'] = output_labels
+dfClassify['output_score'] = output_score
+
+timeEnd = datetime.now()
+timeTotal = timeEnd - timeStart
+timePerTweet = timeTotal / len(dfClassify)
+
+print(f"Total classification execution time: {timeTotal}")
+print(f"Time per tweet classification: {timePerTweet}")
+
+# %%
+dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
+
+# %%
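+
+#%%
+# Plausibility check (a minimal sketch using only columns created above):
+# label distribution and mean confidence of the topic classification run.
+print(dfClassify['output_label'].value_counts())
+print(f"mean score: {dfClassify['output_score'].mean():.3f}")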