Adds both classification scripts. Corrects inclusion of the CleanTweets functions.

This commit is contained in:
Michael Beck 2023-08-15 14:23:56 +02:00
parent 7a16526a97
commit 2e067b6a64
2 changed files with 231 additions and 0 deletions

ClassificationFake.py Normal file

@@ -0,0 +1,116 @@
import sys
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
#%%
# prepare & define paths
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv"
# Names of the classification datafiles (Fake, not Topic: this script runs the fake-news classifier)
senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"
# don't change these
senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
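# note: plain string concatenation works here because wd and ud keep their
# trailing slashes; os.path.join(wd, ud, senCSV) would be the more robust choice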
# make the local funs directory importable, then import the cleaning module
funs = wd + "funs"
sys.path.insert(1, funs)
import CleanTweets
#%%
# read the tweet datafile into a dataframe
dfClassify = pd.read_csv(senCSVPath, dtype=object)
# default: every tweet starts as not-fake until classified
dfClassify['fake'] = False
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
# the pipeline loads model and tokenizer itself; the standalone tokenizer below
# is only needed to precompute input_ids
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
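# CleanTweets.preprocess_text lives in funs/CleanTweets.py (not part of this
# commit); presumably it chains the module's cleaning helpers, roughly:
#   def preprocess_text(text):
#       return remove_punct(remove_html(remove_emoji(remove_URL(text)))).lower().strip()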
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
#%%
# drop rows whose cleaned text is empty
dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
dfClassify.dropna(subset=['cleanContent'], inplace=True)
#%%
timeStart = datetime.now() # start counting execution time
max_length = 128
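# note: the pipeline tokenizes internally, so these input_ids are not fed to the
# model below; they are kept only for inspection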
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length", truncation=True)['input_ids'])
# %%
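# write id + cleaned text to a prep file so load_dataset can stream it back in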
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
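# a single CSV passed to load_dataset lands in the default "train" split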
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
# %%
# optional: wrap the pipeline in tqdm for a progress bar
# from tqdm.auto import tqdm
# for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
#     print(out)
#%%
output_labels = []
output_score = []
# stream the tweets through the pipeline in batches of 8
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    # each result looks like {'label': 'POSITIVE', 'score': 0.9998743534088135}
    output_labels.append(out['label'])
    output_score.append(out['score'])
# exactly the same output as the commented loop above, but the tweets are passed
# to the model in batches
# %%
dfClassify['output_label'] = output_labels
dfClassify['output_score'] = output_score
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
timePerTweet = timeTotal / len(dfClassify)  # average over all classified tweets
print(f"Total classification execution time: {timeTotal}")
print(f"Time per tweet classification: {timePerTweet}")
# %%
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
# %%

ClassificationTopic.py Normal file

@@ -0,0 +1,115 @@
import sys
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
#%%
# prepare & define paths
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv"
# Names of the classification datafiles
senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv"
# don't change these
senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
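# as in ClassificationFake.py, concatenation relies on the trailing slashes in wd and ud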
# make the local funs directory importable, then import the cleaning module
funs = wd + "funs"
sys.path.insert(1, funs)
import CleanTweets
#%%
# read the tweet datafile into a dataframe
dfClassify = pd.read_csv(senCSVPath, dtype=object)
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
# unlike ClassificationFake.py, this script loads a local model checkpoint,
# presumably the topic classifier fine-tuned on 2023-08-15
modelPath = "/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/"
pipe = pipeline("text-classification", model=modelPath)
tokenizer = AutoTokenizer.from_pretrained(modelPath)
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
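# text cleaning is identical to ClassificationFake.py (see the note on
# CleanTweets.preprocess_text there)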
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
#%%
# drop rows whose cleaned text is empty
dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
dfClassify.dropna(subset=['cleanContent'], inplace=True)
#%%
timeStart = datetime.now() # start counting execution time
max_length = 128
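# as above, the pipeline tokenizes internally; these input_ids are kept for inspection only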
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length", truncation=True)['input_ids'])
# %%
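# write id + cleaned text to a prep file for load_dataset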
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
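# the single CSV again lands in the default "train" split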
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
# %%
# optional: wrap the pipeline in tqdm for a progress bar
# from tqdm.auto import tqdm
# for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
#     print(out)
#%%
output_labels = []
output_score = []
# stream the tweets through the pipeline in batches of 8
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    # each result looks like {'label': 'POSITIVE', 'score': 0.9998743534088135}
    output_labels.append(out['label'])
    output_score.append(out['score'])
# exactly the same output as the commented loop above, but the tweets are passed
# to the model in batches
# %%
dfClassify['output_label'] = output_labels
dfClassify['output_score'] = output_score
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
timePerTweet = timeTotal / len(dfClassify)  # average over all classified tweets
print(f"Total classification execution time: {timeTotal}")
print(f"Time per tweet classification: {timePerTweet}")
# %%
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
# %%