Adds both classification scripts. Corrects inclusion of CleanTweets functions.
116  ClassificationFake.py  Normal file

@@ -0,0 +1,116 @@
import re
import string
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct

#%%
# prepare & define paths
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv"

# Names of the classification datafiles
senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv"

# don't change these
senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult

import sys
funs = wd + "funs"
sys.path.insert(1, funs)
import CleanTweets
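
# A minimal sketch of what CleanTweets.preprocess_text is assumed to do,
# chaining the helpers imported at the top of this file (assumption -- the
# actual implementation lives in funs/CleanTweets.py):
#
#   def preprocess_text(text):
#       text = remove_URL(text)
#       text = remove_emoji(text)
#       text = remove_html(text)
#       text = remove_punct(text)
#       return text.lower().strip()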

#%%
# get dataframe
dfClassify = pd.read_csv(senCSVPath, dtype=object)

# dataframe from csv
dfClassify['fake'] = False


#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
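
# Quick sanity check before running the full dataset -- a single call on a toy
# string. The exact label names are an assumption about this model card, and
# the output below is illustrative only:
#
#   print(pipe("Vaccines contain microchips."))
#   # e.g. [{'label': 'fake', 'score': 0.99}]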

# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert

dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)


#%%
# remove empty rows
dfClassify.cleanContent.replace('', np.nan, inplace=True)
dfClassify.dropna(subset=['cleanContent'], inplace=True)

#%%
timeStart = datetime.now()  # start counting execution time

max_length = 128
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length")['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()

# %%
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
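
# Note: by default to_csv also writes the dataframe index as an extra, unnamed
# first column; passing index=False would keep the prep file to just the 'id'
# and 'cleanContent' columns (left as-is here to preserve behaviour).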

#%%
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)

# %%

#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
#    print(out)

#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    output_labels.append(out['label'])
    output_score.append(out['score'])
    # e.g. {'label': 'POSITIVE', 'score': 0.9998743534088135}
    # Exactly the same output as before, but the contents are passed
    # as batches to the model
# %%
dfClassify['output_label'] = output_labels
dfClassify['output_score'] = output_score

timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
timePerTweet = timeTotal / len(dfClassify)  # average time per classified tweet

print(f"Total classification execution time: {timeTotal}")
print(f"Time per tweet classification: {timePerTweet}")

# %%
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')

# %%
115  ClassificationTopic.py  Normal file

@@ -0,0 +1,115 @@
import re
import string
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct

#%%
# prepare & define paths
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv"

# Names of the classification datafiles
senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv"

# don't change these
senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult

import sys
funs = wd + "funs"
sys.path.insert(1, funs)
import CleanTweets

#%%
# get dataframe
dfClassify = pd.read_csv(senCSVPath, dtype=object)

# dataframe from csv
dfClassify['fake'] = False


#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
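
# Here the model is a local fine-tuned checkpoint rather than a Hub model; a
# quick smoke test (assumption: the directory contains the config, weights and
# tokenizer files written by save_pretrained):
#
#   print(pipe("Example tweet about covid vaccination."))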

# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert

dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)

#%%
# remove empty rows
dfClassify.cleanContent.replace('', np.nan, inplace=True)
dfClassify.dropna(subset=['cleanContent'], inplace=True)

#%%
timeStart = datetime.now()  # start counting execution time

max_length = 128
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length")['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()

# %%
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])

#%%
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)

# %%

#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
#    print(out)

#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    output_labels.append(out['label'])
    output_score.append(out['score'])
    # e.g. {'label': 'POSITIVE', 'score': 0.9998743534088135}
    # Exactly the same output as before, but the contents are passed
    # as batches to the model
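
# batch_size=8 is a conservative default; on a GPU a larger batch size
# (e.g. 32) is often noticeably faster, memory permitting (assumption --
# tune to the available hardware).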
# %%
dfClassify['output_label'] = output_labels
dfClassify['output_score'] = output_score

timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
timePerTweet = timeTotal / len(dfClassify)  # average time per classified tweet

print(f"Total classification execution time: {timeTotal}")
print(f"Time per tweet classification: {timePerTweet}")

# %%
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')

# %%