cleans and renames files

This commit is contained in:
Michael Beck 2023-08-30 21:18:55 +02:00
parent 4e08cde317
commit 1c6d9d5415
5 changed files with 29 additions and 282 deletions

View File

@@ -1,113 +0,0 @@
import re
import string
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
#%%
# prepare & define paths
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv"
# Names of the classification datafiles
senCSVClassifiedPrep = "Tweets-Classified-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Results.csv"
# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
#%%
# get dataframe
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
# dataframe from csv
dfClassify['fake'] = False
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
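#%%
# Minimal sanity check (the example text is hypothetical): a single pipeline call
# returns a list with one dict holding 'label' and 'score' keys, the same shape
# the batched loop further down relies on.
print(pipe("stay home and wear a mask"))  # e.g. [{'label': ..., 'score': ...}]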
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(remove_URL)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_emoji)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_html)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_punct)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(lambda x: x.lower())
#%%
# remove empty rows
dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
dfClassify.dropna(subset=['cleanContent'], inplace=True)
#%%
timeStart = datetime.now() # start counting execution time
max_length = 128
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
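# Note: the text-classification pipeline below tokenizes internally, so this
# 'input_ids' column is mainly useful for inspecting padding/truncation.
# Illustrative check (hypothetical example text): with padding="max_length",
# every encoded tweet comes back as exactly max_length token ids.
example_ids = tokenizer("stay home", max_length=max_length, padding="max_length")['input_ids']
print(len(example_ids))  # == max_length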
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
# %%
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
# %%
#from datetime import datetime
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
# print(out)
#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    output_labels.append(out['label'])
    output_score.append(out['score'])
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
# Exactly the same output as before, but the tweets are passed
# to the model in batches
# %%
dfClassify['output_label'] = output_labels
dfClassify['output_score'] = output_score
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
timePerTweet = timeTotal / 96
print(f"Total classification execution time: {timeTotal} seconds")
print(f"Time per tweet classification: {timePerTweet}")
# %%
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
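#%%
# Minimal follow-up sketch: read the classified tweets back in and inspect how the
# predicted labels are distributed.
dfResults = pd.read_csv(senCSVcClassificationResultPath, dtype=(object))
print(dfResults['output_label'].value_counts())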
# %%

View File

@@ -1,129 +0,0 @@
import re
import string
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
#%%
# prepare
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"
# Name of new datafile generated
senCSVc = "Tweets-Stub.csv"
# Names of pretest ID files
preTestIDsFake = "pretest-tweets_fake.txt"
preTestIDsNot = "pretest-tweets_not_fake.txt"
# Names of pretest datafiles
senCSVPretest = "Pretest.csv"
senCSVPretestPrep = "Pretest-Prep.csv"
senCSVPretestResult = "Pretest-Results.csv"
# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc
senCSVcPretestPath = wd + ud + senCSVPretest
senCSVcPretestPrepPath = wd + ud + senCSVPretestPrep
senCSVcPretestResultPath = wd + ud + senCSVPretestResult
preTestIDsFakePath = wd + di + preTestIDsFake
preTestIDsNotPath = wd + di + preTestIDsNot
# List of IDs to select
# Read the IDs from a file
preTestIDsFakeL = []
preTestIDsNotL = []
with open(preTestIDsFakePath, "r") as file:
    lines = file.readlines()
    for line in lines:
        tid = line.strip()  # Remove the newline character
        preTestIDsFakeL.append(tid)
with open(preTestIDsNotPath, "r") as file:
    lines = file.readlines()
    for line in lines:
        tid = line.strip()  # Remove the newline character
        preTestIDsNotL.append(tid)
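# A hypothetical, more compact equivalent of the two loops above (one ID per line):
def read_id_list(path):
    with open(path, "r") as f:
        return [line.strip() for line in f]
# preTestIDsFakeL = read_id_list(preTestIDsFakePath)
# preTestIDsNotL = read_id_list(preTestIDsNotPath)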
# Select rows based on the IDs
df = pd.read_csv(senCSVPath, dtype=(object))
#%%
# Create pretest dataframe
dfPreTest = df[df['id'].isin(preTestIDsFakeL)].copy()
dfPreTest['fake'] = True
dfPreTest = pd.concat([dfPreTest, df[df['id'].isin(preTestIDsNotL)]], ignore_index=True)
dfPreTest['fake'] = dfPreTest['fake'].fillna(False)
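#%%
# Quick sanity check: the pretest frame should now hold both groups, with 'fake'
# True for the fake-news IDs and False for the rest.
print(dfPreTest['fake'].value_counts())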
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_html)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_punct)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(lambda x: x.lower())
#%%
max_length = 128
dfPreTest['input_ids'] = dfPreTest['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
# %%
dfPreTest.to_csv(senCSVcPretestPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
dataset = load_dataset("csv", data_files=senCSVcPretestPrepPath)
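# load_dataset("csv", ...) returns a DatasetDict with a single "train" split, and
# the column names come from the CSV header, so the tweet text is under
# "cleanContent"; this is why the loop below indexes dataset['train'] and keys on
# "cleanContent".
print(dataset)                        # DatasetDict({'train': Dataset(...)})
print(dataset['train'].column_names)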
# %%
# results = pipe(KeyDataset(dataset, "text"))  # superseded: dataset is a DatasetDict and the text column is "cleanContent"; the batched loop below handles classification
# %%
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
# print(out)
#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    output_labels.append(out['label'])
    output_score.append(out['score'])
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
# Exactly the same output as before, but the tweets are passed
# to the model in batches
# %%
dfPreTest['output_label'] = output_labels
dfPreTest['output_score'] = output_score
# %%
dfPreTest.to_csv(senCSVcPretestResultPath, encoding='utf-8')
# %%

View File

@@ -18,44 +18,43 @@ socialdistancing
wear a mask wear a mask
lockdown lockdown
covd covd
Coronavirus coronavirus
Koronavirus koronavirus
Corona corona
CDC cdc
Wuhancoronavirus wuhancoronavirus
Wuhanlockdown wuhanlockdown
Ncov ncov
Wuhan wuhan
N95 n95
Kungflu kungflu
Epidemic epidemic
outbreak outbreak
Sinophobia sinophobia
China
covid-19 covid-19
corona virus corona virus
covid covid
covid19 covid19
sars-cov-2 sars-cov-2
COVIDー19 covidー19
COVD covd
pandemic pandemic
coronapocalypse coronapocalypse
canceleverything canceleverything
Coronials coronials
SocialDistancingNow socialdistancingnow
Social Distancing social distancing
SocialDistancing socialdistancing
panicbuy panicbuy
panic buy panic buy
panicbuying panicbuying
panic buying panic buying
14DayQuarantine 14dayquarantine
DuringMy14DayQuarantine duringmy14dayquarantine
panic shop panic shop
panic shopping panic shopping
panicshop panicshop
InMyQuarantineSurvivalKit inmyquarantinesurvivalkit
panic-buy panic-buy
panic-shop panic-shop
coronakindness coronakindness
@@ -65,7 +64,7 @@ chinesevirus
stayhomechallenge stayhomechallenge
stay home challenge stay home challenge
sflockdown sflockdown
DontBeASpreader dontbeaspreader
lockdown lockdown
lock down lock down
shelteringinplace shelteringinplace
@@ -79,13 +78,13 @@ flatten the curve
china virus china virus
chinavirus chinavirus
quarentinelife quarentinelife
PPEshortage ppeshortage
saferathome saferathome
stayathome stayathome
stay at home stay at home
stay home stay home
stayhome stayhome
GetMePPE getmeppe
covidiot covidiot
epitwitter epitwitter
pandemie pandemie
@@ -93,7 +92,7 @@ wear a mask
wearamask wearamask
kung flu kung flu
covididiot covididiot
COVID__19 covid__19
omicron omicron
variant variant
vaccine vaccine
@@ -139,9 +138,7 @@ work from home
workfromhome workfromhome
working from home working from home
workingfromhome workingfromhome
ppe
n95 n95
ppe
n95 n95
covidiots covidiots
covidiots covidiots

View File

@@ -15,10 +15,8 @@ from sklearn.model_selection import train_test_split # pip install scikit-learn
import pandas as pd import pandas as pd
## Follow these two guides: ## Uses snippets from this guide:
# best one https://mccormickml.com/2019/07/22/BERT-fine-tuning/ # https://mccormickml.com/2019/07/22/BERT-fine-tuning/
# https://xiangyutang2.github.io/tweet-classification/
# https://medium.com/mlearning-ai/fine-tuning-bert-for-tweets-classification-ft-hugging-face-8afebadd5dbf
################### ###################
# Setup directories # Setup directories

View File

@@ -15,10 +15,8 @@ from sklearn.model_selection import train_test_split # pip install scikit-learn
import pandas as pd import pandas as pd
## Follow these two guides: ## Uses snippets from this guide:
# best one https://mccormickml.com/2019/07/22/BERT-fine-tuning/ # https://mccormickml.com/2019/07/22/BERT-fine-tuning/
# https://xiangyutang2.github.io/tweet-classification/
# https://medium.com/mlearning-ai/fine-tuning-bert-for-tweets-classification-ft-hugging-face-8afebadd5dbf
################### ###################
# Setup directories # Setup directories
@@ -65,11 +63,7 @@ seed = 12355
modCovClassPath = wd + "models/CovClass/" modCovClassPath = wd + "models/CovClass/"
modFakeClassPath = wd + "models/FakeClass/" modFakeClassPath = wd + "models/FakeClass/"
model_name = 'digitalepidemiologylab/covid-twitter-bert-v2' # accuracy 69
#model_name = 'justinqbui/bertweet-covid19-base-uncased-pretraining-covid-vaccine-tweets' #48
#model_name = "cardiffnlp/tweet-topic-latest-multi"
model_name = "bvrau/covid-twitter-bert-v2-struth" model_name = "bvrau/covid-twitter-bert-v2-struth"
#model_name = "cardiffnlp/roberta-base-tweet-topic-single-all"
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth' model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
# More models for fake detection: # More models for fake detection: