cleans and renames files
This commit is contained in: parent 4e08cde317, commit 1c6d9d5415

@@ -1,113 +0,0 @@
import re
import string
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct

#%%
# prepare & define paths
# install xformers (pip install xformers) for better performance

###################
# Setup directories

# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"

# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv"

# Names of the classification datafiles
senCSVClassifiedPrep = "Tweets-Classified-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Results.csv"

# don't change these
senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult

#%%
# get dataframe from csv
dfClassify = pd.read_csv(senCSVPath, dtype=(object))

# add 'fake' flag column, default False
dfClassify['fake'] = False

#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")

# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert

dfClassify['cleanContent'] = dfClassify['rawContent'].apply(remove_URL)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_emoji)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_html)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_punct)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(lambda x: x.lower())
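
# The funs.CleanTweets helpers imported above are not shown in this commit.
# A minimal sketch of what they are assumed to look like (simple regex/str
# versions, in the spirit of the linked Kaggle notebook); the names carry a
# _sketch suffix so they do not shadow the real imports, which may differ:
def remove_URL_sketch(text):
    return re.sub(r"https?://\S+|www\.\S+", "", text)

def remove_emoji_sketch(text):
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub("", text)

def remove_html_sketch(text):
    return re.sub(r"<.*?>", "", text)

def remove_punct_sketch(text):
    return text.translate(str.maketrans("", "", string.punctuation))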

#%%
# remove empty rows
dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
dfClassify.dropna(subset=['cleanContent'], inplace=True)

#%%
timeStart = datetime.now() # start counting execution time

max_length = 128
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length")['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()

# %%
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])

#%%
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
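
# load_dataset("csv", data_files=...) returns a DatasetDict with a single default
# "train" split, which is why dataset['train'] is indexed below. Illustrative check:
print(dataset)  # e.g. DatasetDict({'train': Dataset({features: ['id', 'cleanContent', ...], num_rows: ...})})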

# %%
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
#    print(out)

#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    output_labels.append(out['label'])
    output_score.append(out['score'])
# each out is a dict, e.g. {'label': 'POSITIVE', 'score': 0.9998743534088135}
# Exactly the same output as an unbatched call, but the contents are passed
# to the model in batches

# %%
dfClassify['output_label'] = output_labels
dfClassify['output_score'] = output_score

timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
timePerTweet = timeTotal / len(dfClassify)  # average time per classified tweet

print(f"Total classification execution time: {timeTotal}")
print(f"Time per tweet classification: {timePerTweet}")

# %%
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')

# %%
analyze.py (129 lines)
@@ -1,129 +0,0 @@
import re
import string
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct

#%%
# prepare
# install xformers (pip install xformers) for better performance

###################
# Setup directories

# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"

# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"

# Name of new datafile generated
senCSVc = "Tweets-Stub.csv"

# Name of pretest files
preTestIDsFake = "pretest-tweets_fake.txt"
preTestIDsNot = "pretest-tweets_not_fake.txt"

# Names of the pretest datafiles
senCSVPretest = "Pretest.csv"
senCSVPretestPrep = "Pretest-Prep.csv"
senCSVPretestResult = "Pretest-Results.csv"

# don't change these
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc
senCSVcPretestPath = wd + ud + senCSVPretest
senCSVcPretestPrepPath = wd + ud + senCSVPretestPrep
senCSVcPretestResultPath = wd + ud + senCSVPretestResult
preTestIDsFakePath = wd + di + preTestIDsFake
preTestIDsNotPath = wd + di + preTestIDsNot

# List of IDs to select
# Read the IDs from a file
preTestIDsFakeL = []
preTestIDsNotL = []
with open(preTestIDsFakePath, "r") as file:
    lines = file.readlines()
    for line in lines:
        tid = line.strip()  # Remove the newline character
        preTestIDsFakeL.append(tid)
with open(preTestIDsNotPath, "r") as file:
    lines = file.readlines()
    for line in lines:
        tid = line.strip()  # Remove the newline character
        preTestIDsNotL.append(tid)

# Select rows based on the IDs
df = pd.read_csv(senCSVPath, dtype=(object))
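
# Illustrative alternative (not in the original script): both read blocks above
# can be expressed through one helper. It assumes the pretest files hold one
# tweet ID per line; the example ID below is made up.
def read_ids(path):
    # a line is expected to look like "1356288239503462402\n"
    with open(path, "r") as file:
        return [line.strip() for line in file if line.strip()]

# read_ids(preTestIDsFakePath) and read_ids(preTestIDsNotPath) would reproduce
# preTestIDsFakeL and preTestIDsNotL from above.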

#%%
# Create pretest dataframe
dfPreTest = df[df['id'].isin(preTestIDsFakeL)].copy()
dfPreTest['fake'] = True
dfPreTest = pd.concat([dfPreTest, df[df['id'].isin(preTestIDsNotL)]], ignore_index=True)
dfPreTest['fake'] = dfPreTest['fake'].fillna(False)
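
# Quick illustrative sanity check that both pretest groups were picked up:
print(dfPreTest['fake'].value_counts())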

#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")

# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert

dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_html)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_punct)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(lambda x: x.lower())

#%%
max_length = 128
dfPreTest['input_ids'] = dfPreTest['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length")['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()

# %%
dfPreTest.to_csv(senCSVcPretestPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])

#%%
dataset = load_dataset("csv", data_files=senCSVcPretestPrepPath)

# %%
results = pipe(KeyDataset(dataset['train'], "cleanContent"))  # lazy generator; unused here, the batched loop below does the actual classification

# %%
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
#    print(out)

#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    output_labels.append(out['label'])
    output_score.append(out['score'])
# each out is a dict, e.g. {'label': 'POSITIVE', 'score': 0.9998743534088135}
# Exactly the same output as an unbatched call, but the contents are passed
# to the model in batches

# %%
dfPreTest['output_label'] = output_labels
dfPreTest['output_score'] = output_score

# %%
dfPreTest.to_csv(senCSVcPretestResultPath, encoding='utf-8')

# %%
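
# Since dfPreTest carries a ground-truth 'fake' column, a rough pretest accuracy
# could be computed. Sketch only: it assumes the pipeline emits 'fake'/'real'
# labels, which should be checked against the model card first.
dfPreTest['pred_fake'] = dfPreTest['output_label'].str.lower().eq('fake')
print(f"Pretest accuracy: {(dfPreTest['pred_fake'] == dfPreTest['fake']).mean():.2%}")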
@@ -18,44 +18,43 @@ socialdistancing
wear a mask
lockdown
covd
Coronavirus
Koronavirus
Corona
CDC
Wuhancoronavirus
Wuhanlockdown
Ncov
Wuhan
N95
Kungflu
Epidemic
coronavirus
koronavirus
corona
cdc
wuhancoronavirus
wuhanlockdown
ncov
wuhan
n95
kungflu
epidemic
outbreak
Sinophobia
China
sinophobia
covid-19
corona virus
covid
covid19
sars-cov-2
COVIDー19
COVD
covidー19
covd
pandemic
coronapocalypse
canceleverything
Coronials
SocialDistancingNow
Social Distancing
SocialDistancing
coronials
socialdistancingnow
social distancing
socialdistancing
panicbuy
panic buy
panicbuying
panic buying
14DayQuarantine
DuringMy14DayQuarantine
14dayquarantine
duringmy14dayquarantine
panic shop
panic shopping
panicshop
InMyQuarantineSurvivalKit
inmyquarantinesurvivalkit
panic-buy
panic-shop
coronakindness
@@ -65,7 +64,7 @@ chinesevirus
stayhomechallenge
stay home challenge
sflockdown
DontBeASpreader
dontbeaspreader
lockdown
lock down
shelteringinplace
@@ -79,13 +78,13 @@ flatten the curve
china virus
chinavirus
quarentinelife
PPEshortage
ppeshortage
saferathome
stayathome
stay at home
stay home
stayhome
GetMePPE
getmeppe
covidiot
epitwitter
pandemie
@@ -93,7 +92,7 @@ wear a mask
wearamask
kung flu
covididiot
COVID__19
covid__19
omicron
variant
vaccine
@@ -139,9 +138,7 @@ work from home
workfromhome
working from home
workingfromhome
ppe
n95
ppe
n95
covidiots
covidiots

@@ -15,10 +15,8 @@ from sklearn.model_selection import train_test_split # pip install scikit-learn

import pandas as pd

## Follow these two guides:
# best one https://mccormickml.com/2019/07/22/BERT-fine-tuning/
# https://xiangyutang2.github.io/tweet-classification/
# https://medium.com/mlearning-ai/fine-tuning-bert-for-tweets-classification-ft-hugging-face-8afebadd5dbf
## Uses snippets from this guide:
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/

###################
# Setup directories

@@ -15,10 +15,8 @@ from sklearn.model_selection import train_test_split # pip install scikit-learn

import pandas as pd

## Follow these two guides:
# best one https://mccormickml.com/2019/07/22/BERT-fine-tuning/
# https://xiangyutang2.github.io/tweet-classification/
# https://medium.com/mlearning-ai/fine-tuning-bert-for-tweets-classification-ft-hugging-face-8afebadd5dbf
## Uses snippets from this guide:
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/

###################
# Setup directories

@@ -65,11 +63,7 @@ seed = 12355

modCovClassPath = wd + "models/CovClass/"
modFakeClassPath = wd + "models/FakeClass/"

model_name = 'digitalepidemiologylab/covid-twitter-bert-v2'  # accuracy 69
#model_name = 'justinqbui/bertweet-covid19-base-uncased-pretraining-covid-vaccine-tweets'  # 48
#model_name = "cardiffnlp/tweet-topic-latest-multi"
model_name = "bvrau/covid-twitter-bert-v2-struth"
#model_name = "cardiffnlp/roberta-base-tweet-topic-single-all"
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'

# More models for fake detection:
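
# Minimal sketch (an assumption, not part of this diff) of how the selected
# model_name is typically loaded for binary fine-tuning, following the BERT
# fine-tuning guides linked above; the trained model could then be saved to
# modCovClassPath / modFakeClassPath.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)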