cleans and renames files

This commit is contained in:
Michael Beck 2023-08-30 21:18:55 +02:00
parent 4e08cde317
commit 1c6d9d5415
5 changed files with 29 additions and 282 deletions

View File

@ -1,113 +0,0 @@
import re
import string
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
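# Note: funs/CleanTweets.py is not shown in this commit. A minimal sketch of what
# these regex-based helpers might look like (illustrative only; the real
# implementations may differ):
#   def remove_URL(text): return re.sub(r"https?://\S+|www\.\S+", "", text)
#   def remove_emoji(text): return text.encode("ascii", "ignore").decode()
#   def remove_html(text): return re.sub(r"<.*?>", "", text)
#   def remove_punct(text): return text.translate(str.maketrans("", "", string.punctuation))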
#%%
# prepare & define paths
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv"
# Name of Classify datafile
senCSVClassifiedPrep = "Tweets-Classified-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Results.csv"
# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
#%%
# read dataframe from csv
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
# add 'fake' flag column, default False
dfClassify['fake'] = False
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
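# Note: the pipeline above loads its own copy of the model and tokenizer internally;
# the explicitly loaded tokenizer is what the pre-tokenization step below uses.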
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(remove_URL)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_emoji)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_html)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_punct)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(lambda x: x.lower())
#%%
# remove empty rows
dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
dfClassify.dropna(subset=['cleanContent'], inplace=True)
#%%
timeStart = datetime.now() # start counting execution time
max_length = 128
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
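# Quick sanity check (illustrative; note the call above does not truncate, so
# tweets longer than max_length yield longer input_ids):
exampleEnc = tokenizer("stay home stay safe", max_length=max_length, padding="max_length", truncation=True)
print(len(exampleEnc['input_ids']))  # -> 128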
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
# %%
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
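# load_dataset with a single CSV returns a DatasetDict with one "train" split,
# hence dataset['train'] below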
# %%
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
# print(out)
#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    output_labels.append(out['label'])
    output_score.append(out['score'])
# Example output format: [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
# Same output as calling the pipeline on single texts, but the contents are
# passed to the model in batches
# %%
dfClassify['output_label'] = output_labels
dfClassify['output_score'] = output_score
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
timePerTweet = timeTotal / len(dfClassify)  # average over all classified tweets
print(f"Total classification execution time: {timeTotal}")
print(f"Time per tweet classification: {timePerTweet}")
# %%
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
# %%
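# Optional follow-up (illustrative): re-read the saved results and inspect the
# distribution of predicted labels
dfResults = pd.read_csv(senCSVcClassificationResultPath, dtype=(object))
print(dfResults['output_label'].value_counts())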

View File

@ -1,129 +0,0 @@
import re
import string
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
#%%
# prepare
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"
# Name of new datafile generated
senCSVc = "Tweets-Stub.csv"
# Name of pretest files
preTestIDsFake = "pretest-tweets_fake.txt"
preTestIDsNot = "pretest-tweets_not_fake.txt"
# Name of pretest datafile
senCSVPretest = "Pretest.csv"
senCSVPretestPrep = "Pretest-Prep.csv"
senCSVPretestResult = "Pretest-Results.csv"
# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc
senCSVcPretestPath = wd + ud + senCSVPretest
senCSVcPretestPrepPath = wd + ud + senCSVPretestPrep
senCSVcPretestResultPath = wd + ud + senCSVPretestResult
preTestIDsFakePath = wd + di + preTestIDsFake
preTestIDsNotPath = wd + di + preTestIDsNot
# List of IDs to select
# Read the IDs from a file
preTestIDsFakeL = []
preTestIDsNotL = []
with open(preTestIDsFakePath, "r") as file:
    lines = file.readlines()
    for line in lines:
        tid = line.strip()  # remove the newline character
        preTestIDsFakeL.append(tid)
with open(preTestIDsNotPath, "r") as file:
    lines = file.readlines()
    for line in lines:
        tid = line.strip()  # remove the newline character
        preTestIDsNotL.append(tid)
# Select rows based on the IDs
df = pd.read_csv(senCSVPath, dtype=(object))
#%%
# Create pretest dataframe
dfPreTest = df[df['id'].isin(preTestIDsFakeL)].copy()
dfPreTest['fake'] = True
dfPreTest = pd.concat([dfPreTest, df[df['id'].isin(preTestIDsNotL)]], ignore_index=True)
dfPreTest['fake'] = dfPreTest['fake'].fillna(False)
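# Sanity check (illustrative): confirm both pretest classes are present
print(dfPreTest['fake'].value_counts())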
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_html)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_punct)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(lambda x: x.lower())
#%%
max_length = 128
dfPreTest['input_ids'] = dfPreTest['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
# %%
dfPreTest.to_csv(senCSVcPretestPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
dataset = load_dataset("csv", data_files=senCSVcPretestPrepPath)
# %%
results = pipe(KeyDataset(dataset['train'], "cleanContent"))  # single-pass call; the batched loop below is used instead
# %%
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
# print(out)
#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    output_labels.append(out['label'])
    output_score.append(out['score'])
# Example output format: [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
# Same output as calling the pipeline on single texts, but the contents are
# passed to the model in batches
# %%
dfPreTest['output_label'] = output_labels
dfPreTest['output_score'] = output_score
# %%
dfPreTest.to_csv(senCSVcPretestResultPath, encoding='utf-8')
# %%

View File

@ -18,44 +18,43 @@ socialdistancing
wear a mask
lockdown
covd
Coronavirus
Koronavirus
Corona
CDC
Wuhancoronavirus
Wuhanlockdown
Ncov
Wuhan
N95
Kungflu
Epidemic
coronavirus
koronavirus
corona
cdc
wuhancoronavirus
wuhanlockdown
ncov
wuhan
n95
kungflu
epidemic
outbreak
Sinophobia
China
sinophobia
covid-19
corona virus
covid
covid19
sars-cov-2
COVIDー19
COVD
covidー19
covd
pandemic
coronapocalypse
canceleverything
Coronials
SocialDistancingNow
Social Distancing
SocialDistancing
coronials
socialdistancingnow
social distancing
socialdistancing
panicbuy
panic buy
panicbuying
panic buying
14DayQuarantine
DuringMy14DayQuarantine
14dayquarantine
duringmy14dayquarantine
panic shop
panic shopping
panicshop
InMyQuarantineSurvivalKit
inmyquarantinesurvivalkit
panic-buy
panic-shop
coronakindness
@ -65,7 +64,7 @@ chinesevirus
stayhomechallenge
stay home challenge
sflockdown
DontBeASpreader
dontbeaspreader
lockdown
lock down
shelteringinplace
@ -79,13 +78,13 @@ flatten the curve
china virus
chinavirus
quarentinelife
PPEshortage
ppeshortage
saferathome
stayathome
stay at home
stay home
stayhome
GetMePPE
getmeppe
covidiot
epitwitter
pandemie
@ -93,7 +92,7 @@ wear a mask
wearamask
kung flu
covididiot
COVID__19
covid__19
omicron
variant
vaccine
@ -139,9 +138,7 @@ work from home
workfromhome
working from home
workingfromhome
ppe
n95
ppe
n95
covidiots
covidiots

View File

@ -15,10 +15,8 @@ from sklearn.model_selection import train_test_split # pip install scikit-learn
import pandas as pd
## Follow these two guides:
# best one https://mccormickml.com/2019/07/22/BERT-fine-tuning/
# https://xiangyutang2.github.io/tweet-classification/
# https://medium.com/mlearning-ai/fine-tuning-bert-for-tweets-classification-ft-hugging-face-8afebadd5dbf
## Uses snippets from this guide:
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
###################
# Setup directories

View File

@ -15,10 +15,8 @@ from sklearn.model_selection import train_test_split # pip install scikit-learn
import pandas as pd
## Follow these two guides:
# best one https://mccormickml.com/2019/07/22/BERT-fine-tuning/
# https://xiangyutang2.github.io/tweet-classification/
# https://medium.com/mlearning-ai/fine-tuning-bert-for-tweets-classification-ft-hugging-face-8afebadd5dbf
## Uses snippets from this guide:
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
###################
# Setup directories
@ -65,11 +63,7 @@ seed = 12355
modCovClassPath = wd + "models/CovClass/"
modFakeClassPath = wd + "models/FakeClass/"
model_name = 'digitalepidemiologylab/covid-twitter-bert-v2' # accuracy 69
#model_name = 'justinqbui/bertweet-covid19-base-uncased-pretraining-covid-vaccine-tweets' #48
#model_name = "cardiffnlp/tweet-topic-latest-multi"
model_name = "bvrau/covid-twitter-bert-v2-struth"
#model_name = "cardiffnlp/roberta-base-tweet-topic-single-all"
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
# More models for fake detection: