Add CleanTweets functions and create graphs

2023-07-07 18:18:51 +02:00
parent 817ec48478
commit 899a99ba72
4 changed files with 201 additions and 42 deletions
--- a/analyze.py
+++ b/analyze.py
@@ -5,6 +5,7 @@ import pandas as pd
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 from datasets import load_dataset
 from transformers.pipelines.pt_utils import KeyDataset
+from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct


 #%%
@@ -82,33 +83,6 @@ model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-
 tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")

 # Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
-def remove_URL(text):
-    url = re.compile(r'https?://\S+|www\.\S+')
-    return url.sub(r'', text)
-
-
-def remove_emoji(text):
-    emoji_pattern = re.compile(
-        '['
-        u'\U0001F600-\U0001F64F'  # emoticons
-        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
-        u'\U0001F680-\U0001F6FF'  # transport & map symbols
-        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
-        u'\U00002702-\U000027B0'
-        u'\U000024C2-\U0001F251'
-        ']+',
-        flags=re.UNICODE)
-    return emoji_pattern.sub(r'', text)
-
-
-def remove_html(text):
-    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
-    return re.sub(html, '', text)
-
-
-def remove_punct(text):
-    table = str.maketrans('', '', string.punctuation)
-    return text.translate(table)

 dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
 dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)