From b89b5969ec773af4f5a709b42e47569700eb50c7 Mon Sep 17 00:00:00 2001 From: Michael Beck Date: Tue, 15 Aug 2023 14:19:33 +0200 Subject: [PATCH] adds TypeError controls --- funs/CleanTweets.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/funs/CleanTweets.py b/funs/CleanTweets.py index 18ee6c1..9aa0f3d 100644 --- a/funs/CleanTweets.py +++ b/funs/CleanTweets.py @@ -1,8 +1,19 @@ import re import string +def preprocess_roberta(text): # https://huggingface.co/cardiffnlp/twitter-roberta-base-sep2022 + preprocessed_text = [] + for t in text.split(): + if len(t) > 1: + t = '@user' if t[0] == '@' and t.count('@') == 1 else t + t = 'http' if t.startswith('http') else t + preprocessed_text.append(t) + return ' '.join(preprocessed_text) + def remove_URL(text): - url = re.compile(r'https?://\S+|www\.\S+') + try: + url = re.compile(r'https?://\S+|www\.\S+') + except: print(text) return url.sub(r'', text) def remove_emoji(text):