import re
import string

# Matches http(s) links and bare "www." links up to the next whitespace.
# Compiled once at import time because the pattern is a constant.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def preprocess_roberta(text):
    """Normalise user mentions and links in a tweet.

    Token handling follows the preprocessing published with the model at
    https://huggingface.co/cardiffnlp/twitter-roberta-base-sep2022

    The text is split on whitespace. For every token longer than one
    character:

    * a token that starts with ``@`` and contains exactly one ``@`` is
      replaced by ``@user``;
    * a token that starts with ``http`` is replaced by ``http``.

    One-character tokens are kept unchanged. Tokens are re-joined with a
    single space, so runs of whitespace in the input are collapsed.

    Args:
        text: The raw tweet text.

    Returns:
        The tweet with mentions and links replaced by placeholders.
    """
    preprocessed_text = []
    for t in text.split():
        # A lone "@" (or any other single character) is left as it is.
        if len(t) > 1:
            if t[0] == '@' and t.count('@') == 1:
                t = '@user'
            if t.startswith('http'):
                t = 'http'
        preprocessed_text.append(t)
    return ' '.join(preprocessed_text)


def remove_URL(text):
    """Strip ``http(s)://...`` and ``www....`` links from *text*.

    Only the link itself is removed; the whitespace around it is kept.

    Args:
        text: The string to clean.

    Returns:
        *text* with every matched link replaced by an empty string.

    Raises:
        TypeError: If *text* is not a string (raised by ``re``).
    """
    # No try/except here: compiling a constant pattern cannot fail, and a
    # bare ``except`` would only have hidden the error and left the pattern
    # variable unbound for the substitution below.
    return _URL_PATTERN.sub(r'', text)