import re
import string

# Patterns and tables are built once at import time so the per-text helpers
# below do not rebuild them on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_RE = re.compile(
    '['
    u'\U0001F600-\U0001F64F'  # emoticons
    u'\U0001F300-\U0001F5FF'  # symbols & pictographs
    u'\U0001F680-\U0001F6FF'  # transport & map symbols
    u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
    u'\U00002702-\U000027B0'
    # NOTE(review): this last range is very broad; besides emoji it also
    # covers e.g. the CJK ideograph blocks. Kept as-is to preserve behaviour.
    u'\U000024C2-\U0001F251'
    ']+',
    flags=re.UNICODE)
_HTML_RE = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
_AMPERSAND_RE = re.compile(r'&(?:amp;?)?')
_MULTI_SPACE_RE = re.compile(r'\s{2,}')
# \b keeps words that merely end in "rt" (e.g. "start @bob") intact.
_RETWEET_RE = re.compile(r'\b(RT|rt)[ ]*@[ ]*[\S]+')
_MENTION_RE = re.compile(r'@[\S]+')


def preprocess_roberta(text: str) -> str:
    """Mask user mentions and links the way the Twitter-RoBERTa models expect.

    Reference: https://huggingface.co/cardiffnlp/twitter-roberta-base-sep2022

    Whitespace-separated tokens longer than one character are rewritten:
    a token that starts with '@' and contains exactly one '@' becomes
    '@user', and a token that starts with 'http' becomes 'http'. All tokens
    are re-joined with single spaces.
    """
    preprocessed_text = []
    for t in text.split():
        if len(t) > 1:
            t = '@user' if t[0] == '@' and t.count('@') == 1 else t
            t = 'http' if t.startswith('http') else t
        preprocessed_text.append(t)
    return ' '.join(preprocessed_text)


def remove_URL(text: str) -> str:
    """Delete http(s):// and www. links (up to the next whitespace)."""
    return _URL_RE.sub(r'', text)


def remove_emoji(text: str) -> str:
    """Delete every character that falls in the emoji ranges of _EMOJI_RE."""
    return _EMOJI_RE.sub(r'', text)


def remove_html(text: str) -> str:
    """Delete HTML tags and named / numeric character entities."""
    return _HTML_RE.sub('', text)


def remove_punct(text: str) -> str:
    """Delete every character listed in ``string.punctuation``."""
    return text.translate(_PUNCT_TABLE)


def remove_nonascii(text: str) -> str:
    """Delete every character outside the 7-bit ASCII range."""
    return _NON_ASCII_RE.sub('', text)


def remove_spec(text: str) -> str:
    """Spell out ampersands as 'and' and decode escaped angle brackets.

    The previous pattern ``&?`` also matched the empty string, which
    inserted 'and' between every pair of characters.
    """
    # Decode the bracket entities first so the bare-ampersand rule below
    # does not consume their leading '&'.
    text = re.sub(r'&lt;', r'<', text)
    text = re.sub(r'&gt;', r'>', text)
    return _AMPERSAND_RE.sub(r'and', text)


def remove_spaces(text: str) -> str:
    """Normalise whitespace and case.

    Line breaks become single spaces, the text is lower-cased and stripped,
    and any run of two or more whitespace characters collapses to one space.
    """
    text = " ".join(text.splitlines())  # remove newline characters
    text = text.lower()
    text = text.strip()
    return _MULTI_SPACE_RE.sub(' ', text)


def remove_retw(text: str) -> str:
    """Delete retweet markers ('RT @name') and any remaining '@' mentions."""
    text = _RETWEET_RE.sub('', text)
    return _MENTION_RE.sub('', text)


# Order matters:
#   * remove_retw and remove_spec must run before remove_punct, because
#     remove_punct deletes the '@', '&', '<' and '>' they look for;
#   * remove_spaces runs last so the gaps left by earlier deletions are
#     collapsed as well.
# remove_html runs before remove_spec (as in the original order), so complete
# entities such as '&amp;' are dropped by remove_html and remove_spec only
# sees bare ampersands here.
_PIPELINE = (
    remove_URL,
    remove_emoji,
    remove_html,
    remove_retw,
    remove_spec,
    remove_punct,
    remove_nonascii,
    remove_spaces,
)


def preprocess_text(text: str) -> str:
    """Run the full cleaning pipeline (see ``_PIPELINE``) on one string."""
    for step in _PIPELINE:
        text = step(text)
    return text


def preprocess_text_series(series):
    """Apply ``preprocess_text`` to every element via ``series.apply``.

    ``series`` only needs an ``apply(func)`` method; presumably a pandas
    Series of strings (verify against callers). Delegating to
    ``preprocess_text`` keeps both entry points on the same pipeline.
    """
    return series.apply(preprocess_text)


# Check all functions:
input_text = (
    " Check out this amazing website: https://www.example.com! 😃 "
    "This is an HTML tag. "
    "RT @user123: Just received a package from @companyXYZ. "
    "It's awesome! 📦 "
    "This is a test text with lots of punctuations!!! "
    "Can't wait to see more... "
)
processed_text = preprocess_text(input_text)
# print(processed_text)