# CollectUSSenatorTweets/funs/CleanTweets.py

import re
import string

def remove_URL(text):
    """Return ``text`` with http(s):// links and bare ``www.`` links deleted.

    A link extends to the next whitespace character, so punctuation glued
    to the end of a URL is removed together with it.
    """
    link_pattern = r'https?://\S+|www\.\S+'
    return re.sub(link_pattern, '', text)

# Code points treated as emoji / pictographs. Compiled once at import time.
# The ranges are listed explicitly: a single catch-all span from U+24C2 to
# U+1F251 would also swallow CJK ideographs, Hangul and most other
# non-Latin scripts, which are text rather than emoji.
_EMOJI_PATTERN = re.compile(
    '['
    '\U0001F600-\U0001F64F'  # emoticons
    '\U0001F300-\U0001F5FF'  # symbols & pictographs
    '\U0001F680-\U0001F6FF'  # transport & map symbols
    '\U0001F900-\U0001F9FF'  # supplemental symbols & pictographs
    '\U0001F1E0-\U0001F1FF'  # flags (regional indicator pairs)
    '\U0001F170-\U0001F251'  # enclosed alphanumeric / ideographic supplement
    '\U0001F004\U0001F0CF'   # mahjong red dragon, playing-card joker
    '\u2600-\u26FF'          # miscellaneous symbols (sun, cloud, ...)
    '\u2702-\u27B0'          # dingbats
    '\u2B05-\u2B07\u2B1B\u2B1C\u2B50\u2B55'  # arrows, squares, star, circle
    '\u3030\u303D\u3297\u3299'  # wavy dash, part alternation, circled ideographs
    '\u24C2'                 # circled M
    '\uFE0F'                 # variation selector-16 (emoji presentation)
    ']+',
    flags=re.UNICODE)

def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols deleted.

    Only code points in the emoji-related ranges above are removed;
    ordinary non-Latin text (Chinese, Japanese, Korean, ...) is preserved.
    Use ``remove_nonascii`` when every non-ASCII character must go.
    """
    return _EMOJI_PATTERN.sub('', text)

def remove_html(text):
    """Return ``text`` without angle-bracket tags and HTML character entities.

    A tag is the shortest ``<...>`` span on a single line. An entity is
    ``&name;``, ``&#123;`` or ``&#x1f;`` written in lowercase; entities are
    deleted outright, not decoded.
    """
    tag = r'<.*?>'
    entity = r'&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    markup = re.compile(tag + '|' + entity)
    return markup.sub('', text)

def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Only ASCII punctuation is affected; typographic quotes, dashes and
    other non-ASCII marks are left in place.
    """
    # Mapping a code point to None tells str.translate to drop it.
    drop_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_map)

def remove_nonascii(text):
    """Return ``text`` keeping only characters in the range U+0000-U+007F."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)

def remove_spec(text):
    """Rewrite the three escaped entities ``&amp;``, ``&lt;`` and ``&gt;``.

    ``&amp`` (trailing semicolon optional) becomes the word ``and``;
    ``&lt;`` and ``&gt;`` become ``<`` and ``>``. The substitutions are
    applied in that order.
    """
    substitutions = (
        (r'&amp;?', 'and'),
        (r'&lt;', '<'),
        (r'&gt;', '>'),
    )
    for entity_pattern, plain in substitutions:
        text = re.sub(entity_pattern, plain, text)
    return text

def remove_spaces(text): # also new line chars and to lower case
    """Normalise whitespace and case.

    Line breaks are replaced by single spaces, the text is lowercased,
    leading and trailing whitespace is stripped, and every run of two or
    more whitespace characters is collapsed to one space. A lone tab
    between words is left as it is.

    This function deals with whitespace and case only; HTML entities such
    as ``&lt;`` are left to ``remove_spec`` / ``remove_html``.
    """
    text = " ".join(text.splitlines())  # line breaks -> single spaces
    text = text.lower().strip()
    return re.sub(r'\s{2,}', ' ', text)

def remove_retw(text):
    """Return ``text`` without retweet markers and @-mentions.

    First removes ``RT @user`` / ``rt @user`` (spaces around ``@`` are
    optional) together with whatever is attached to the handle up to the
    next whitespace, then removes every remaining ``@handle`` token.
    Must be called while ``@`` is still present in the text.
    """
    # \b anchors the marker at a word start so the tail of an ordinary
    # word is not mistaken for it ("smart @bob" must not lose its "rt").
    text = re.sub(r'\b(?:RT|rt)[ ]*@[ ]*\S+', '', text)
    return re.sub(r'@\S+', '', text)

def preprocess_text(text):
    """Run the full cleaning pipeline on a single string and return the result.

    Steps, in order: URLs, retweet markers and @-mentions, emoji, HTML
    tags/entities, ASCII punctuation, non-ASCII characters, escaped
    entities, then whitespace and case normalisation.
    """
    text = remove_URL(text)
    # Must run before remove_punct: that step deletes '@' (and ':'), after
    # which retweet markers and mentions can no longer be recognised.
    text = remove_retw(text)
    text = remove_emoji(text)
    text = remove_html(text)
    text = remove_punct(text)
    text = remove_nonascii(text)
    # NOTE(review): no '&' survives remove_html + remove_punct, so this call
    # cannot match anything here. Kept to preserve the original pipeline;
    # decide whether '&amp;' -> 'and' should happen before remove_html.
    text = remove_spec(text)
    # Last, so the gaps left by the deletions above are collapsed as well.
    text = remove_spaces(text)
    return text

def preprocess_text_series(series):
    """Apply the full cleaning pipeline to every element of ``series``.

    ``series`` must provide ``.apply(func)`` (presumably a pandas Series of
    strings - confirm against callers). The value returned by that
    ``apply`` call is returned.
    """
    # remove_retw comes before remove_punct: once '@' has been stripped,
    # retweet markers and mentions can no longer be recognised.
    # NOTE(review): remove_spec cannot match after remove_html and
    # remove_punct have run; kept to preserve the original pipeline.
    steps = (
        remove_URL,
        remove_retw,
        remove_emoji,
        remove_html,
        remove_punct,
        remove_nonascii,
        remove_spec,
        remove_spaces,
    )

    def _clean(text):
        # One pass per element instead of one pass over the series per step.
        for step in steps:
            text = step(text)
        return text

    return series.apply(_clean)

# Smoke check: push a sample tweet-like string (URL, emoji, HTML tag,
# retweet marker, mention, punctuation) through the whole pipeline.
# NOTE: these statements are at module level, so they run on every import.
input_text = """
    Check out this amazing website: https://www.example.com! 😃
    <html>This is an HTML tag.</html>
    RT @user123: Just received a package from @companyXYZ. It's awesome! 📦
    This is a test text with lots of punctuations!!! Can't wait to see more...
"""
processed_text = preprocess_text(input_text)
# Uncomment to inspect the cleaned result:
# print(processed_text)