"""Text-cleaning helpers: strip URLs, emoji, HTML markup and punctuation."""

import re
import string

# Patterns and tables are built once at import time rather than on every call.

# Matches http(s):// links and bare www. links up to the next whitespace.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')

# Emoji and pictographic symbol blocks.
# NOTE: an earlier version used the single range U+24C2..U+1F251, which spans
# most of the Basic Multilingual Plane and therefore also deleted CJK
# ideographs, kana, Hangul, full-width forms, etc. The ranges below are limited
# to symbol blocks so that ordinary text in any script is preserved.
_EMOJI_RE = re.compile(
    '['
    '\U0001F600-\U0001F64F'  # emoticons
    '\U0001F300-\U0001F5FF'  # symbols & pictographs
    '\U0001F680-\U0001F6FF'  # transport & map symbols
    '\U0001F1E0-\U0001F1FF'  # flags (regional indicators)
    '\U0001F900-\U0001F9FF'  # supplemental symbols & pictographs
    '\U0001F000-\U0001F251'  # tiles, cards, enclosed alphanumerics/ideographs
    '\U00002600-\U000026FF'  # miscellaneous symbols
    '\U00002702-\U000027B0'  # dingbats
    '\U00002B00-\U00002BFF'  # miscellaneous symbols and arrows
    '\U000024C2'             # circled latin capital letter M
    '\U0000FE0F'             # variation selector-16 (emoji presentation)
    ']+',
    flags=re.UNICODE,
)

# Matches tags (<...>, non-greedy, single line) and character references.
# Entity names and hex digits are matched case-insensitively via explicit
# ASCII classes, so that e.g. "&Eacute;" and "&#X41;" are recognised.
_HTML_RE = re.compile(
    r'<.*?>|&([a-zA-Z0-9]+|#[0-9]{1,6}|#[xX][0-9a-fA-F]{1,6});'
)

# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_URL(text: str) -> str:
    """Return *text* with http(s):// and www. links removed.

    A link extends to the next whitespace character; surrounding whitespace
    is left untouched.
    """
    return _URL_RE.sub('', text)


def remove_emoji(text: str) -> str:
    """Return *text* with emoji and pictographic symbols removed.

    Only characters in the symbol blocks listed in ``_EMOJI_RE`` are
    removed; letters and ideographs of any script are kept.
    """
    return _EMOJI_RE.sub('', text)


def remove_html(text: str) -> str:
    """Return *text* with HTML tags and character references removed.

    Tags are matched as ``<...>`` on a single line; references may be named
    (``&amp;``), decimal (``&#38;``) or hexadecimal (``&#x26;``).
    """
    return _HTML_RE.sub('', text)


def remove_punct(text: str) -> str:
    """Return *text* with all ASCII punctuation characters deleted."""
    return text.translate(_PUNCT_TABLE)


def clean_all(text: object) -> str:
    """Apply every cleaning step to *text* and return the result.

    Non-string values are first converted with ``str()``. The steps run in
    this order: URLs, emoji, HTML, punctuation.
    """
    if not isinstance(text, str):
        text = str(text)  # Convert non-string values to string
    text = remove_URL(text)
    text = remove_emoji(text)
    text = remove_html(text)
    text = remove_punct(text)
    return text