40 lines
1.0 KiB
Python
40 lines
1.0 KiB
Python
import re
|
|
import string
|
|
|
|
# Compiled once at import time instead of on every call.
# - IGNORECASE: URL schemes and the "www" prefix are case-insensitive, so
#   "Https://..." and "WWW...." must be matched too.
# - \b before "www": prevents elongated words such as "awww.." from being
#   treated as the start of a URL.
_URL_PATTERN = re.compile(r'https?://\S+|\bwww\.\S+', re.IGNORECASE)


def remove_URL(text):
    """Return *text* with URL-like substrings removed.

    A URL is anything starting with ``http://`` or ``https://`` (any letter
    case), or a word starting with ``www.``, up to the next whitespace
    character. Trailing punctuation glued to the URL is removed with it.

    Args:
        text: The string to clean.

    Returns:
        A new string with every matched URL replaced by the empty string.
        Surrounding whitespace is left untouched.
    """
    return _URL_PATTERN.sub('', text)
|
|
|
|
|
|
# Character class of emoji / pictographic symbols, compiled once at import.
#
# The previous pattern used a single range U+24C2..U+1F251, which spans most
# of the Basic Multilingual Plane and therefore also deleted CJK ideographs,
# kana, Hangul and other scripts. It is replaced here by explicit ranges for
# the symbol blocks that lie inside that span, so ordinary letters survive.
_EMOJI_PATTERN = re.compile(
    '['
    '\U0001F600-\U0001F64F'  # emoticons
    '\U0001F300-\U0001F5FF'  # symbols & pictographs
    '\U0001F680-\U0001F6FF'  # transport & map symbols
    '\U0001F1E0-\U0001F1FF'  # flags (regional indicator symbols)
    '\U0001F000-\U0001F251'  # mahjong/dominoes/cards, enclosed alphanumerics & ideographs
    '\u2702-\u27B0'          # dingbats
    '\u2600-\u26FF'          # miscellaneous symbols (sun, umbrella, ...)
    '\u25A0-\u25FF'          # geometric shapes (play/reverse buttons, squares)
    '\u2B00-\u2BFF'          # miscellaneous symbols and arrows (star, large squares)
    '\u2934\u2935'           # curved arrows
    '\u3030\u303D\u3297\u3299'  # wavy dash, part alternation mark, circled ideographs
    '\u24C2'                 # circled latin capital letter M
    '\uFE0F'                 # variation selector-16 (emoji presentation)
    ']+'
)


def remove_emoji(text):
    """Return *text* with emoji and pictographic symbols removed.

    Only characters in the symbol ranges listed in ``_EMOJI_PATTERN`` are
    stripped; letters of any script (Latin, CJK, Hangul, ...) are preserved.

    Args:
        text: The string to clean.

    Returns:
        A new string with each run of matched characters replaced by the
        empty string.
    """
    return _EMOJI_PATTERN.sub('', text)
|
|
|
|
|
|
# Matches either a tag (shortest "<...>" span on one line) or a character
# reference: named (&amp;), decimal (&#169;) or hexadecimal (&#x1F600;).
# IGNORECASE is required so that uppercase hex digits, the "&#X" prefix and
# capitalised entity names such as &Eacute; are recognised as well.
_HTML_PATTERN = re.compile(
    r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
    re.IGNORECASE,
)


def remove_html(text):
    """Return *text* with HTML tags and character references removed.

    Args:
        text: The string to clean.

    Returns:
        A new string in which every ``<...>`` span and every ``&name;``,
        ``&#NNN;`` or ``&#xHHH;`` reference is replaced by the empty string.
        Note that any ``<`` followed later on the same line by ``>`` is
        treated as a tag, even when it is not real markup.
    """
    return _HTML_PATTERN.sub('', text)
|
|
|
|
|
|
# Deletion table for every character in string.punctuation (ASCII only).
# Built once at import time rather than on each call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punct(text):
    """Return *text* with all ASCII punctuation characters removed.

    Args:
        text: The string to clean.

    Returns:
        A new string without any character from ``string.punctuation``.
        Non-ASCII punctuation (e.g. curly quotes) is left in place.
    """
    return text.translate(_PUNCT_TABLE)
|
|
|
|
def clean_all(text):
    """Apply every cleaning step to *text* and return the result.

    Steps, in order: strip HTML tags/entities, strip URLs, strip emoji,
    strip ASCII punctuation.

    Args:
        text: The value to clean. Non-string values are converted with
            ``str()`` first (so ``None`` becomes ``"None"``).

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        text = str(text)  # Convert non-string values to string

    # HTML must be removed before URLs: a URL inside an attribute, e.g.
    # <a href="http://x.com">link</a>, would otherwise be matched up to the
    # next whitespace, swallowing the rest of the tag and the link text and
    # leaving an unterminated "<a href=" fragment behind.
    text = remove_html(text)
    text = remove_URL(text)
    text = remove_emoji(text)
    text = remove_punct(text)
    return text
|