40 lines
		
	
	
		
			1.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			40 lines
		
	
	
		
			1.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import re
 | |
| import string
 | |
| 
 | |
def remove_URL(text):
    """Return *text* with URL-like tokens deleted.

    A token is removed when it starts with ``http://``, ``https://`` or
    ``www.``; the match runs up to (not including) the next whitespace
    character. Surrounding whitespace is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)
 | |
| 
 | |
| 
 | |
# Compiled once at import time. The character class lists emoji-bearing
# blocks explicitly; a single catch-all range from U+24C2 to U+1F251 would
# also swallow CJK ideographs, kana, Hangul and most other non-Latin text.
_EMOJI_PATTERN = re.compile(
    '['
    '\U0001F600-\U0001F64F'  # emoticons
    '\U0001F300-\U0001F5FF'  # symbols & pictographs
    '\U0001F680-\U0001F6FF'  # transport & map symbols
    '\U0001F1E0-\U0001F1FF'  # flags (regional indicator symbols)
    '\U00002702-\U000027B0'  # dingbats
    '\U00002600-\U000026FF'  # miscellaneous symbols
    '\U00002B00-\U00002BFF'  # miscellaneous symbols and arrows
    '\U0001F000-\U0001F251'  # mahjong/domino/card tiles, enclosed supplements
    '\U0000FE0F'  # emoji variation selector
    '\U000024C2\U00003030\U0000303D\U00003297\U00003299'  # isolated emoji
    ']+',
    flags=re.UNICODE)


def remove_emoji(text):
    """Return *text* with emoji and pictographic symbols removed.

    Only code points in the symbol blocks listed in ``_EMOJI_PATTERN`` are
    deleted. Letters of any script (including Chinese, Japanese and Korean)
    are preserved.
    """
    return _EMOJI_PATTERN.sub(r'', text)
 | |
| 
 | |
| 
 | |
def remove_html(text):
    """Return *text* with HTML tags and character references removed.

    Two things are deleted:

    * ``<...>`` spans, matched lazily up to the first ``>``. ``.`` does not
      match a newline here, so a tag broken across lines is left intact.
    * character references of the form ``&name;``, ``&#123;`` or ``&#x1f;``.

    The pattern is case-insensitive so that references written with
    uppercase letters or hex digits (``&Eacute;``, ``&#x1F600;``, ``&#X41;``)
    are stripped as well.
    """
    html = re.compile(
        r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
        flags=re.IGNORECASE)
    return html.sub('', text)
 | |
| 
 | |
| 
 | |
def remove_punct(text):
    """Return *text* with every ``string.punctuation`` character deleted."""
    # Map each punctuation code point to None, which str.translate treats
    # as "drop this character".
    drop_map = {ord(mark): None for mark in string.punctuation}
    return text.translate(drop_map)
 | |
| 
 | |
def clean_all(text):
    """Run the full cleaning pipeline over *text* and return the result.

    Non-string input is first converted with ``str()``. The steps are then
    applied in order: URL removal, emoji removal, HTML removal, punctuation
    removal.
    """
    cleaned = text if isinstance(text, str) else str(text)
    for step in (remove_URL, remove_emoji, remove_html, remove_punct):
        cleaned = step(cleaned)
    return cleaned
 | 
