adds tweet-text-cleaning functions
This commit is contained in:
parent
5a63c478e9
commit
881d3d6d6d
@ -5,7 +5,6 @@ def remove_URL(text):
|
||||
url = re.compile(r'https?://\S+|www\.\S+')
|
||||
return url.sub(r'', text)
|
||||
|
||||
|
||||
def remove_emoji(text):
|
||||
emoji_pattern = re.compile(
|
||||
'['
|
||||
@ -19,21 +18,61 @@ def remove_emoji(text):
|
||||
flags=re.UNICODE)
|
||||
return emoji_pattern.sub(r'', text)
|
||||
|
||||
|
||||
def remove_html(text):
    """Strip HTML tags and character references from ``text``.

    Removes anything shaped like ``<...>`` (non-greedy, within one line)
    as well as named (``&name;``), decimal (``&#123;``) and hexadecimal
    (``&#x1F600;``) character references.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every match replaced by the empty string.
    """
    # IGNORECASE matters for the reference part: entity names such as
    # ``&Eacute;`` contain capitals, and hex references are commonly written
    # with upper-case digits or an upper-case ``X``.
    html = re.compile(
        r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
        flags=re.IGNORECASE,
    )
    return html.sub('', text)
|
||||
|
||||
|
||||
def remove_punct(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without ASCII punctuation; all other characters
        are left untouched.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    drop_marks = {ord(mark): None for mark in string.punctuation}
    return text.translate(drop_marks)
|
||||
|
||||
def clean_all(text):
    """Coerce ``text`` to ``str`` and return it.

    The visible body only normalises the input type; it previously dropped
    the converted value and implicitly returned ``None``.

    Args:
        text: Any value; non-string values are converted with ``str()``.

    Returns:
        ``text`` itself when it is already a string, otherwise ``str(text)``.
    """
    # NOTE(review): despite the name, no cleaning step is applied here —
    # confirm whether this was meant to feed the cleaning pipeline.
    if not isinstance(text, str):
        text = str(text)  # Convert non-string values to string
    return text
|
||||
def remove_nonascii(text):
    """Drop every character outside the 7-bit ASCII range from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` restricted to code points U+0000 through U+007F, in their
        original order.
    """
    ascii_only = (ch for ch in text if ord(ch) <= 0x7F)
    return ''.join(ascii_only)
|
||||
|
||||
def remove_spec(text):
    """Rewrite the ``&amp;``, ``&lt;`` and ``&gt;`` entities as plain text.

    ``&amp;`` (trailing semicolon optional) becomes the word ``and``;
    ``&lt;`` and ``&gt;`` become ``<`` and ``>``.

    The previous patterns (``&?`` -> ``and``, ``<`` -> ``<``, ``>`` -> ``>``)
    inserted ``and`` at every position of the string, because ``&?`` matches
    the empty string, and the other two substitutions did nothing.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with the three entities substituted.
    """
    plain = {'amp': 'and', 'lt': '<', 'gt': '>'}
    # Group 1 captures ``amp`` (semicolon optional, as in the original
    # ``amp;?`` intent); group 2 captures ``lt``/``gt`` (semicolon required).
    entity = r'&(?:(amp);?|(lt|gt);)'
    return re.sub(entity, lambda m: plain[m.group(1) or m.group(2)], text)
|
||||
|
||||
def remove_spaces(text):  # also new line chars and to lower case
    """Normalise whitespace and case in ``text``.

    Steps, in order: decode the ``&lt;`` entity to ``<``, join all lines
    with single spaces, lower-case, strip leading/trailing whitespace, and
    collapse every run of two or more whitespace characters into one space.

    Args:
        text: The string to clean.

    Returns:
        The normalised, lower-cased string.
    """
    # The previous substitution replaced '<' with '<' (a no-op); the entity
    # form is what gives this line an effect.
    text = re.sub(r'&lt;', '<', text)
    text = " ".join(text.splitlines())  # remove newline characters
    text = text.lower().strip()
    # Only runs of 2+ whitespace characters are collapsed; a lone tab is kept.
    return re.sub(r'\s{2,}', ' ', text)
|
||||
|
||||
def remove_retw(text):
    """Remove retweet markers and ``@mentions`` from ``text``.

    First deletes ``RT``/``rt`` followed by ``@`` and the handle (optional
    spaces allowed around the ``@``), then deletes any remaining ``@`` token
    up to the next whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without retweet prefixes and mentions. Surrounding spaces
        are left in place.
    """
    # \b keeps words that merely end in "rt" intact: without it,
    # "art @gallery" lost its "rt" along with the mention.
    text = re.sub(r'\b(?:RT|rt)[ ]*@[ ]*\S+', '', text)
    return re.sub(r'@\S+', '', text)
|
||||
|
||||
def preprocess_text(text):
    """Run the full cleaning pipeline on a single string.

    The order of the steps matters and differs from the previous version,
    where ``remove_punct`` and ``remove_html`` ran before ``remove_retw`` and
    ``remove_spec``. They deleted the ``@`` and ``&...;`` sequences those two
    steps look for, so mentions and entities were never handled.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string produced by applying every step in sequence.
    """
    text = remove_URL(text)
    # Mentions and retweet markers need the '@' that remove_punct deletes.
    text = remove_retw(text)
    # Entity rewriting must see '&...;' before remove_html strips entities.
    text = remove_spec(text)
    text = remove_html(text)
    text = remove_emoji(text)
    text = remove_nonascii(text)
    text = remove_punct(text)
    # Whitespace normalisation goes last so gaps left by earlier removals
    # are collapsed as well.
    text = remove_spaces(text)
    return text
|
||||
|
||||
def preprocess_text_series(series):
    """Run the full cleaning pipeline on every element of ``series``.

    Each cleaning function is applied element-wise through ``series.apply``,
    one pass per step. Mention and entity handling run before punctuation
    and HTML stripping; in the previous order ``remove_punct`` and
    ``remove_html`` deleted the ``@`` and ``&...;`` sequences first, so
    ``remove_retw`` and ``remove_spec`` never matched anything.

    Args:
        series: An object exposing ``apply(func)`` that returns a new object
            of the same kind (presumably a pandas Series of strings — confirm
            against the caller).

    Returns:
        The result of chaining all cleaning steps through ``apply``.
    """
    steps = (
        remove_URL,
        remove_retw,      # needs '@', so it runs before remove_punct
        remove_spec,      # needs '&...;', so it runs before remove_html
        remove_html,
        remove_emoji,
        remove_nonascii,
        remove_punct,
        remove_spaces,    # last: collapses gaps left by the removals above
    )
    for step in steps:
        series = series.apply(step)
    return series
|
||||
|
||||
# Check all functions:
# Smoke check for the cleaning pipeline. The sample below contains a URL,
# emoji, an HTML tag pair, a retweet marker with mentions, and repeated
# punctuation, so each cleaning step has something to act on.
# NOTE(review): these statements appear to sit at module level, so they would
# run whenever the module is imported — confirm that this is intended.
input_text = """
Check out this amazing website: https://www.example.com! 😃
<html>This is an HTML tag.</html>
RT @user123: Just received a package from @companyXYZ. It's awesome! 📦
This is a test text with lots of punctuations!!! Can't wait to see more...
"""
processed_text = preprocess_text(input_text)
# Debug aid, left disabled: uncomment to inspect the cleaned sample.
# print(processed_text)
|
Loading…
x
Reference in New Issue
Block a user