"""Text-cleaning helpers for raw tweet strings.

Each ``remove_*`` function takes a ``str`` and returns a new ``str``.
``preprocess_text`` chains them in an order where every step still sees the
markers it depends on, and ``preprocess_text_series`` applies that chain to
every element of an object exposing ``.apply`` (e.g. a pandas Series).
"""

import re
import string

_URL_RE = re.compile(r'https?://\S+|www\.\S+')

# NOTE(review): the reviewed patch elides the seven lines of this character
# class that sit between its two hunks.  The ranges below are the
# conventional emoji blocks -- confirm them against the real file.
_EMOJI_RE = re.compile(
    '['
    '\U0001F600-\U0001F64F'  # emoticons
    '\U0001F300-\U0001F5FF'  # symbols & pictographs
    '\U0001F680-\U0001F6FF'  # transport & map symbols
    '\U0001F1E0-\U0001F1FF'  # flags
    '\U00002702-\U000027B0'
    '\U000024C2-\U0001F251'
    ']+',
    flags=re.UNICODE)

_HTML_RE = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')

# "&amp;" / "&amp", or a bare "&" that does not start another entity.
_AMPERSAND_RE = re.compile(r'&amp;?|&(?![a-z0-9#]+;)')

# \b keeps words that merely end in "rt" (e.g. "start @bob") intact.
_RETWEET_RE = re.compile(r'\b(?:RT|rt)[ ]*@[ ]*\S+')
_MENTION_RE = re.compile(r'@\S+')


def remove_URL(text):
    """Return *text* without ``http(s)://...`` and ``www....`` tokens."""
    return _URL_RE.sub('', text)


def remove_emoji(text):
    """Return *text* without characters in the emoji ranges of ``_EMOJI_RE``."""
    return _EMOJI_RE.sub('', text)


def remove_html(text):
    """Return *text* without ``<...>`` tags and ``&name;`` / ``&#n;`` entities."""
    return _HTML_RE.sub('', text)


def remove_punct(text):
    """Return *text* with every ``string.punctuation`` character deleted."""
    return text.translate(_PUNCT_TABLE)


def remove_nonascii(text):
    """Return *text* with all non-ASCII characters deleted."""
    return _NON_ASCII_RE.sub('', text)


def remove_spec(text):
    """Translate the common HTML-escaped special characters.

    An ampersand (``&amp;`` or bare ``&``) becomes the word ``and``;
    ``&lt;`` and ``&gt;`` become ``<`` and ``>``.  Other entities are left
    untouched.

    NOTE(review): the patterns in the reviewed patch read ``&?``, ``<`` and
    ``>``, which looks like entity text that was unescaped in transit.  As
    written, ``&?`` matches the empty string and inserts ``and`` between
    every character.  The entity forms used here are the presumed intent --
    confirm against the real file.
    """
    text = _AMPERSAND_RE.sub('and', text)
    text = text.replace('&lt;', '<')
    return text.replace('&gt;', '>')


def remove_spaces(text):
    """Lower-case *text* and normalise its whitespace.

    Line breaks, tabs and runs of blanks each collapse to a single space,
    and leading/trailing whitespace is dropped.
    """
    # str.split() with no argument splits on any whitespace run and discards
    # empty leading/trailing fields, so one join does strip + collapse.
    return ' '.join(text.lower().split())


def remove_retw(text):
    """Return *text* without retweet markers (``RT @user``) and ``@mentions``."""
    text = _RETWEET_RE.sub('', text)
    return _MENTION_RE.sub('', text)


def preprocess_text(text):
    """Run the full cleaning pipeline on one tweet and return the result.

    Non-string input is converted with ``str()`` first, as the previous
    ``clean_all`` entry point did.

    The order matters:
      * mentions go before punctuation removal, which would delete ``@``;
      * ``remove_spec`` goes before ``remove_html``, which would delete
        ``&amp;`` outright instead of turning it into ``and``;
      * whitespace normalisation goes last so gaps left by earlier removals
        are collapsed.
    """
    if not isinstance(text, str):
        text = str(text)
    steps = (
        remove_URL,
        remove_retw,
        remove_spec,
        remove_html,
        remove_emoji,
        remove_punct,
        remove_nonascii,
        remove_spaces,
    )
    for step in steps:
        text = step(text)
    return text


def preprocess_text_series(series):
    """Apply ``preprocess_text`` to every element of *series*.

    *series* only needs an ``apply(func)`` method; its result is returned
    as-is.  Delegating keeps this pipeline identical to ``preprocess_text``
    and walks the data once instead of once per step.
    """
    return series.apply(preprocess_text)


# Smoke check: run the pipeline once on a representative sample.
input_text = """
    Check out this amazing website: https://www.example.com! 😃
    This is an HTML tag.
    RT @user123: Just received a package from @companyXYZ. It's awesome! 📦
    This is a test text with lots of punctuations!!! Can't wait to see more...
"""
processed_text = preprocess_text(input_text)

if __name__ == "__main__":
    print(processed_text)