adds CleanTweets functions, creates Graphs
This commit is contained in:
28
analyze.py
28
analyze.py
@@ -5,6 +5,7 @@ import pandas as pd
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
||||
from datasets import load_dataset
|
||||
from transformers.pipelines.pt_utils import KeyDataset
|
||||
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
|
||||
|
||||
|
||||
#%%
|
||||
@@ -82,33 +83,6 @@ model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-
|
||||
# Load the tokenizer registered under the "bvrau/covid-twitter-bert-v2-struth" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
|
||||
|
||||
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
|
||||
def remove_URL(text):
    """Return *text* with every URL deleted.

    A URL is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``. Surrounding whitespace is kept.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)
|
||||
|
||||
|
||||
# Compiled once at import time. The character class lists only emoji-related
# ranges; the previous catch-all range U+24C2..U+1F251 also covered CJK, kana,
# Hangul and many other scripts, so ordinary non-Latin text was being deleted.
_EMOJI_PATTERN = re.compile(
    '['
    '\U0001F600-\U0001F64F'  # emoticons
    '\U0001F300-\U0001F5FF'  # symbols & pictographs
    '\U0001F680-\U0001F6FF'  # transport & map symbols
    '\U0001F1E0-\U0001F1FF'  # flags (iOS)
    '\U00002600-\U000026FF'  # miscellaneous symbols
    '\U00002702-\U000027B0'  # dingbats
    '\U000024C2'             # circled latin capital letter M
    '\U0001F200-\U0001F251'  # enclosed ideographic supplement
    '\U0000FE0F'             # variation selector-16 (emoji presentation)
    ']+',
    flags=re.UNICODE)


def remove_emoji(text):
    """Return *text* with emoji characters deleted.

    Only code points in the emoji-related ranges listed in
    ``_EMOJI_PATTERN`` are removed; letters of any script, digits and
    punctuation are left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)
|
||||
|
||||
|
||||
def remove_html(text):
    """Return *text* with HTML tags and character entities deleted.

    Tags are matched non-greedily as ``<...>``. Entities are matched as
    ``&name;``, ``&#NNN;`` or ``&#xHHH;`` (lower-case only) and are removed,
    not decoded.
    """
    markup = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return markup.sub('', text)
|
||||
|
||||
|
||||
def remove_punct(text):
    """Return *text* with every ASCII punctuation character deleted.

    The characters removed are exactly those in ``string.punctuation``.
    """
    return text.translate(str.maketrans('', '', string.punctuation))
|
||||
|
||||
# Build 'cleanContent' from 'rawContent': strip URLs first, then emoji.
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL).apply(remove_emoji)
|
||||
|
||||
Reference in New Issue
Block a user