diff --git a/cleanTweets.py b/cleanTweets.py
index 19957db..f9d202e 100644
--- a/cleanTweets.py
+++ b/cleanTweets.py
@@ -9,7 +9,8 @@ Created on Mon Jun 26 20:36:43 2023
 import pandas as pd
 # import pyreadstat
 import numpy as np
-from funs.ClearDupes import deDupe
+import sys
+
 
 # Seet for training dataset generation
 seed = 86431891
@@ -49,6 +50,11 @@ senDatasetPath = wd + di + senDataset
 
 df = pd.read_csv(senCSVPath, dtype=(object))
 
+## Import own functions
+funs = wd+"funs"
+sys.path.insert(1, funs)
+from ClearDupes import deDupe
+
 mixed_columns = df.columns[df.nunique() != len(df)]
 print(mixed_columns)
 
diff --git a/collect.py b/collect.py
index fb05356..05682d8 100644
--- a/collect.py
+++ b/collect.py
@@ -66,7 +66,6 @@ which is the final output.
 import os
 import pandas as pd
 import glob
-import time
 import sys
 from datetime import datetime
 import concurrent.futures
@@ -149,10 +148,12 @@ tweetDFColumns = [
 ################## do NOT change anything below this line ###################
 #############################################################################
 
-## Import functions
-from funs.TimeSlice import *
-from funs.ClearDupes import deDupe
-from funs.Scrape import scrapeTweets
+## Import own functions
+funs = wd+"funs"
+sys.path.insert(1, funs)
+from TimeSlice import get_Tslices
+from ClearDupes import deDupe
+from Scrape import scrapeTweets
 
 ###################
 # Create logfile & log all outputs
diff --git a/preTestClassification.py b/preTestClassification.py
index 9a6de23..5c419ee 100644
--- a/preTestClassification.py
+++ b/preTestClassification.py
@@ -1,13 +1,8 @@
-import re
-import string
-import numpy as np
 import pandas as pd
 from datetime import datetime
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 from datasets import load_dataset
 from transformers.pipelines.pt_utils import KeyDataset
-from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
-
 
 #%%
 # prepare
@@ -40,7 +35,6 @@ senCSVPretest = "Pretest.csv"
 senCSVPretestPrep = "Pretest-Prep.csv"
 senCSVPretestResult = "Pretest-Results.csv"
-
 # don't change this one
 senCSVPath = wd + ud + senCSV
 senCSVcPath = wd + ud + senCSVc
 
@@ -50,6 +44,11 @@ senCSVcPretestResultPath = wd + ud + senCSVPretestResult
 preTestIDsFakePath = wd + di + preTestIDsFake
 preTestIDsNotPath = wd + di + preTestIDsNot
 
+import sys
+funs = wd+"funs"
+sys.path.insert(1, funs)
+import CleanTweets
+
 # List of IDs to select
 # Read the IDs from a file
 preTestIDsFakeL = []
@@ -85,11 +84,7 @@ tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
 
 # Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
 
-dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
-dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
-dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_html)
-dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_punct)
-dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(lambda x: x.lower())
+dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(CleanTweets.preprocess_text)
 
 #%%
 timeStart = datetime.now() # start counting execution time