Corrects imports of own functions that no longer worked because of a newer Python version.
This commit is contained in:
parent 1c6d9d5415
commit d8136909c8
@@ -9,7 +9,8 @@ Created on Mon Jun 26 20:36:43 2023
import pandas as pd
# import pyreadstat
import numpy as np
from funs.ClearDupes import deDupe
import sys


# Seed for training dataset generation
seed = 86431891
@@ -49,6 +50,11 @@ senDatasetPath = wd + di + senDataset

df = pd.read_csv(senCSVPath, dtype=(object))

## Import own functions
funs = wd+"funs"
sys.path.insert(1, funs)
from ClearDupes import deDupe

mixed_columns = df.columns[df.nunique() != len(df)]
print(mixed_columns)

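The pattern this commit switches to works because inserting the helper directory near the front of sys.path lets the modules inside it be imported as top-level names. A minimal sketch of the idea, assuming a funs/ folder that sits inside the working directory wd and contains ClearDupes.py:

import sys
import os

wd = os.getcwd() + "/"         # assumption: wd is an absolute path ending in a separator, as in the scripts
funs = wd + "funs"             # directory holding ClearDupes.py, TimeSlice.py, Scrape.py
sys.path.insert(1, funs)       # search this directory right after the script's own directory

from ClearDupes import deDupe  # resolves as a top-level module, no funs. prefix needed

Position 1 (rather than 0) keeps the running script's own directory first in the search order.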
collect.py (11 changed lines)
@@ -66,7 +66,6 @@ which is the final output.
import os
import pandas as pd
import glob
import time
import sys
from datetime import datetime
import concurrent.futures
@@ -149,10 +148,12 @@ tweetDFColumns = [
################## do NOT change anything below this line ###################
#############################################################################

## Import functions
from funs.TimeSlice import *
from funs.ClearDupes import deDupe
from funs.Scrape import scrapeTweets
## Import own functions
funs = wd+"funs"
sys.path.insert(1, funs)
from TimeSlice import get_Tslices
from ClearDupes import deDupe
from Scrape import scrapeTweets

###################
# Create logfile & log all outputs
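An alternative that would have kept the original from funs.X import y spelling working is to treat funs/ as a regular package and put its parent directory on the path instead; a sketch under the assumption that the helpers live in <wd>/funs/:

# assumed layout:
#   <wd>/funs/__init__.py    (may be empty; marks funs/ as a package)
#   <wd>/funs/TimeSlice.py
#   <wd>/funs/ClearDupes.py
#   <wd>/funs/Scrape.py
import sys
sys.path.insert(1, wd)                    # parent of funs/, with wd as defined earlier in the script
from funs.TimeSlice import get_Tslices
from funs.ClearDupes import deDupe
from funs.Scrape import scrapeTweets

Either approach works; the commit's version avoids adding an __init__.py to the repository.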
@@ -1,13 +1,8 @@
import re
import string
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct


#%%
# prepare
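The imports in this block are the usual building blocks for batch classification with a Hugging Face pipeline. A minimal sketch of how they typically fit together, using the model name that appears further down in this file; the CSV file and column name are placeholders:

from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_dataset

model_name = "bvrau/covid-twitter-bert-v2-struth"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

dataset = load_dataset("csv", data_files="Pretest-Prep.csv")["train"]  # placeholder file
for result in pipe(KeyDataset(dataset, "cleanContent")):               # placeholder column
    print(result)                                                      # e.g. {'label': ..., 'score': ...}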
@@ -40,7 +35,6 @@ senCSVPretest = "Pretest.csv"
senCSVPretestPrep = "Pretest-Prep.csv"
senCSVPretestResult = "Pretest-Results.csv"


# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc
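The path variables here are built by plain string concatenation (wd + ud + senCSV), which silently breaks if wd or ud loses its trailing separator. A pathlib-based alternative, shown with hypothetical values for wd and ud, sidesteps that:

from pathlib import Path

wd = Path("/home/user/project")   # hypothetical working directory
ud = "data"                       # hypothetical data subfolder
senCSV = "Pretest.csv"            # one of the CSV names defined above

senCSVPath = wd / ud / senCSV     # pathlib inserts separators itself
print(senCSVPath)                 # /home/user/project/data/Pretest.csv

pandas accepts Path objects directly, so pd.read_csv(senCSVPath) works unchanged.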
@@ -50,6 +44,11 @@ senCSVcPretestResultPath = wd + ud + senCSVPretestResult
preTestIDsFakePath = wd + di + preTestIDsFake
preTestIDsNotPath = wd + di + preTestIDsNot

import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets

# List of IDs to select
# Read the IDs from a file
preTestIDsFakeL = []
@@ -85,11 +84,7 @@ tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")

# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert

dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_html)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_punct)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(lambda x: x.lower())
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(CleanTweets.preprocess_text)

#%%
timeStart = datetime.now() # start counting execution time
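The last hunk replaces five chained .apply calls with a single CleanTweets.preprocess_text call. The sketch below shows one way such a helper could bundle the same steps; the function bodies are assumptions, the real implementations live in funs/CleanTweets.py:

import re
import string

def remove_URL(text):
    return re.sub(r"https?://\S+|www\.\S+", "", text)

def remove_emoji(text):
    # strips common emoji/pictograph code point ranges
    return re.sub("[\U0001F300-\U0001FAFF\u2600-\u27BF]+", "", text)

def remove_html(text):
    return re.sub(r"<.*?>", "", text)

def remove_punct(text):
    return text.translate(str.maketrans("", "", string.punctuation))

def preprocess_text(text):
    # same order as the five .apply calls it replaces: URL, emoji, HTML, punctuation, lowercase
    for step in (remove_URL, remove_emoji, remove_html, remove_punct):
        text = step(text)
    return text.lower()

# usage, as in the diff above:
# dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(preprocess_text)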