From 3de6d8f3ec4145ab004a265e2d66f6e7c051ff60 Mon Sep 17 00:00:00 2001
From: Michael Beck <ich@mischbeck.de>
Date: Mon, 7 Aug 2023 23:07:29 +0200
Subject: [PATCH] adds tweetLen column, converts keywords to lowercase and
 removes certain keywords

---
 cleanTweets.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/cleanTweets.py b/cleanTweets.py
index 3f3b316..99d5e4a 100644
--- a/cleanTweets.py
+++ b/cleanTweets.py
@@ -85,6 +85,12 @@ with open(f"{di}keywords-raw.txt", "r") as file:
     for line in lines:
         keyword = line.strip()  # Remove the newline character
         keywords.append(keyword)
+
+# keywords that are dropped because they lead to too many false positives (compared in lowercase)
+removeWords = {'ppe', 'china'}
+# lowercase every keyword (for a case-insensitive search) and skip the ones listed in removeWords
+keywords = [kw for kw in (raw.lower() for raw in keywords) if kw not in removeWords]
+
 with open(f"{di}keywords.txt", "w") as file:
     print("read keyword files")
     for line in keywords:
@@ -94,7 +100,7 @@ with open(f"{di}keywords.txt", "w") as file:
 # overwrite keyword column
 df['keywords'] = np.nan
 df['keywords'] = (
-    df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', np.nan)
+    df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # lowercase the tweet text before matching so the search is case-insensitive for lowercase keywords; NOTE(review): keywords are joined into the regex unescaped - confirm none contain regex metacharacters
 )
 #%%
 # create boolean contains_keyword column
@@ -159,17 +165,25 @@ print(unique_usernames)
 # create covidtweets csv
 dfCov = dfAll[dfAll['contains_keyword']==True]
 
+#%%
+# create column with tweet length
+
+# assign() returns a new frame, so the column is not set on a selection of dfAll (avoids SettingWithCopyWarning)
+dfCov = dfCov.assign(tweetLen=dfCov['rawContent'].str.len())
+
+# reset df index; the fresh 0..n-1 index is exported below as the id column
+dfCov = dfCov.reset_index(drop=True)
+
 #%%
 # Export to csv, sav and dta
 dfAll.to_csv(senCSVcPath, encoding='utf-8')
-dfCov.to_csv(senCSVcCovPath, encoding='utf-8')
+dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id')
 # pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb 
 # =============================================================================
 # dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
 # dfAllStata = dfAll.rename(columns={'class':'class_'})
 # dfAllStata.to_stata(senDTAcPath, version=119, convert_strl=['alt'], convert_dates={'date': 'td', 'user_created': 'td'})
 # print(dfAllStata.columns)
 # =============================================================================
 
 # %%
-