From 3de6d8f3ec4145ab004a265e2d66f6e7c051ff60 Mon Sep 17 00:00:00 2001 From: Michael Beck Date: Mon, 7 Aug 2023 23:07:29 +0200 Subject: [PATCH] adds tweetLen column, converts keywords to lowercase and removes certain keywords --- cleanTweets.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/cleanTweets.py b/cleanTweets.py index 3f3b316..99d5e4a 100644 --- a/cleanTweets.py +++ b/cleanTweets.py @@ -85,6 +85,12 @@ with open(f"{di}keywords-raw.txt", "r") as file: for line in lines: keyword = line.strip() # Remove the newline character keywords.append(keyword) + +# delete keywords ppe and china that lead to too many false positives +removeWords = {'ppe', 'china'} +keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive +keywords = [item for item in keywords if item not in removeWords ] # removes words + with open(f"{di}keywords.txt", "w") as file: print("read keyword files") for line in keywords: @@ -94,7 +100,7 @@ with open(f"{di}keywords.txt", "w") as file: # overwrite keyword column df['keywords'] = np.nan df['keywords'] = ( - df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) + df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive ) #%% # create boolean contains_keyword column @@ -159,17 +165,25 @@ print(unique_usernames) # create covidtweets csv dfCov = dfAll[dfAll['contains_keyword']==True] +#%% +# create column with tweet length + +dfCov['tweetLen'] = dfCov['rawContent'].str.len().copy() + +# reset df index and write to id column +dfCov.reset_index(drop=True, inplace=True) + #%% # Export to csv, sav and dta dfAll.to_csv(senCSVcPath, encoding='utf-8') -dfCov.to_csv(senCSVcCovPath, encoding='utf-8') +dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id') # pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb # ============================================================================= # dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True) # dfAllStata = dfAll.rename(columns={'class':'class_'}) # dfAllStata.to_stata(senDTAcPath, version=119, convert_strl=['alt'], convert_dates={'date': 'td', 'user_created': 'td'}) # print(dfAllStata.columns) -# ============================================================================= +# ====================================================df.id.str.len().value_counts() +# ========================= # %% -