adds tweetLen column, converts keywords to lowercase and removes certain keywords

This commit is contained in:
Michael Beck 2023-08-07 23:07:29 +02:00
parent 899a99ba72
commit 3de6d8f3ec


@@ -85,6 +85,12 @@ with open(f"{di}keywords-raw.txt", "r") as file:
     for line in lines:
         keyword = line.strip()  # Remove the newline character
         keywords.append(keyword)
+# delete keywords ppe and china that lead to too many false positives
+removeWords = {'ppe', 'china'}
+keywords = [x.lower() for x in keywords]  # converts to lowercase which makes the search case insensitive
+keywords = [item for item in keywords if item not in removeWords]  # removes words
 with open(f"{di}keywords.txt", "w") as file:
     print("read keyword files")
     for line in keywords:
@@ -94,7 +100,7 @@ with open(f"{di}keywords.txt", "w") as file:
 # overwrite keyword column
 df['keywords'] = np.nan
 df['keywords'] = (
-    df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', np.nan)
+    df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan)  # str.lower to make search case-insensitive
 )
 #%%
 # create boolean contains_keyword column
@@ -159,17 +165,25 @@ print(unique_usernames)
 # create covidtweets csv
 dfCov = dfAll[dfAll['contains_keyword']==True]
+#%%
+# create column with tweet length
+dfCov['tweetLen'] = dfCov['rawContent'].str.len().copy()
+# reset df index and write to id column
+dfCov.reset_index(drop=True, inplace=True)
 #%%
 # Export to csv, sav and dta
 dfAll.to_csv(senCSVcPath, encoding='utf-8')
-dfCov.to_csv(senCSVcCovPath, encoding='utf-8')
+dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id')
 # pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb
 # =============================================================================
 # dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
 # dfAllStata = dfAll.rename(columns={'class':'class_'})
 # dfAllStata.to_stata(senDTAcPath, version=119, convert_strl=['alt'], convert_dates={'date': 'td', 'user_created': 'td'})
 # print(dfAllStata.columns)
 # =============================================================================
+df.id.str.len().value_counts()
 # %%
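
To illustrate the combined effect of this commit outside the full script, here is a minimal, self-contained sketch on a toy DataFrame. The tweet texts and the sample keyword list are hypothetical (not from the repo); the names df, dfCov, keywords, removeWords, contains_keyword and tweetLen follow the script, while the file-path variables are omitted.

import numpy as np
import pandas as pd

# Toy stand-in for the scraped tweet data; the real script loads df from CSV files.
df = pd.DataFrame({'rawContent': [
    'New COVID vaccine rollout announced',
    'PPE shipment delayed again',
    'Nothing political to see here',
]})

# Keyword preprocessing as in this commit: lowercase everything, then drop noisy terms.
keywords = ['COVID', 'Vaccine', 'PPE', 'China']   # hypothetical sample list
removeWords = {'ppe', 'china'}
keywords = [k.lower() for k in keywords]
keywords = [k for k in keywords if k not in removeWords]

# Case-insensitive matching: lowercase the text before findall, as in the updated call.
df['keywords'] = (
    df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan)
)
df['contains_keyword'] = df['keywords'].notna()

# Tweet length column and index reset, mirroring the new dfCov lines.
dfCov = df[df['contains_keyword']].copy()
dfCov['tweetLen'] = dfCov['rawContent'].str.len()
dfCov.reset_index(drop=True, inplace=True)
print(dfCov[['rawContent', 'keywords', 'tweetLen']])

With this toy input, the PPE tweet is no longer flagged (the term was removed from the keyword list), while the COVID tweet matches case-insensitively and gets a tweetLen value.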