Adds tweetLen column, converts keywords to lowercase, and removes certain keywords
This commit is contained in:
parent
899a99ba72
commit
3de6d8f3ec
@ -85,6 +85,12 @@ with open(f"{di}keywords-raw.txt", "r") as file:
|
|||||||
for line in lines:
|
for line in lines:
|
||||||
keyword = line.strip() # Remove the newline character
|
keyword = line.strip() # Remove the newline character
|
||||||
keywords.append(keyword)
|
keywords.append(keyword)
|
||||||
|
|
||||||
|
# delete the keywords 'ppe' and 'china', which lead to too many false positives
|
||||||
|
removeWords = {'ppe', 'china'}
|
||||||
|
keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive
|
||||||
|
keywords = [item for item in keywords if item not in removeWords ] # removes words
|
||||||
|
|
||||||
with open(f"{di}keywords.txt", "w") as file:
|
with open(f"{di}keywords.txt", "w") as file:
|
||||||
print("read keyword files")
|
print("read keyword files")
|
||||||
for line in keywords:
|
for line in keywords:
|
||||||
@ -94,7 +100,7 @@ with open(f"{di}keywords.txt", "w") as file:
|
|||||||
# overwrite keyword column
|
# overwrite keyword column
|
||||||
df['keywords'] = np.nan
|
df['keywords'] = np.nan
|
||||||
df['keywords'] = (
|
df['keywords'] = (
|
||||||
df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', np.nan)
|
df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
|
||||||
)
|
)
|
||||||
#%%
|
#%%
|
||||||
# create boolean contains_keyword column
|
# create boolean contains_keyword column
|
||||||
@ -159,17 +165,25 @@ print(unique_usernames)
|
|||||||
# create covidtweets csv
|
# create covidtweets csv
|
||||||
dfCov = dfAll[dfAll['contains_keyword']==True]
|
dfCov = dfAll[dfAll['contains_keyword']==True]
|
||||||
|
|
||||||
|
#%%
|
||||||
|
# create column with tweet length (number of characters in rawContent)
|
||||||
|
|
||||||
|
dfCov['tweetLen'] = dfCov['rawContent'].str.len().copy()
|
||||||
|
|
||||||
|
# reset df index so it can be exported as the 'id' column (via index_label in to_csv)
|
||||||
|
dfCov.reset_index(drop=True, inplace=True)
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
# Export to csv, sav and dta
|
# Export to csv, sav and dta
|
||||||
dfAll.to_csv(senCSVcPath, encoding='utf-8')
|
dfAll.to_csv(senCSVcPath, encoding='utf-8')
|
||||||
dfCov.to_csv(senCSVcCovPath, encoding='utf-8')
|
dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id')
|
||||||
# pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb
|
# pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
|
# dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
|
||||||
# dfAllStata = dfAll.rename(columns={'class':'class_'})
|
# dfAllStata = dfAll.rename(columns={'class':'class_'})
|
||||||
# dfAllStata.to_stata(senDTAcPath, version=119, convert_strl=['alt'], convert_dates={'date': 'td', 'user_created': 'td'})
|
# dfAllStata.to_stata(senDTAcPath, version=119, convert_strl=['alt'], convert_dates={'date': 'td', 'user_created': 'td'})
|
||||||
# print(dfAllStata.columns)
|
# print(dfAllStata.columns)
|
||||||
# =============================================================================
|
# ====================================================df.id.str.len().value_counts()
|
||||||
|
# =========================
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user