adds lines with counterKeywords to remove non-covid tweets

This commit is contained in:
Michael Beck 2023-08-07 23:45:11 +02:00
parent 3de6d8f3ec
commit 13d80124d3
2 changed files with 43 additions and 2 deletions

View File

@ -88,7 +88,7 @@ with open(f"{di}keywords-raw.txt", "r") as file:
# delete keywords ppe and china that lead to too many false positives # delete keywords ppe and china that lead to too many false positives
removeWords = {'ppe', 'china'} removeWords = {'ppe', 'china'}
keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
keywords = [item for item in keywords if item not in removeWords ] # removes words keywords = [item for item in keywords if item not in removeWords ] # removes words
with open(f"{di}keywords.txt", "w") as file: with open(f"{di}keywords.txt", "w") as file:
@ -96,17 +96,38 @@ with open(f"{di}keywords.txt", "w") as file:
for line in keywords: for line in keywords:
file.write(f'{line}\n') file.write(f'{line}\n')
# counter keywords
# Read the counter keywords from a file, one keyword per line.
with open(f"{di}counterKeywords.txt", "r") as file:
    # Strip the newline / surrounding whitespace and lowercase every entry so the
    # later search is case-insensitive; the set removes duplicates.
    # Blank lines are skipped: an empty keyword would turn into an empty regex
    # alternative in the '|'.join(counterKeywords) pattern and match every tweet.
    counterKeywords = {line.strip().lower() for line in file if line.strip()}
print("read keyword files")
# Write the cleaned keywords back out. Sorted, because set iteration order is
# not stable between runs and the file would otherwise change on every run.
with open(f"{di}counterKeywordsFinal.txt", "w") as file:
    for line in sorted(counterKeywords):
        file.write(f'{line}\n')
#%% #%%
# overwrite keyword column # overwrite keyword column
df['keywords'] = np.nan df['keywords'] = np.nan
df['keywords'] = ( df['keywords'] = (
df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
) )
# counterKeywords column: comma-joined list of every counter keyword found in
# the lower-cased tweet text, NaN when none is found
import re  # local import: only needed to escape the keywords for the regex below

# escape each keyword so regex metacharacters in the keyword file are matched literally
counterPattern = '|'.join(re.escape(word) for word in counterKeywords)
# NOTE(review): this is substring matching, so a short keyword such as 'meth'
# also matches inside 'something' or 'method' - confirm whether word
# boundaries (\b) are wanted here before relying on this filter.
df['counterKeywords'] = (
    df['rawContent'].str.lower().str.findall(counterPattern).str.join(',').replace('', np.nan)  # str.lower to make search case-insensitive
)
#%% #%%
# create boolean contains_keyword column # create boolean contains_keyword column
df['contains_keyword'] = True df['contains_keyword'] = True
df['contains_counterKeyword'] = True  # default for every row; rows without a counter keyword match are set to False below
mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none' mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none'
df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask
mask = (df['counterKeywords'].isna()) # select all rows where no counter keyword was found (counterKeywords is NaN)
df.loc[mask,'contains_counterKeyword'] = False # set contains_counterKeyword to False for the rows selected by mask
#%% #%%
pd.Series(df["user.id"]).is_unique pd.Series(df["user.id"]).is_unique
@ -163,7 +184,10 @@ print(unique_usernames)
# senatorisakson was dropped, is ok # senatorisakson was dropped, is ok
#%% #%%
# create covidtweets csv # create covidtweets csv
dfCov = dfAll[dfAll['contains_keyword']==True] dfCov = dfAll[dfAll['contains_counterKeyword']==False]
# keep only the rows flagged as containing a keyword, then drop the two
# counter-keyword columns from the result
hasKeyword = dfCov['contains_keyword'] == True  # elementwise comparison, kept as in the original
dfCov = dfCov.loc[hasKeyword].drop(columns=['contains_counterKeyword', 'counterKeywords'])
#%% #%%
# create column with tweet length # create column with tweet length

View File

@ -0,0 +1,17 @@
opioid
gun violence
gun-violence
CHD
Coronary heart disease
addiction
tobacco
vaping
e-cigarette
shooting
indigenous women
overdose
meth
cocaine
separated children
separating children
separating families