adds lines with counterKeywords to remove non-covid tweets
This commit is contained in:
parent
3de6d8f3ec
commit
13d80124d3
@ -88,7 +88,7 @@ with open(f"{di}keywords-raw.txt", "r") as file:
|
|||||||
|
|
||||||
# delete keywords ppe and china that lead to too many false positives
|
# delete keywords ppe and china that lead to too many false positives
|
||||||
removeWords = {'ppe', 'china'}
|
removeWords = {'ppe', 'china'}
|
||||||
keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive
|
keywords = [x.lower() for x in keywords] # converts to lowercase, which makes the search case-insensitive (the result stays a list; it is not converted to a set here)
|
||||||
keywords = [item for item in keywords if item not in removeWords ] # removes words
|
keywords = [item for item in keywords if item not in removeWords ] # removes words
|
||||||
|
|
||||||
with open(f"{di}keywords.txt", "w") as file:
|
with open(f"{di}keywords.txt", "w") as file:
|
||||||
@ -96,17 +96,38 @@ with open(f"{di}keywords.txt", "w") as file:
|
|||||||
for line in keywords:
|
for line in keywords:
|
||||||
file.write(f'{line}\n')
|
file.write(f'{line}\n')
|
||||||
|
|
||||||
|
# counter keywords
|
||||||
|
# Read the counter keywords from a file
|
||||||
|
counterKeywords = []
|
||||||
|
with open(f"{di}counterKeywords.txt", "r") as file:
|
||||||
|
lines = file.readlines()
|
||||||
|
for line in lines:
|
||||||
|
counterKeyword = line.strip() # Remove the newline character
|
||||||
|
counterKeywords.append(counterKeyword)
|
||||||
|
counterKeywords = set([x.lower() for x in counterKeywords]) # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
|
||||||
|
with open(f"{di}counterKeywordsFinal.txt", "w") as file:
|
||||||
|
print("read keyword files")
|
||||||
|
for line in counterKeywords:
|
||||||
|
file.write(f'{line}\n')
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
# overwrite keyword column
|
# overwrite keyword column
|
||||||
df['keywords'] = np.nan
|
df['keywords'] = np.nan
|
||||||
df['keywords'] = (
|
df['keywords'] = (
|
||||||
df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
|
df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
|
||||||
)
|
)
|
||||||
|
df['counterKeywords'] = np.nan
|
||||||
|
df['counterKeywords'] = (
|
||||||
|
df['rawContent'].str.lower().str.findall('|'.join(counterKeywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
|
||||||
|
)
|
||||||
#%%
|
#%%
|
||||||
# create boolean contains_keyword column
|
# create boolean contains_keyword column
|
||||||
df['contains_keyword'] = True
|
df['contains_keyword'] = True
|
||||||
|
df['contains_counterKeyword'] = True
|
||||||
mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none'
|
mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none'
|
||||||
df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask
|
df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask
|
||||||
|
mask = (df['counterKeywords'].isna()) # select all rows where counterKeywords is NaN
|
||||||
|
df.loc[mask,'contains_counterKeyword'] = False # set contains_counterKeyword to False for the rows selected by mask
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
pd.Series(df["user.id"]).is_unique
|
pd.Series(df["user.id"]).is_unique
|
||||||
@ -163,7 +184,10 @@ print(unique_usernames)
|
|||||||
# senatorisakson was dropped, is ok
|
# senatorisakson was dropped, is ok
|
||||||
#%%
|
#%%
|
||||||
# create covidtweets csv
|
# create covidtweets csv
|
||||||
dfCov = dfAll[dfAll['contains_keyword']==True]
|
dfCov = dfAll[dfAll['contains_counterKeyword']==False]
|
||||||
|
dfCov = dfCov[dfCov['contains_keyword']==True]
|
||||||
|
dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
|
||||||
|
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
# create column with tweet length
|
# create column with tweet length
|
||||||
|
17
data/IN/counterKeywords.txt
Normal file
17
data/IN/counterKeywords.txt
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
opioid
|
||||||
|
gun violence
|
||||||
|
gun-violence
|
||||||
|
CHD
|
||||||
|
Coronary heart disease
|
||||||
|
addiction
|
||||||
|
tobacco
|
||||||
|
vaping
|
||||||
|
e-cigarette
|
||||||
|
shooting
|
||||||
|
indigenous women
|
||||||
|
overdose
|
||||||
|
meth
|
||||||
|
cocaine
|
||||||
|
separated children
|
||||||
|
separating children
|
||||||
|
separating families
|
Loading…
x
Reference in New Issue
Block a user