adds lines with counterKeywords to remove non-covid tweets

2023-08-07 23:45:11 +02:00 · 2023-08-07 23:45:11 +02:00 · 13d80124d3
commit 13d80124d3
parent 3de6d8f3ec
2 changed files with 43 additions and 2 deletions
--- a/cleanTweets.py
+++ b/cleanTweets.py
@ -88,7 +88,7 @@ with open(f"{di}keywords-raw.txt", "r") as file:

 # delete keywords ppe and china that lead to too many false positives
 removeWords = {'ppe', 'china'}
-keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive
+keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive (keywords stays a list; only counterKeywords below is converted to a set)
 keywords = [item for item in keywords if item not in removeWords ] # removes words
    
 with open(f"{di}keywords.txt", "w") as file:
@ -96,17 +96,38 @@ with open(f"{di}keywords.txt", "w") as file:
    for line in keywords:
        file.write(f'{line}\n')

+# counter keywords: terms used to filter out tweets that are not about covid
+# Read the counter keywords from a file
+counterKeywords = []
+with open(f"{di}counterKeywords.txt", "r") as file:
+    lines = file.readlines()
+    for line in lines:
+        counterKeyword = line.strip()  # Remove surrounding whitespace, including the trailing newline
+        counterKeywords.append(counterKeyword)
+counterKeywords = set([x.lower() for x in counterKeywords]) # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
+with open(f"{di}counterKeywordsFinal.txt", "w") as file:
+    print("read keyword files")
+    for line in counterKeywords:
+        file.write(f'{line}\n')
+
 #%%
 # overwrite keyword column
 df['keywords'] = np.nan
 df['keywords'] = (
    df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
 )
+df['counterKeywords'] = np.nan
+df['counterKeywords'] = (
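+    df['rawContent'].str.lower().str.findall('|'.join(counterKeywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive. NOTE(review): entries are joined into an unanchored regex, so they match inside longer words (e.g. 'meth' in 'something') and are not regex-escaped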
+)
 #%%
 # create boolean contains_keyword column
 df['contains_keyword'] = True
+df['contains_counterKeyword'] = True
 mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none'
 df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask
+mask = (df['counterKeywords'].isna()) # select all rows where no counter keyword was found (counterKeywords is NaN)
+df.loc[mask,'contains_counterKeyword'] = False # set contains_counterKeyword to False for the rows selected by mask

 #%%
 pd.Series(df["user.id"]).is_unique
@ -163,7 +184,10 @@ print(unique_usernames)
 # senatorisakson was dropped, is ok
 #%%
 # create covidtweets csv
-dfCov = dfAll[dfAll['contains_keyword']==True]
+dfCov = dfAll[dfAll['contains_counterKeyword']==False]
+dfCov = dfCov[dfCov['contains_keyword']==True]
+dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
+

 #%%
 # create column with tweet length
--- a/data/IN/counterKeywords.txt
+++ b/data/IN/counterKeywords.txt
@ -0,0 +1,17 @@
+opioid
+gun violence
+gun-violence
+CHD
+Coronary heart disease
+addiction
+tobacco
+vaping
+e-cigarette
+shooting
+indigenous women
+overdose
+meth
+cocaine
+separated children
+separating children
+separating families