diff --git a/cleanTweets.py b/cleanTweets.py
index 99d5e4a..deb6e7c 100644
--- a/cleanTweets.py
+++ b/cleanTweets.py
@@ -88,7 +88,7 @@ with open(f"{di}keywords-raw.txt", "r") as file:
 # delete keywords ppe and china that lead to too many false positives
 removeWords = {'ppe', 'china'}
 
-keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive
+keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
 keywords = [item for item in keywords if item not in removeWords ] # removes words
 
 with open(f"{di}keywords.txt", "w") as file:
@@ -96,17 +96,38 @@ with open(f"{di}keywords.txt", "w") as file:
     print("read keyword files")
     for line in keywords:
         file.write(f'{line}\n')
+# counter keywords
+# Read the keywords from a file
+counterKeywords = []
+with open(f"{di}counterKeywords.txt", "r") as file:
+    lines = file.readlines()
+    for line in lines:
+        counterKeyword = line.strip() # Remove the newline character
+        counterKeywords.append(counterKeyword)
+counterKeywords = set([x.lower() for x in counterKeywords]) # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
+with open(f"{di}counterKeywordsFinal.txt", "w") as file:
+    print("read keyword files")
+    for line in counterKeywords:
+        file.write(f'{line}\n')
+
 #%%
 # overwrite keyword column
 df['keywords'] = np.nan
 df['keywords'] = (
     df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
 )
+df['counterKeywords'] = np.nan
+df['counterKeywords'] = (
+    df['rawContent'].str.lower().str.findall('|'.join(counterKeywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
+)
 #%%
 # create boolean contains_keyword column
 df['contains_keyword'] = True
+df['contains_counterKeyword'] = True
 mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none'
 df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask
+mask = (df['counterKeywords'].isna()) # select all values in contains_keyword == 'none'
+df.loc[mask,'contains_counterKeyword'] = False # set keywords = contains_keyword under the condition of mask
 
 #%%
 pd.Series(df["user.id"]).is_unique
@@ -163,7 +184,10 @@ print(unique_usernames) # senatorisakson was dropped, is ok
 
 #%%
 # create covidtweets csv
-dfCov = dfAll[dfAll['contains_keyword']==True]
+dfCov = dfAll[dfAll['contains_counterKeyword']==False]
+dfCov = dfCov[dfCov['contains_keyword']==True]
+dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
+
 
 #%%
 # create column with tweet length
diff --git a/data/IN/counterKeywords.txt b/data/IN/counterKeywords.txt
new file mode 100644
index 0000000..7a27d38
--- /dev/null
+++ b/data/IN/counterKeywords.txt
@@ -0,0 +1,17 @@
+opioid
+gun violence
+gun-violence
+CHD
+Coronary heart disease
+addiction
+tobacco
+vaping
+e-cigarette
+shooting
+indigenous women
+overdose
+meth
+cocaine
+separated children
+separating children
+separating families
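
Note on the change above: the counter-keyword pass mirrors the existing keyword pass, so a tweet survives into dfCov only if it matches at least one keyword and none of the counter keywords. The sketch below replays that logic on a toy DataFrame. It is only an illustration: the sample tweets and the two small word sets are invented, only the column names and the lower/findall/join/replace chain mirror the diff, and the boolean flags are built with notna() here instead of the diff's default-True-then-mask pattern (the result is the same).

# Minimal sketch of the keyword / counter-keyword filter added in cleanTweets.py.
# The tweets and word sets below are invented for illustration only.
import numpy as np
import pandas as pd

keywords = ['covid', 'vaccine']            # stand-in for keywords.txt
counterKeywords = {'opioid', 'overdose'}   # stand-in for counterKeywords.txt

df = pd.DataFrame({'rawContent': [
    'New COVID relief package announced today',
    'The opioid overdose crisis needs more attention',
    'Vaccine rollout and the opioid epidemic',
    'Unrelated tweet about infrastructure week',
]})

# Same chain as in the diff: lowercase, findall on a regex alternation,
# join the hits into one comma-separated string, turn empty strings into NaN.
for col, words in (('keywords', keywords), ('counterKeywords', counterKeywords)):
    df[col] = (
        df['rawContent'].str.lower()
        .str.findall('|'.join(words))
        .str.join(',')
        .replace('', np.nan)
    )

# Boolean flags (notna() here; the diff sets True and then flips NaN rows to False).
df['contains_keyword'] = df['keywords'].notna()
df['contains_counterKeyword'] = df['counterKeywords'].notna()

# Keep tweets that mention a keyword but no counter keyword, as dfCov does.
dfCov = df[df['contains_keyword'] & ~df['contains_counterKeyword']]
print(dfCov[['rawContent', 'keywords']])   # only the first sample tweet remains

One caveat worth noting: '|'.join(...) builds a plain regex alternation, so findall does substring matching (the counter keyword 'meth' also hits 'method', for example), and any keyword containing regex metacharacters would need re.escape() before joining.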