adds lines with counterKeywords to remove non-covid tweets
This commit is contained in:
		| @@ -88,7 +88,7 @@ with open(f"{di}keywords-raw.txt", "r") as file: | |||||||
|  |  | ||||||
| # delete keywords ppe and china that lead to too many false positives | # delete keywords ppe and china that lead to too many false positives | ||||||
| removeWords = {'ppe', 'china'} | removeWords = {'ppe', 'china'} | ||||||
| keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive | keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison | ||||||
| keywords = [item for item in keywords if item not in removeWords ] # removes words | keywords = [item for item in keywords if item not in removeWords ] # removes words | ||||||
|      |      | ||||||
| with open(f"{di}keywords.txt", "w") as file: | with open(f"{di}keywords.txt", "w") as file: | ||||||
| @@ -96,17 +96,38 @@ with open(f"{di}keywords.txt", "w") as file: | |||||||
|     for line in keywords: |     for line in keywords: | ||||||
|         file.write(f'{line}\n') |         file.write(f'{line}\n') | ||||||
|  |  | ||||||
|  | # counter keywords | ||||||
|  | # Read the counter keywords from a file | ||||||
|  | counterKeywords = [] | ||||||
|  | with open(f"{di}counterKeywords.txt", "r") as file: | ||||||
|  |     lines = file.readlines() | ||||||
|  |     for line in lines: | ||||||
|  |         counterKeyword = line.strip()  # Remove the newline character | ||||||
|  |         counterKeywords.append(counterKeyword) | ||||||
|  | counterKeywords = set([x.lower() for x in counterKeywords]) # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison | ||||||
|  | with open(f"{di}counterKeywordsFinal.txt", "w") as file: | ||||||
|  |     print("read keyword files") | ||||||
|  |     for line in counterKeywords: | ||||||
|  |         file.write(f'{line}\n') | ||||||
|  |  | ||||||
| #%% | #%% | ||||||
| # overwrite keyword column | # overwrite keyword column | ||||||
| df['keywords'] = np.nan | df['keywords'] = np.nan | ||||||
| df['keywords'] = ( | df['keywords'] = ( | ||||||
|     df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive |     df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive | ||||||
| ) | ) | ||||||
|  | df['counterKeywords'] = np.nan | ||||||
|  | df['counterKeywords'] = ( | ||||||
|  |     df['rawContent'].str.lower().str.findall('|'.join(counterKeywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive | ||||||
|  | ) | ||||||
| #%% | #%% | ||||||
| # create boolean contains_keyword column | # create boolean contains_keyword column | ||||||
| df['contains_keyword'] = True | df['contains_keyword'] = True | ||||||
|  | df['contains_counterKeyword'] = True | ||||||
| mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none' | mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none' | ||||||
| df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask | df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask | ||||||
|  | mask = (df['counterKeywords'].isna()) # select rows where no counter keyword was found (counterKeywords is NaN) | ||||||
|  | df.loc[mask,'contains_counterKeyword'] = False # mark the rows selected by mask as not containing a counter keyword | ||||||
|  |  | ||||||
| #%% | #%% | ||||||
| pd.Series(df["user.id"]).is_unique | pd.Series(df["user.id"]).is_unique | ||||||
| @@ -163,7 +184,10 @@ print(unique_usernames) | |||||||
| # senatorisakson was dropped, is ok | # senatorisakson was dropped, is ok | ||||||
| #%% | #%% | ||||||
| # create covidtweets csv | # create covidtweets csv | ||||||
| dfCov = dfAll[dfAll['contains_keyword']==True] | dfCov = dfAll[dfAll['contains_counterKeyword']==False] | ||||||
|  | dfCov = dfCov[dfCov['contains_keyword']==True] | ||||||
|  | dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords']) | ||||||
|  |  | ||||||
|  |  | ||||||
| #%% | #%% | ||||||
| # create column with tweet length | # create column with tweet length | ||||||
|   | |||||||
							
								
								
									
										17
									
								
								data/IN/counterKeywords.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								data/IN/counterKeywords.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,17 @@ | |||||||
|  | opioid | ||||||
|  | gun violence | ||||||
|  | gun-violence | ||||||
|  | CHD | ||||||
|  | Coronary heart disease | ||||||
|  | addiction | ||||||
|  | tobacco | ||||||
|  | vaping | ||||||
|  | e-cigarette | ||||||
|  | shooting | ||||||
|  | indigenous women | ||||||
|  | overdose | ||||||
|  | meth | ||||||
|  | cocaine | ||||||
|  | separated children | ||||||
|  | separating children | ||||||
|  | separating families | ||||||
		Reference in New Issue
	
	Block a user
	 Michael Beck
					Michael Beck