Corrects a lot of mistakes.
Adds keywords, analyze.py, pretest, and pretest ids.
cleanTweets.py (129 lines changed)
@@ -7,7 +7,9 @@ Created on Mon Jun 26 20:36:43 2023
"""

import pandas as pd
import pyreadstat
# import pyreadstat
# import numpy as np
from funs.ClearDupes import deDupe


###################
@@ -26,6 +28,9 @@ ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"

# Name of raw senator dataset
senDataset = "senators-raw.csv"

# Name of new datafile generated
senCSVc = "Tweets-Cleaned"

@@ -34,37 +39,135 @@ senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSV + ".csv"
senSAVcPath = wd + ud + senCSV + ".sav"
senDTAcPath = wd + ud + senCSV + ".dta"
senDatasetPath = wd + di + senDataset

df = pd.read_csv(senCSVPath)
df = pd.read_csv(senCSVPath, dtype=(object))

df = df.drop(columns=['user.url', 'user.username', 'cashtags', 'coordinates', 'hashtags', 'Unnamed: 0', 'user.verified', 'lang'], index=1)
mixed_columns = df.columns[df.nunique() != len(df)]
print(mixed_columns)
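For context: df.nunique() != len(df) flags every column whose values are not unique across rows, so the printed list shows which columns cannot serve as a row key. A toy illustration with made-up data, not from the dataset:

import pandas as pd
toy = pd.DataFrame({'id': [1, 2, 3], 'lang': ['en', 'en', 'de']})
print(toy.columns[toy.nunique() != len(toy)])  # Index(['lang'], dtype='object') -- only 'id' is unique per row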

df = df.drop(columns=['user.url', 'cashtags', 'coordinates', 'hashtags', 'Unnamed: 0', 'user.verified', 'lang', 'renderedContent', 'retweetedTweet', 'sourceLabel', 'sourceUrl', 'source'], index=1)
del df[df.columns[0]] # remove first col

df['user.created'] = pd.to_datetime(df['user.created'])
df['date'] = pd.to_datetime(df['date'])

#%%
# sort and generate id
df = df.sort_values(by='date').reset_index() # sort df by date before generating id
df["tid"] = df.index + 1 # create id column

#%%
# move id column to front
cols = list(df.columns.values) # Make a list of all of the columns in the df
cols.pop(cols.index('tid')) # Remove tid from list
cols.pop(cols.index('id')) # Remove id from list
df = df[['id','tid']+cols] # Create new dataframe with ordered columns
#cols.pop(cols.index('user')) # Remove user from list
df = df[['tid']+cols] # Create new dataframe with ordered columns
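As an aside, pandas has a shorter idiom for moving a single column to the front; a sketch, not what this commit does:

df.insert(0, 'tid', df.pop('tid'))  # pop removes the column, insert re-adds it at position 0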

# create keyword column
#%%
###################
# Keywords
# read additional keywords from a file and write to list.
keywords = []
# Remove duplicate keywords and save all non-duplicates to 'data/keywords.txt'
deDupe(f"{di}keywords-raw.txt", f"{di}keywords.txt")
# Read the keywords from a file
with open(f"{di}own_keywords.txt", "r") as file:
    lines = file.readlines()
for line in lines:
    keyword = line.strip() # Remove the newline character
    keywords.append(keyword)
# read the raw keywords from file
with open(f"{di}keywords-raw.txt", "r") as file:
    lines = file.readlines()
for line in lines:
    keyword = line.strip() # Remove the newline character
    keywords.append(keyword)
# write all keywords to file
with open(f"{di}keywords.txt", "w") as file:
    print("read keyword files")
    for line in keywords:
        file.write(f'{line}\n')
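funs/ClearDupes.py itself is not part of this diff. Judging from the call site, deDupe(infile, outfile) presumably drops duplicate lines while preserving order; a hypothetical reconstruction:

def deDupe(infile, outfile):
    """Copy infile to outfile, skipping duplicate lines (order preserved).
    Hypothetical sketch -- the real implementation may differ."""
    seen = set()
    with open(infile, "r") as fin, open(outfile, "w") as fout:
        for line in fin:
            if line not in seen:
                seen.add(line)
                fout.write(line)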

#%%
# overwrite keyword column
df['contains_keyword'] = ''
df['contains_keyword'] = (
    df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', 'none')
)
mask = (df['contains_keyword'] != 'none') # select all rows in which contains_keyword != 'none'
df.loc[mask,'keywords'] = df['contains_keyword'] # set keywords = contains_keyword under the condition of mask
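One caveat: '|'.join(keywords) is interpreted as a regular expression, so a keyword containing a metacharacter such as '+' or '(' would change the pattern or raise an error. A defensive variant, should that ever matter:

import re
pattern = '|'.join(re.escape(k) for k in keywords)  # match every keyword literally
df['contains_keyword'] = (
    df['rawContent'].str.findall(pattern).str.join(',').replace('', 'none')
)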

#%%
# create bool contains_keyword
df['contains_keyword'] = ~pd.isnull(df['keywords']) # create boolean column
#%%
# recode contains keyword to bool
mask = (df['contains_keyword'] != 'none')
df.loc[mask,'contains_keyword'] = True
df.loc[~mask,'contains_keyword'] = False # ~ negates mask, selecting all values that do not contain keywords
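The diff apparently keeps both the old and the new version of this step; in current pandas the boolean column can be produced in a single line:

df['contains_keyword'] = df['keywords'].notna()  # True wherever a keyword was found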

pd.Series(df["id"]).is_unique
pd.Series(df["user.id"]).is_unique

"""
#%%
# Merge Datasets
# get senator data
cols = [
    "name",
    "id",
    "state_short",
    "party",
    "class",
    "ideology",
    "start_serving",
    "end_serving",
    "time_in_office",
    "not_in_office",
    "last_congress",
    "vote_share",
    "next_closest_share",
    "election_year",
    "twitter_handle",
    "alt_handle",
    "date_of_birth",
    "female",
    "ethnicity",
    "edu_level",
    "edu_information",
    "occup_level"]

dfSenA = pd.read_csv(senDatasetPath, index_col=False, sep = ",", usecols=cols).reset_index()
dfSenB = pd.read_csv(senDatasetPath, index_col=False, sep = ",", usecols=cols).reset_index()

dfSenA['alt'] = False
dfSenB['alt'] = True

dfSenA = dfSenA.rename(columns={'twitter_handle': 'user.username'})
dfSenB = dfSenB.rename(columns={'alt_handle': 'user.username'})
dfSenB = dfSenB.dropna(axis=0, subset=['user.username'])

dfSenA['user.username'] = dfSenA['user.username'].apply(str.lower)
dfSenB['user.username'] = dfSenB['user.username'].apply(str.lower)
df['user.username'] = df['user.username'].apply(str.lower)

dfSenAll = pd.concat([dfSenA, dfSenB]).reset_index()

# %%
# see if all senators are present in file
dfAll = df.merge(dfSenAll, how='left',on='user.username')
#check merge
unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique()
print(unique_usernames)
# senatorisakson was dropped, is ok
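A sketch of the same sanity check using pandas' built-in merge diagnostics, not part of the commit:

check = df.merge(dfSenAll, how='left', on='user.username', indicator=True)
print(check.loc[check['_merge'] == 'left_only', 'user.username'].unique())  # handles with no senator data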

#%%
# Export to csv, sav and dta
df_nondupe.to_csv(senCSVcPath)
dfAll.to_csv(senCSVcPath, encoding='utf-8')
# pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb
pyreadstat.write_dta(df, senDTAcPath)
"""
#
# =============================================================================
# dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
# dfAllStata = dfAll.rename(columns={'class':'class_'})
# dfAllStata.to_stata(senDTAcPath, version=119, convert_strl=['alt'], convert_dates={'date': 'td', 'user_created': 'td'})
# print(dfAllStata.columns)
# =============================================================================
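For reference: Stata variable names may contain only letters, digits, and underscores, hence the dot-to-underscore rename above, and 'td' in convert_dates marks a column as a Stata daily date. A minimal sketch on toy data (file name hypothetical):

import pandas as pd
toy = pd.DataFrame({'user.created': pd.to_datetime(['2023-06-26'])})
toy = toy.rename(columns=lambda c: c.replace('.', '_'))  # make names Stata-safe
toy.to_stata('toy.dta', version=119, convert_dates={'user_created': 'td'})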

# %%