Corrects a lot of mistakes.
Adds keywords, analyze.py, pretest, and pretest ids.
cleanTweets.py (129 lines changed)
@@ -7,7 +7,9 @@ Created on Mon Jun 26 20:36:43 2023
"""

import pandas as pd
import pyreadstat
# import pyreadstat
# import numpy as np
from funs.ClearDupes import deDupe


###################
@@ -26,6 +28,9 @@ ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"

# Name of raw senator dataset
senDataset = "senators-raw.csv"

# Name of new datafile generated
senCSVc = "Tweets-Cleaned"

@@ -34,37 +39,135 @@ senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSV + ".csv"
senSAVcPath = wd + ud + senCSV + ".sav"
senDTAcPath = wd + ud + senCSV + ".dta"
senDatasetPath = wd + di + senDataset

df = pd.read_csv(senCSVPath)
df = pd.read_csv(senCSVPath, dtype=(object))

df = df.drop(columns=['user.url', 'user.username', 'cashtags', 'coordinates', 'hashtags', 'Unnamed: 0', 'user.verified', 'lang'], index=1)
mixed_columns = df.columns[df.nunique() != len(df)]
print(mixed_columns)
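For context: df.nunique() != len(df) flags every column whose values are not unique across rows, so the printed list shows which columns cannot serve as a row key. A toy illustration with made-up data, not from the dataset:

import pandas as pd
toy = pd.DataFrame({'id': [1, 2, 3], 'lang': ['en', 'en', 'de']})
print(toy.columns[toy.nunique() != len(toy)])  # Index(['lang'], dtype='object') -- only 'id' is unique per row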

df = df.drop(columns=['user.url', 'cashtags', 'coordinates', 'hashtags', 'Unnamed: 0', 'user.verified', 'lang', 'renderedContent', 'retweetedTweet', 'sourceLabel', 'sourceUrl', 'source'], index=1)
del df[df.columns[0]] # remove first col

df['user.created'] = pd.to_datetime(df['user.created'])
df['date'] = pd.to_datetime(df['date'])

#%%
# sort and generate id
df = df.sort_values(by='date').reset_index() # sort df by date before generating id
df["tid"] = df.index + 1 # create id column

#%%
# move id column to front
cols = list(df.columns.values) # Make a list of all of the columns in the df
cols.pop(cols.index('tid')) # Remove tid from list
cols.pop(cols.index('id')) # Remove id from list
df = df[['id','tid']+cols] # Create new dataframe with ordered columns
#cols.pop(cols.index('user')) # Remove user from list
df = df[['tid']+cols] # Create new dataframe with ordered columns
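As an aside, pandas has a shorter idiom for moving a single column to the front; a sketch, not what this commit does:

df.insert(0, 'tid', df.pop('tid'))  # pop removes the column, insert re-adds it at position 0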

# create keyword column
#%%
###################
# Keywords
# read additional keywords from a file and write to list.
keywords = []
# Remove duplicate keywords and save all non-duplicates to 'data/keywords.txt'
deDupe(f"{di}keywords-raw.txt", f"{di}keywords.txt")
# Read the keywords from a file
with open(f"{di}own_keywords.txt", "r") as file:
    lines = file.readlines()
for line in lines:
    keyword = line.strip() # Remove the newline character
    keywords.append(keyword)
# read the raw keywords from file
with open(f"{di}keywords-raw.txt", "r") as file:
    lines = file.readlines()
for line in lines:
    keyword = line.strip() # Remove the newline character
    keywords.append(keyword)
# write all keywords to file
with open(f"{di}keywords.txt", "w") as file:
    print("read keyword files")
    for line in keywords:
        file.write(f'{line}\n')
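funs/ClearDupes.py itself is not part of this diff. Judging from the call site, deDupe(infile, outfile) presumably drops duplicate lines while preserving order; a hypothetical reconstruction:

def deDupe(infile, outfile):
    """Copy infile to outfile, skipping duplicate lines (order preserved).
    Hypothetical sketch -- the real implementation may differ."""
    seen = set()
    with open(infile, "r") as fin, open(outfile, "w") as fout:
        for line in fin:
            if line not in seen:
                seen.add(line)
                fout.write(line)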

#%%
# overwrite keyword column
df['contains_keyword'] = ''
df['contains_keyword'] = (
    df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', 'none')
)
mask = (df['contains_keyword'] != 'none') # select all rows in which contains_keyword != 'none'
df.loc[mask,'keywords'] = df['contains_keyword'] # set keywords = contains_keyword under the condition of mask
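One caveat: '|'.join(keywords) is interpreted as a regular expression, so a keyword containing a metacharacter such as '+' or '(' would change the pattern or raise an error. A defensive variant, should that ever matter:

import re
pattern = '|'.join(re.escape(k) for k in keywords)  # match every keyword literally
df['contains_keyword'] = (
    df['rawContent'].str.findall(pattern).str.join(',').replace('', 'none')
)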

#%%
# create bool contains_keyword
df['contains_keyword'] = ~pd.isnull(df['keywords']) # create boolean column
#%%
# recode contains keyword to bool
mask = (df['contains_keyword'] != 'none')
df.loc[mask,'contains_keyword'] = True
df.loc[~mask,'contains_keyword'] = False # ~ negates mask, selecting all values that do not contain keywords
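The diff apparently keeps both the old and the new version of this step; in current pandas the boolean column can be produced in a single line:

df['contains_keyword'] = df['keywords'].notna()  # True wherever a keyword was found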

pd.Series(df["id"]).is_unique
pd.Series(df["user.id"]).is_unique

"""
#%%
# Merge Datasets
# get senator data
cols = [
    "name",
    "id",
    "state_short",
    "party",
    "class",
    "ideology",
    "start_serving",
    "end_serving",
    "time_in_office",
    "not_in_office",
    "last_congress",
    "vote_share",
    "next_closest_share",
    "election_year",
    "twitter_handle",
    "alt_handle",
    "date_of_birth",
    "female",
    "ethnicity",
    "edu_level",
    "edu_information",
    "occup_level"]

dfSenA = pd.read_csv(senDatasetPath, index_col=False, sep = ",", usecols=cols).reset_index()
dfSenB = pd.read_csv(senDatasetPath, index_col=False, sep = ",", usecols=cols).reset_index()

dfSenA['alt'] = False
dfSenB['alt'] = True

dfSenA = dfSenA.rename(columns={'twitter_handle': 'user.username'})
dfSenB = dfSenB.rename(columns={'alt_handle': 'user.username'})
dfSenB = dfSenB.dropna(axis=0, subset=['user.username'])

dfSenA['user.username'] = dfSenA['user.username'].apply(str.lower)
dfSenB['user.username'] = dfSenB['user.username'].apply(str.lower)
df['user.username'] = df['user.username'].apply(str.lower)

dfSenAll = pd.concat([dfSenA, dfSenB]).reset_index()

# %%
# see if all senators are present in file
dfAll = df.merge(dfSenAll, how='left',on='user.username')
#check merge
unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique()
print(unique_usernames)
# senatorisakson was dropped, is ok
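A sketch of the same sanity check using pandas' built-in merge diagnostics, not part of the commit:

check = df.merge(dfSenAll, how='left', on='user.username', indicator=True)
print(check.loc[check['_merge'] == 'left_only', 'user.username'].unique())  # handles with no senator data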

#%%
# Export to csv, sav and dta
df_nondupe.to_csv(senCSVcPath)
dfAll.to_csv(senCSVcPath, encoding='utf-8')
# pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb
pyreadstat.write_dta(df, senDTAcPath)
"""
#
# =============================================================================
# dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
# dfAllStata = dfAll.rename(columns={'class':'class_'})
# dfAllStata.to_stata(senDTAcPath, version=119, convert_strl=['alt'], convert_dates={'date': 'td', 'user_created': 'td'})
# print(dfAllStata.columns)
# =============================================================================
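For reference: Stata variable names may contain only letters, digits, and underscores, hence the dot-to-underscore rename above, and 'td' in convert_dates marks a column as a Stata daily date. A minimal sketch on toy data (file name hypothetical):

import pandas as pd
toy = pd.DataFrame({'user.created': pd.to_datetime(['2023-06-26'])})
toy = toy.rename(columns=lambda c: c.replace('.', '_'))  # make names Stata-safe
toy.to_stata('toy.dta', version=119, convert_dates={'user_created': 'td'})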

# %%