diff --git a/cleanTweets.py b/cleanTweets.py
new file mode 100644
index 0000000..f902dff
--- /dev/null
+++ b/cleanTweets.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 26 20:36:43 2023
+
+@author: michael
+"""
+
+import pandas as pd
+import pyreadstat
+
+
+###################
+# Setup directories
+# WD Michael
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# datafile input directory
+di = "data/IN/"
+
+# Tweet-datafile output directory
+ud = "data/OUT/"
+
+# Name of file that all senator data will be written to
+senCSV = "ALL-SENATORS-TWEETS.csv"
+
+# Name of new datafile generated
+senCSVc = "Tweets-Cleaned"
+
+# don't change these
+senCSVPath = wd + ud + senCSV
+senCSVcPath = wd + ud + senCSVc + ".csv"
+senSAVcPath = wd + ud + senCSVc + ".sav"
+senDTAcPath = wd + ud + senCSVc + ".dta"
+
+df = pd.read_csv(senCSVPath)
+
+# drop columns that are not needed for analysis
+df = df.drop(columns=['user.url', 'user.username', 'cashtags', 'coordinates', 'hashtags', 'Unnamed: 0', 'user.verified', 'lang'])
+del df[df.columns[0]]  # remove remaining unnamed index column
+
+# sort and generate id
+df = df.sort_values(by='date').reset_index(drop=True)  # sort df by date before generating id; drop=True discards the old index instead of keeping it as a column
+df["tid"] = df.index + 1  # create id column
+
+# move id columns to front
+cols = list(df.columns.values)  # make a list of all columns in the df
+cols.pop(cols.index('tid'))  # remove tid from list
+cols.pop(cols.index('id'))  # remove id from list
+df = df[['id', 'tid'] + cols]  # create new dataframe with ordered columns
+
+# create keyword column
+mask = (df['contains_keyword'] != 'none')  # select all rows in which contains_keyword != 'none'
+df.loc[mask, 'keywords'] = df['contains_keyword']  # set keywords = contains_keyword for the rows selected by mask
+
+# recode contains_keyword to bool, reusing the same mask
+df.loc[mask, 'contains_keyword'] = True
+df.loc[~mask, 'contains_keyword'] = False  # ~ negates mask, selecting all rows that do not contain keywords
+
+print(df["id"].is_unique)  # sanity check: tweet ids should be unique
+
+"""
+# Export to csv, sav and dta
+df.to_csv(senCSVcPath)
+# pyreadstat.write_sav(df, senSAVcPath)  # commented out because the file generated is 11 GB
+pyreadstat.write_dta(df, senDTAcPath)
+"""
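
For review: a minimal sanity-check sketch that could be run after the export block above is uncommented. This is illustrative only; the filename verify_clean.py and the exact assertions are hypothetical and not part of the commit, and it assumes the paths and column names constructed in the script above.

# verify_clean.py -- hypothetical check script, not part of this commit;
# assumes Tweets-Cleaned.csv has been written to data/OUT/ as above
import pandas as pd

wd = "/home/michael/Documents/PS/Data/collectTweets/"
ud = "data/OUT/"

# to_csv writes the index as the first unnamed column; index_col=0 recovers it
df = pd.read_csv(wd + ud + "Tweets-Cleaned.csv", index_col=0)

# ids must be unique, and tid must follow date order since it was
# generated from the index after sorting by date
assert df["id"].is_unique, "duplicate tweet ids found"
assert df["tid"].is_monotonic_increasing, "tid should increase with date order"

# contains_keyword should round-trip as a pure True/False column
# (read_csv infers bool dtype only when every value is True/False)
assert df["contains_keyword"].dtype == bool, "contains_keyword is not boolean"

print(f"{len(df)} rows passed all checks")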