#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 26 20:36:43 2023

@author: michael

Clean the combined senator tweet file: drop unused columns, sort the tweets
by date, add a running id (``tid``) and split ``contains_keyword`` into a
``keywords`` column plus a boolean flag.
"""

import pandas as pd
import pyreadstat

###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"

# Name of new datafile generated
senCSVc = "Tweets-Cleaned"

# don't change this one
senCSVPath = wd + ud + senCSV
# The cleaned outputs are named after senCSVc. They used to be built from
# senCSV, which produced "ALL-SENATORS-TWEETS.csv.csv" and left senCSVc unused.
senCSVcPath = wd + ud + senCSVc + ".csv"
senSAVcPath = wd + ud + senCSVc + ".sav"
senDTAcPath = wd + ud + senCSVc + ".dta"

# Columns that are removed from the raw data before any further processing.
DROP_COLUMNS = [
    'user.url',
    'user.username',
    'cashtags',
    'coordinates',
    'hashtags',
    'Unnamed: 0',
    'user.verified',
    'lang',
]


def clean_tweets(raw):
    """Return a cleaned copy of the raw tweet table.

    Parameters
    ----------
    raw : pandas.DataFrame
        Table as read from ``senCSVPath``. It must contain every column in
        ``DROP_COLUMNS`` as well as ``date``, ``id`` and ``contains_keyword``.

    Returns
    -------
    pandas.DataFrame
        New frame (``raw`` is not modified) sorted by ``date``, with ``id``
        and the 1-based running number ``tid`` as the first two columns, a
        ``keywords`` column holding the original ``contains_keyword`` text for
        rows that matched a keyword, and ``contains_keyword`` as a boolean.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Only columns are dropped. The call used to pass index=1 as well, which
    # removes the *row* labelled 1 and therefore silently discarded one tweet.
    df = raw.drop(columns=DROP_COLUMNS)
    # NOTE(review): this removes whatever column is first after the drop
    # above; it is positional, so confirm it is still the intended column.
    del df[df.columns[0]]  # remove first col

    # Sort by date before generating the id so that tid follows date order.
    # NOTE(review): reset_index() keeps the previous row labels as a column
    # named "index"; use drop=True if that column is not wanted downstream.
    df = df.sort_values(by="date").reset_index()
    df["tid"] = df.index + 1  # 1-based running id

    # Move both id columns to the front, keeping the other columns in order.
    remaining = [col for col in df.columns if col not in ("id", "tid")]
    df = df[["id", "tid"] + remaining]

    # True where contains_keyword holds something other than "none".
    has_keyword = df["contains_keyword"] != "none"
    # Keep the original text in "keywords" for the matching rows only ...
    df.loc[has_keyword, "keywords"] = df["contains_keyword"]
    # ... and turn contains_keyword itself into a real boolean column.
    df["contains_keyword"] = has_keyword
    return df


def export_tweets(df, csv_path=senCSVcPath, dta_path=senDTAcPath):
    """Write the cleaned tweets to a CSV file and a Stata (.dta) file.

    This is not called automatically: the export was disabled in the script
    and referenced an undefined name (``df_nondupe``). The SPSS (.sav) export
    via ``pyreadstat.write_sav(df, senSAVcPath)`` is left out because the
    generated file was about 11 GB.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned table, e.g. the result of :func:`clean_tweets`.
    csv_path, dta_path : str
        Target files; default to ``senCSVcPath`` and ``senDTAcPath``.
    """
    df.to_csv(csv_path)
    pyreadstat.write_dta(df, dta_path)


if __name__ == "__main__":
    df = clean_tweets(pd.read_csv(senCSVPath))

    # The uniqueness check used to be a bare expression whose result was
    # thrown away; report it so that duplicate tweet ids are noticed.
    print("tweet ids unique:", df["id"].is_unique)

    # Export stays opt-in; call export_tweets(df) to write the files.