adds cleanTweets.py

2023-06-26 23:51:32 +02:00
parent 82830f13e2
commit c64904a64d
1 changed files with 70 additions and 0 deletions
--- a/cleanTweets.py
+++ b/cleanTweets.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 26 20:36:43 2023
+
+@author: michael
+"""
+
+import pandas as pd
+import pyreadstat
+
+
+###################
+# Setup directories
+# Working directory (Michael's machine); must end with "/" because paths are concatenated
+wd = "/home/michael/Documents/PS/Data/collectTweets/"
+# Working directory (server) - uncomment to run there
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# Datafile input directory, relative to wd
+di = "data/IN/"
+
+# Tweet-datafile output directory, relative to wd
+ud = "data/OUT/"
+
+# Name of the file holding all senator tweets (read as input by this script, from ud)
+senCSV = "ALL-SENATORS-TWEETS.csv"
+
+# Base name (without extension) of the cleaned datafiles that are generated
+senCSVc = "Tweets-Cleaned"
+
+# Derived paths - don't change these. Output paths are built from senCSVc, not senCSV.
+senCSVPath = wd + ud + senCSV
+senCSVcPath = wd + ud + senCSVc + ".csv"
+senSAVcPath = wd + ud + senCSVc + ".sav"
+senDTAcPath = wd + ud + senCSVc + ".dta"
+
+df = pd.read_csv(senCSVPath)
+
+# Drop unneeded columns only (no `index=` argument: that would also delete the row labelled 1).
+df = df.drop(columns=['user.url', 'user.username', 'cashtags', 'coordinates', 'hashtags', 'Unnamed: 0', 'user.verified', 'lang'])
+del df[df.columns[0]]  # remove whichever column is first now (positional) - NOTE(review): confirm which one
+# Sort by date first so tid follows chronological order; drop=True keeps the
+# old row labels from being added as an extra "index" column.
+df = df.sort_values(by='date').reset_index(drop=True)
+df["tid"] = df.index + 1  # running tweet id, starting at 1
+# Move id and tid to the front, keeping the remaining columns in their order
+cols = [c for c in df.columns if c not in ('id', 'tid')]
+df = df[['id', 'tid'] + cols]
+
+# create keyword column
+mask = df['contains_keyword'] != 'none'  # True for every row whose contains_keyword is not 'none'
+df.loc[mask, 'keywords'] = df['contains_keyword']  # copy the value for those rows; other rows stay NaN
+
+# recode contains_keyword to bool; assigning the mask itself gives the column
+# a real bool dtype instead of object values, and avoids computing the mask twice
+df['contains_keyword'] = mask
+
+# report whether tweet ids are unique (a bare `.is_unique` expression shows nothing when run as a script)
+print("Tweet ids unique:", df["id"].is_unique)
+
+# Export to csv, sav and dta.
+# Disabled for now: uncomment the calls below to write the files.
+# NOTE(review): ids are not deduplicated in this script, so `df` is exported as is.
+# df.to_csv(senCSVcPath)
+# pyreadstat.write_sav(df, senSAVcPath)  # left out because the generated file is 11 gb
+# pyreadstat.write_dta(df, senDTAcPath)
+#
+
+