adds cleanTweets.py
This commit is contained in:
parent 82830f13e2
commit c64904a64d
cleanTweets.py (Normal file, 70 lines)
@@ -0,0 +1,70 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 26 20:36:43 2023

@author: michael
"""

import pandas as pd
import pyreadstat

###################
# Setup directories

# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"

# Name of new datafile generated
senCSVc = "Tweets-Cleaned"

# don't change these: input path and cleaned output paths
senCSVPath = wd + ud + senCSV             # raw input
senCSVcPath = wd + ud + senCSVc + ".csv"  # cleaned outputs use senCSVc,
senSAVcPath = wd + ud + senCSVc + ".sav"  # so they aren't written as
senDTAcPath = wd + ud + senCSVc + ".dta"  # "ALL-SENATORS-TWEETS.csv.csv" etc.
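# A minimal alternative sketch (an assumption, not used below): os.path.join
# guards against missing or doubled separators if wd or ud are edited later.
# import os
# senCSVPath = os.path.join(wd, ud, senCSV)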

df = pd.read_csv(senCSVPath)
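# Note (an assumption about the data): if 'id' ever contains missing values,
# pandas parses the column as float64 and long tweet ids lose precision;
# forcing a string dtype avoids that:
# df = pd.read_csv(senCSVPath, dtype={'id': str})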

df = df.drop(columns=['user.url', 'user.username', 'cashtags', 'coordinates', 'hashtags', 'Unnamed: 0', 'user.verified', 'lang'], index=1)  # drop unused columns (index=1 also drops the row labeled 1)
del df[df.columns[0]]  # remove first col (leftover unnamed index)
# sort and generate id
df = df.sort_values(by='date').reset_index()  # sort df by date before generating id
df["tid"] = df.index + 1  # create id column
# move id column to front
cols = list(df.columns.values)  # make a list of all of the columns in the df
cols.pop(cols.index('tid'))  # remove tid from list
cols.pop(cols.index('id'))   # remove id from list
df = df[['id', 'tid'] + cols]  # create new dataframe with ordered columns
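# Equivalent one-liner (a sketch; same result as the three steps above):
# df = df[['id', 'tid'] + [c for c in df.columns if c not in ('id', 'tid')]]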

# create keyword column
mask = (df['contains_keyword'] != 'none')  # select all rows whose contains_keyword is not 'none'
df.loc[mask, 'keywords'] = df['contains_keyword']  # set keywords = contains_keyword under the condition of mask

# recode contains_keyword to bool
mask = (df['contains_keyword'] != 'none')
df.loc[mask, 'contains_keyword'] = True
df.loc[~mask, 'contains_keyword'] = False  # ~ negates mask, selecting all rows without keywords
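# Vectorized equivalent (a sketch; would replace the three lines above, and
# must run before the recode since it compares against the string 'none'):
# df['contains_keyword'] = df['contains_keyword'] != 'none'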

pd.Series(df["id"]).is_unique  # check that tweet ids are unique (returns a bool; only visible when run interactively)
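# Hard-failing variant (an assumption about intent; the line above discards
# its result when run as a script):
# assert df['id'].is_unique, "duplicate tweet ids found"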

"""
# Export to csv, sav and dta
df.to_csv(senCSVcPath)
# pyreadstat.write_sav(df, senSAVcPath)  # commented out because the file generated is 11 GB
pyreadstat.write_dta(df, senDTAcPath)
"""
#