CollectUSSenatorTweets/cleanTweets.py


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 26 20:36:43 2023
@author: michael
"""
import pandas as pd
# import pyreadstat
import numpy as np
from funs.ClearDupes import deDupe
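# Note: deDupe comes from the local helper module funs/ClearDupes.py (not shown
# here). Judging from its usage below, deDupe(infile, outfile) reads lines from
# infile, drops duplicates, and writes the unique lines to outfile -- a minimal
# sketch of that assumed behaviour, not the actual implementation:
#   def deDupe(inPath, outPath):
#       with open(inPath, "r") as fIn:
#           unique = list(dict.fromkeys(line.strip() for line in fIn))
#       with open(outPath, "w") as fOut:
#           fOut.writelines(f"{line}\n" for line in unique)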
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"
# Name of the raw senator dataset used as input
senDataset = "senators-raw.csv"
# Name of new datafile generated
senCSVc = "SenatorsTweets-Final"
senCSVcCov = "SenatorsTweets-OnlyCov"
# don't change these path definitions
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc + ".csv"
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
senSAVcPath = wd + ud + senCSV + ".sav"
senDTAcPath = wd + ud + senCSV + ".dta"
senDatasetPath = wd + di + senDataset
df = pd.read_csv(senCSVPath, dtype=(object))
mixed_columns = df.columns[df.nunique() != len(df)] # columns whose values are not unique across all rows
print(mixed_columns)
# drop columns that are not needed further, and the row with index label 1
df = df.drop(columns=['user.url', 'cashtags', 'coordinates', 'hashtags', 'Unnamed: 0', 'user.verified', 'lang', 'renderedContent', 'retweetedTweet', 'sourceLabel', 'sourceUrl', 'source'], index=1)
del df[df.columns[0]] # remove first col
df['user.created'] = pd.to_datetime(df['user.created'])
df['date'] = pd.to_datetime(df['date'])
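# Optional sanity check (not part of the original pipeline): confirm the dates
# parsed correctly, e.g.
#   print(df['date'].min(), df['date'].max())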
#%%
# sort and generate id
df = df.sort_values(by='date').reset_index() # sort df by date before generating id
df["tid"] = df.index + 1 # create id column
#%%
# move id column to front
cols = list(df.columns.values) # Make a list of all of the columns in the df
cols.pop(cols.index('tid')) # Remove id from list
#cols.pop(cols.index('user')) # Remove id from list
df = df[['tid']+cols] # Create new dataframe with reordered columns
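# An equivalent shorter idiom would be df.insert(0, 'tid', df.pop('tid')),
# which removes 'tid' and re-inserts it at position 0 (alternative, not used here).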
#%%
###################
# Keywords
# read additional keywords from a file and write to list.
keywords = []
# Remove duplicate keywords and save all non-duplicates to 'data/IN/keywords.txt'
deDupe(f"{di}keywords-raw.txt", f"{di}keywords.txt")
# Read the own keywords from a file
with open(f"{di}own_keywords.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip() # Remove the newline character
        keywords.append(keyword)
# Read the additional (raw) keywords and append them to the same list
with open(f"{di}keywords-raw.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip() # Remove the newline character
        keywords.append(keyword)
# delete the keywords 'ppe' and 'china' that lead to too many false positives
removeWords = {'ppe', 'china'}
keywords = [x.lower() for x in keywords] # convert to lowercase, which makes the search case-insensitive
keywords = [item for item in keywords if item not in removeWords] # remove the false-positive words
# write the cleaned keywords to file
with open(f"{di}keywords.txt", "w") as file:
    print("write keyword file")
    for line in keywords:
        file.write(f'{line}\n')
# counter keywords
# Read the counter keywords from a file
counterKeywords = []
with open(f"{di}counterKeywords.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        counterKeyword = line.strip() # Remove the newline character
        counterKeywords.append(counterKeyword)
counterKeywords = set(x.lower() for x in counterKeywords) # lowercase for case-insensitive search; set removes duplicates
# write the final counter keywords to file
with open(f"{di}counterKeywordsFinal.txt", "w") as file:
    print("write counter keyword file")
    for line in counterKeywords:
        file.write(f'{line}\n')
#%%
# overwrite keyword column
df['keywords'] = np.nan
df['keywords'] = (
df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
)
df['counterKeywords'] = np.nan
df['counterKeywords'] = (
df['rawContent'].str.lower().str.findall('|'.join(counterKeywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
)
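# Note on the matching above: '|'.join(keywords) is interpreted as a regex
# alternation, so keywords also match as substrings of longer words and any
# regex metacharacter inside a keyword is treated as syntax. A stricter,
# whole-word variant would look roughly like this (sketch, not applied here):
#   import re
#   pattern = r'\b(?:' + '|'.join(map(re.escape, keywords)) + r')\b'
#   df['keywords'] = df['rawContent'].str.lower().str.findall(pattern).str.join(',').replace('', np.nan)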
#%%
# create boolean contains_keyword column
df['contains_keyword'] = True
df['contains_counterKeyword'] = True
mask = (df['keywords'].isna()) # select all rows without a keyword match
df.loc[mask,'contains_keyword'] = False # set contains_keyword to False for those rows
mask = (df['counterKeywords'].isna()) # select all rows without a counter-keyword match
df.loc[mask,'contains_counterKeyword'] = False # set contains_counterKeyword to False for those rows
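# The two flag columns could equivalently be derived in one step each, e.g.
#   df['contains_keyword'] = df['keywords'].notna()
#   df['contains_counterKeyword'] = df['counterKeywords'].notna()
# (alternative formulation, not used above).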
#%%
print(pd.Series(df["user.id"]).is_unique) # sanity check: is user.id unique? (expected: False, each senator has many tweets)
#%%
# Merge Datasets
# get senator data
cols = [
    "name",
    "id",
    "state_short",
    "party",
    "class",
    "ideology",
    "start_serving",
    "end_serving",
    "time_in_office",
    "not_in_office",
    "last_congress",
    "vote_share",
    "next_closest_share",
    "election_year",
    "twitter_handle",
    "alt_handle",
    "date_of_birth",
    "female",
    "ethnicity",
    "edu_level",
    "edu_information",
    "occup_level"]
# read the senator dataset twice: dfSenA keeps the main twitter_handle, dfSenB the alt_handle
dfSenA = pd.read_csv(senDatasetPath, index_col=False, sep=",", usecols=cols).reset_index()
dfSenB = pd.read_csv(senDatasetPath, index_col=False, sep=",", usecols=cols).reset_index()
dfSenA['alt'] = False
dfSenB['alt'] = True
dfSenA = dfSenA.rename(columns={'twitter_handle': 'user.username'})
dfSenB = dfSenB.rename(columns={'alt_handle': 'user.username'})
dfSenB = dfSenB.dropna(axis=0, subset=['user.username'])
dfSenA['user.username'] = dfSenA['user.username'].apply(str.lower)
dfSenB['user.username'] = dfSenB['user.username'].apply(str.lower)
df['user.username'] = df['user.username'].apply(str.lower)
dfSenAll = pd.concat([dfSenA, dfSenB]).reset_index()
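# Optional check (not in the original script): a handle that appears both as
# twitter_handle and alt_handle would duplicate tweets in the merge below, e.g.
#   print(dfSenAll['user.username'].duplicated().sum(), "duplicated handles")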
# %%
# see if all senators are present in file
dfAll = df.merge(dfSenAll, how='left',on='user.username')
#check merge
unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique()
print(unique_usernames)
# senatorisakson was dropped, is ok
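# The reverse check -- senator handles for which no tweets were collected --
# could be done like this (optional, not in the original script):
#   print(set(dfSenAll['user.username']) - set(df['user.username']))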
#%%
# create covidtweets csv
dfCov = dfAll[dfAll['contains_counterKeyword']==False]
dfCov = dfCov[dfCov['contains_keyword']==True]
dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
#%%
# create column with tweet length
dfCov['tweetLen'] = dfCov['rawContent'].str.len() # tweet length in characters
# reset df index (written out as the 'id' column when exporting dfCov below)
dfCov.reset_index(drop=True, inplace=True)
#%%
# Export to csv, sav and dta
dfAll.to_csv(senCSVcPath, encoding='utf-8')
dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id')
# pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb
# =============================================================================
# dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
# dfAllStata = dfAll.rename(columns={'class':'class_'})
# dfAllStata.to_stata(senDTAcPath, version=119, convert_strl=['alt'], convert_dates={'date': 'td', 'user_created': 'td'})
# print(dfAllStata.columns)
# =============================================================================
# %%