#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 26 20:36:43 2023

@author: michael
"""
import pandas as pd
# import pyreadstat
import numpy as np

from funs.ClearDupes import deDupe

# Seed for training dataset generation
seed = 86431891

###################
# Setup directories

# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# Datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator tweets will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"

# Name of the raw senator dataset
senDataset = "senators-raw.csv"

# Names of the new datafiles generated
senCSVc = "SenatorsTweets-Final"
senCSVcCov = "SenatorsTweets-OnlyCov"
senCSVcTrain = "SenatorsTweets-Training"

# don't change these
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc + ".csv"
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
senCSVcTrainPath = wd + ud + senCSVcTrain + ".csv"
senSAVcPath = wd + ud + senCSV + ".sav"
senDTAcPath = wd + ud + senCSV + ".dta"
senDatasetPath = wd + di + senDataset

df = pd.read_csv(senCSVPath, dtype=object)

# Columns whose values are not all unique (quick sanity check on the raw file)
mixed_columns = df.columns[df.nunique() != len(df)]
print(mixed_columns)

# Drop columns that are not needed for the analysis
df = df.drop(
    columns=['user.url', 'cashtags', 'coordinates', 'hashtags', 'Unnamed: 0',
             'user.verified', 'lang', 'renderedContent', 'retweetedTweet',
             'sourceLabel', 'sourceUrl', 'source'],
    index=1)
del df[df.columns[0]]  # remove first column
df['user.created'] = pd.to_datetime(df['user.created'])
df['date'] = pd.to_datetime(df['date'])

#%%
# Sort and generate id
df = df.sort_values(by='date').reset_index()  # sort df by date before generating id
df["tid"] = df.index + 1  # create id column

#%%
# Move id column to front
cols = list(df.columns.values)   # make a list of all columns in the df
cols.pop(cols.index('tid'))      # remove id from the list
# cols.pop(cols.index('user'))   # remove user from the list
df = df[['tid'] + cols]          # create new dataframe with ordered columns

#%%
###################
# Keywords

# Read additional keywords from files and collect them in a list.
keywords = []

# Remove duplicate keywords and save all non-duplicates to 'data/IN/keywords.txt'
deDupe(f"{di}keywords-raw.txt", f"{di}keywords.txt")

# Read own keywords from a file
with open(f"{di}own_keywords.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip()  # remove the newline character
        keywords.append(keyword)

# Read the deduplicated raw keywords as well
with open(f"{di}keywords-raw.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        keyword = line.strip()  # remove the newline character
        keywords.append(keyword)

# Delete the keywords 'ppe' and 'china' that lead to too many false positives
removeWords = {'ppe', 'china'}
keywords = [x.lower() for x in keywords]  # convert to lowercase to make the search case-insensitive
keywords = [item for item in keywords if item not in removeWords]  # remove false-positive keywords

# Write all keywords to file
with open(f"{di}keywords.txt", "w") as file:
    print("write keyword file")
    for line in keywords:
        file.write(f'{line}\n')

# Counter keywords
# Read the counter keywords from a file
counterKeywords = []
with open(f"{di}counterKeywords.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        counterKeyword = line.strip()  # remove the newline character
        counterKeywords.append(counterKeyword)
# Convert to lowercase (case-insensitive search) and to a set to drop duplicates and speed up comparison
counterKeywords = set([x.lower() for x in counterKeywords])

# Write the final counter keywords to file
with open(f"{di}counterKeywordsFinal.txt", "w") as file:
    print("write counter keyword file")
    for line in counterKeywords:
        file.write(f'{line}\n')
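# ---------------------------------------------------------------------------
# Note: funs.ClearDupes is not included in this file. Below is a minimal
# sketch of what deDupe() presumably does, based on how it is called above
# (read one keyword per line, drop duplicate lines, write the unique lines to
# the output file). This is an assumption for documentation only, not the
# actual module:
#
# def deDupe(infile, outfile):
#     seen = set()
#     with open(infile, "r") as fin, open(outfile, "w") as fout:
#         for line in fin:
#             keyword = line.strip()
#             if keyword and keyword not in seen:
#                 seen.add(keyword)
#                 fout.write(f"{keyword}\n")
# ---------------------------------------------------------------------------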
#%%
# Overwrite the keyword columns: extract all matching (counter) keywords from the tweet text
df['keywords'] = np.nan
df['keywords'] = (
    df['rawContent'].str.lower()  # lowercase to make the search case-insensitive
    .str.findall('|'.join(keywords))
    .str.join(',')
    .replace('', np.nan)
)
df['counterKeywords'] = np.nan
df['counterKeywords'] = (
    df['rawContent'].str.lower()  # lowercase to make the search case-insensitive
    .str.findall('|'.join(counterKeywords))
    .str.join(',')
    .replace('', np.nan)
)

#%%
# Create boolean contains_keyword / contains_counterKeyword columns
df['contains_keyword'] = True
df['contains_counterKeyword'] = True
mask = (df['keywords'].isna())  # rows in which no keyword was found
df.loc[mask, 'contains_keyword'] = False
mask = (df['counterKeywords'].isna())  # rows in which no counter keyword was found
df.loc[mask, 'contains_counterKeyword'] = False
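# ---------------------------------------------------------------------------
# How the keyword flagging above works (illustrative sketch, not part of the
# pipeline): str.findall() interprets '|'.join(keywords) as a regular
# expression, collects every keyword occurrence and joins them with commas;
# tweets without any match become NaN and are flagged False.
# Assumption: if a keyword ever contained regex metacharacters such as '+'
# or '(', it would need to be escaped first, e.g. with re.escape():
#
# import re
# toy = pd.Series(["Money for COVID vaccines", "Infrastructure bill"])
# pattern = '|'.join(re.escape(k) for k in ["covid", "vaccine"])
# toy.str.lower().str.findall(pattern).str.join(',').replace('', np.nan)
# # -> "covid,vaccine" for the first tweet, NaN for the second
# ---------------------------------------------------------------------------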
#%%
pd.Series(df["user.id"]).is_unique  # check whether user ids are unique

#%%
# Merge datasets
# get senator data
cols = [
    "name",
    "id",
    "state_short",
    "party",
    "class",
    "ideology",
    "start_serving",
    "end_serving",
    "time_in_office",
    "not_in_office",
    "last_congress",
    "vote_share",
    "next_closest_share",
    "election_year",
    "twitter_handle",
    "alt_handle",
    "date_of_birth",
    "female",
    "ethnicity",
    "edu_level",
    "edu_information",
    "occup_level"]

dfSenA = pd.read_csv(senDatasetPath, index_col=False, sep=",", usecols=cols).reset_index()
dfSenB = pd.read_csv(senDatasetPath, index_col=False, sep=",", usecols=cols).reset_index()

dfSenA['alt'] = False  # main accounts
dfSenB['alt'] = True   # alt accounts

dfSenA = dfSenA.rename(columns={'twitter_handle': 'user.username'})
dfSenB = dfSenB.rename(columns={'alt_handle': 'user.username'})
dfSenB = dfSenB.dropna(axis=0, subset=['user.username'])  # drop senators without an alt account

# Lowercase all usernames before merging
dfSenA['user.username'] = dfSenA['user.username'].apply(str.lower)
dfSenB['user.username'] = dfSenB['user.username'].apply(str.lower)
df['user.username'] = df['user.username'].apply(str.lower)

dfSenAll = pd.concat([dfSenA, dfSenB]).reset_index()

# %%
# See if all senators are present in the file
dfAll = df.merge(dfSenAll, how='left', on='user.username')

# check merge
unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique()
print(unique_usernames)
# senatorisakson was dropped, is ok

#%%
# Create covid tweets csv
dfCov = dfAll[dfAll['contains_counterKeyword'] == False]
dfCov = dfCov[dfCov['contains_keyword'] == True]
dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])

#%%
# Create column with tweet length
dfCov['tweetLen'] = dfCov['rawContent'].str.len().copy()

# Reset df index; it is written to the id column on export
dfCov.reset_index(drop=True, inplace=True)

#%%
# Export to csv, sav and dta
dfAll.to_csv(senCSVcPath, encoding='utf-8')
dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label='id')
# pyreadstat.write_sav(df, senSAVcPath)  # commented out because the generated file is 11 GB

# =============================================================================
# dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
# dfAllStata = dfAll.rename(columns={'class':'class_'})
# dfAllStata.to_stata(senDTAcPath, version=119, convert_strl=['alt'], convert_dates={'date': 'td', 'user_created': 'td'})
# print(dfAllStata.columns)
# =============================================================================
# df.id.str.len().value_counts()

# %%
# Create training dataset: draw a reproducible random sample of 1800 covid tweets
np.random.seed(seed)
dfTrain = dfCov.loc[np.random.choice(dfCov.index, 1800, replace=False)]
dfTrain = dfTrain[['tid', 'date', 'rawContent']]
dfTrain['topicCovid'] = True
dfTrain['fake'] = False

dfTrain.to_csv(senCSVcTrainPath, encoding='utf-8')
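# ---------------------------------------------------------------------------
# Optional sanity check (sketch, not part of the original pipeline): re-read
# the exported training file and confirm the draw size and columns. Because
# the RNG is seeded above, re-running the script on the same data reproduces
# the same sample.
#
# dfCheck = pd.read_csv(senCSVcTrainPath)
# assert len(dfCheck) == 1800
# assert {'tid', 'date', 'rawContent', 'topicCovid', 'fake'}.issubset(dfCheck.columns)
# ---------------------------------------------------------------------------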