#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Generate HTML profile reports for the collected senator tweet datasets.

Reads the full tweet CSV and the "OnlyCov" tweet CSV from the output
directory and writes one pandas-profiling report (minimal mode) for each
into ``<wd>/<ud>/profiles/``.

Created on Tue Aug 8 14:49:02 2023

@author: michael

Provenance: added as ``profiler.py`` in commit 5a63c478e910
("adds dataset profiler", Michael Beck, Tue, 8 Aug 2023 15:32:12 +0200).
"""

import os

import pandas as pd
import pandas_profiling as pp
import numpy

###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"

# Name of the raw senator dataset; it is resolved against the input
# directory `di` below (presumably an input file -- confirm against the
# scripts that use senDatasetPath).
senDataset = "senators-raw.csv"

# Name of new datafile generated
senCSVc = "SenatorsTweets-Final"
senCSVcCov = "SenatorsTweets-OnlyCov"

# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc + ".csv"
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
senSAVcPath = wd + ud + senCSV + ".sav"
senDTAcPath = wd + ud + senCSV + ".dta"
senDatasetPath = wd + di + senDataset

# Report locations are anchored to the same working/output directory as the
# input CSVs, so the script no longer depends on the directory it is
# launched from.
profileDir = wd + ud + "profiles/"
profileAllPath = profileDir + "AllTweets.html"
profileCovPath = profileDir + "CovTweets.html"

# Writing a report fails if its parent directory is missing, so create it.
os.makedirs(profileDir, exist_ok=True)

# (input CSV, output HTML report) pairs, processed in this order.
profileJobs = (
    (senCSVPath, profileAllPath),
    (senCSVcCovPath, profileCovPath),
)

for csvPath, htmlPath in profileJobs:
    # Read every column as `object`, i.e. without per-column dtype inference.
    df = pd.read_csv(csvPath, dtype=object)

    # Build the report with pandas-profiling's minimal configuration and
    # save it as an HTML file.  `df` and `profileAll` stay module-level
    # names and hold the last dataset/report once the loop has finished.
    profileAll = pp.ProfileReport(df, minimal=True)
    profileAll.to_file(htmlPath)