3 Commits
0.1.0 ... 0.1.2

7 changed files with 111 additions and 216 deletions

View File

@@ -4,9 +4,9 @@ Created on Thu Jun 8 01:08:21 2023
@author: Michael
# https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
Following files are necessary:
config.py
Used to configure everything that's needed for this script.
funs/TimeSlice.py
Function get_Tslices slices the timespan defined in config.py into N
slices. This is necessary because Twitter may otherwise block requests.
@@ -15,6 +15,8 @@ Following files are necessary:
Function deDupe reads each line of inFile, removes duplicate lines, and
saves the result to outFile. Generates "keywords.txt".
funs/Scrape.py
scrapes using snscrape.modules.twitter. See docstring.
data/keywords-raw.txt
Contains all keywords that are used to detect whether a tweet contains
information about Covid19.
@@ -60,21 +62,30 @@ import sys
from datetime import datetime
import concurrent.futures
## Setup directories
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
td = "data/tweets/"
td = "data/OUT/"
# Name of file that all tweets will be written to
file_alltweets = "ALL-SENATORS-TWEETS.csv"
# don't change this one
path_to_tweetdfs = wd + td
## Define Timespan
# Name of logfile
logfile = wd+"log/log_"
###################
# Define Timespan & time-format
# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
ts_beg = "2020-01-01T00:00:00Z" # start of scraping
ts_end = "2023-01-03T00:00:00Z" # end of scraping
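# For reference: funs/TimeSlice.get_Tslices parses these strings with
# datetime.strptime(ts_beg, '%Y-%m-%dT%H:%M:%SZ'), so both timestamps must match
# the format above exactly.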
@@ -86,10 +97,8 @@ fTimeFormat = "%Y-%m-%d_%H-%M-%S"
# Maximum tweets to be scraped by snscrape. Can be left untouched.
maxTweets = 5000
# Name of logfile
logfile = wd+"log/log_"
## Install snscrape from local git repo to make sure that it fits the used version.
###################
# Install snscrape from the local git repo to make sure that it fits the version used.
# If snscrape is already installed, uncomment the following lines:
"""
import subprocess
@@ -98,7 +107,9 @@ subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
os.chdir(wd)
"""
# Columns for tweet dataframe
# Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
# Nested attributes are addressed with dotted names, e.g. the user id is obtained via user.id.
tweetDFColumns = [
"id",
"user.id",
@@ -135,18 +146,22 @@ tweetDFColumns = [
"source",
]
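# Illustrative sketch only (an assumption about how funs/Scrape.py consumes this
# list, not code taken from it): a dotted column name such as "user.id" can be
# resolved from a Tweet object by chained attribute access, e.g.
#   value = tweet
#   for part in "user.id".split("."):
#       value = getattr(value, part)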
## Import other files
## Import functions
from funs.TimeSlice import *
from funs.ClearDupes import deDupe
from funs.Scrape import scrapeTweets
# create logfile & log all outputs
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
###################
# Create logfile & log all outputs
# There are three logfile types to be found in /log;
# they should be self-explanatory.
logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".log"
logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".log"
sys.stderr = open(logfileErrors, "w")
sys.stdout = open(logfilen, "w")
## Create List of time-period-slices
###################
# Create List of time-period-slices
time_slices = get_Tslices(ts_beg, ts_end, no_slices)
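# Each slice is a dict with the keys 'beg_time', 'end_time' and 'suffix'
# (e.g. '-slice1'); see the docstring of funs/TimeSlice.get_Tslices.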
# Print slices
print("Time-period-slices:")
@@ -154,19 +169,22 @@ for slice in time_slices:
print(slice["suffix"] + ": " + slice["beg_time"] + " - " + slice["end_time"])
print("---")
## Keywords
###################
# Keywords
# Read keywords from a file and write them to a list.
keywords = []
# Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
deDupe("data/keywords-raw.txt", "data/keywords.txt")
deDupe(f"{di}keywords-raw.txt", f"{di}keywords.txt")
# Read the keywords from a file
with open("data/keywords.txt", "r") as file:
with open(f"{di}keywords.txt", "r") as file:
lines = file.readlines()
for line in lines:
keyword = line.strip() # Remove the newline character
keywords.append(keyword)
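# (keywords.txt is expected to hold one keyword per line; blank lines would be
# appended as empty strings, so the file should not contain any.)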
print("---")
## Senator Accounts
###################
# Senator Accounts
# Get accounts & alt-accounts from Senators-Datafile
accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
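# (data/senators-raw.csv is expected to provide at least the columns
# "twitter_handle" and "alt_handle".)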
@@ -181,52 +199,61 @@ for i, acc in enumerate(accounts): # print 5 accounts per line
print("\n")
print(f"\n{i} accounts in total.\n---")
## Scraping
###################
# Scraping
# report time:
timeStartScrape = datetime.now()
print("Starting scraping at:")
print(timeStartScrape.strftime(fTimeFormat))
print("---")
# Iterate over each Twitter account using multiprocessing
# with concurrent.futures.ProcessPoolExecutor() as executor:
# # List to store the scraping tasks
# tasks = []
# for handle in accounts:
# # Iterate over each time slice
# for slice_data in time_slices:
# # ... Code to prepare the slice_data ...
# # Schedule the scraping task
# task = executor.submit(
# scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns
# )
# # Store the handle and slice_data as attributes of the task
# # Wait for all tasks to complete
# concurrent.futures.wait(tasks)
with concurrent.futures.ProcessPoolExecutor() as executor:
# List to store the scraping tasks
tasks = []
for handle in accounts:
# Iterate over each time slice
for slice_data in time_slices:
# ... Code to prepare the slice_data ...
# Schedule the scraping task
task = executor.submit(
scrapeTweets, handle, keywords, td, tweetDFColumns, slice_data['beg_time'], slice_data['end_time'], slice_data['suffix']
)
            tasks.append(task)  # keep the future so it can be waited on below
    # Wait for all tasks to complete
    concurrent.futures.wait(tasks)
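    # Note: the executor's context manager also blocks on exit until all
    # submitted futures have finished (shutdown(wait=True)), so the explicit
    # wait() above mainly makes the synchronisation point visible.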
# report time:
timeEndScrape = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndScrape.strftime(fTimeFormat))
## Merge CSV-Files to file_alltweets.
# fastest way is to save the slices separately and then add every file to the output instead of using pandas or anything else.
os.chdir(path_to_tweetdfs)
# First, check whether all slices are present.
tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv")
###################
# Merge CSV-Files to file_alltweets.
# The fastest way is to save the slices separately and then append every file to
# the output instead of using pandas or anything else.
os.chdir(path_to_tweetdfs)  # change dir so glob can list the csv-files in that dir
## First, check whether all slices are present.
tweetfiles = glob.glob("*.csv")  # get list of all csv files in the folder
# Create list of all files that should be in the folder:
AllFilesList = []
for handle in accounts:
for tslice in time_slices:
suffix = tslice['suffix']
AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.txt", "w") as fout:
AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
# report missing files to "log_*_missing.log"
with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.log", "w") as fout:
    missing = [file for file in AllFilesList if file not in tweetfiles]
    if missing:
        for file in missing:
            fout.write(f'Missing: {file}.\n')  # log every expected slice file that was not found
    else:
        fout.write('All slices scraped.\n')
# check if file_alltweets (previously scraped tweets that have been merged into one file) exists; if it does, remove it from the list so it is not included in the following merge
## Merge .csv files.
# Check whether file_alltweets (previously scraped tweets that have been merged
# into one file) is in the tweetfiles list; if it is, remove it so it is not
# included in the following merge.
if file_alltweets in tweetfiles:
tweetfiles.remove(file_alltweets)
# Go through all csv files and merge them into file_alltweets
@@ -240,21 +267,24 @@ if tweetfiles:
with open(file, "rb") as f:
next(f) # skip the header
fout.write(f.read())
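# Note on the merge above: next(f) advances past the first line (the csv header)
# of each slice file before its remaining bytes are appended, so file_alltweets
# should end up with a single header row (assuming the header is written once
# before the per-file loop).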
os.chdir(wd)
os.chdir(wd) # go back to wd
# Report timing info.
timeEndMerge = datetime.now()
print("---")
print("End of scraping at:")
print(timeEndMerge.strftime(fTimeFormat))
print("---")
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
# calculate times:
tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # total execution time
tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) # scraping time
tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) # merge time
print(
f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
)
print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
# close the logfiles.
sys.stdout.close()
sys.stderr.close()

View File

@@ -1,140 +0,0 @@
Coronavirus
Koronavirus
Corona
CDC
Wuhancoronavirus
Wuhanlockdown
Ncov
Wuhan
N95
Kungflu
Epidemic
outbreak
Sinophobia
China
covid-19
corona virus
covid
covid19
sars-cov-2
COVIDー19
COVD
pandemic
coronapocalypse
canceleverything
Coronials
SocialDistancingNow
Social Distancing
SocialDistancing
panicbuy
panic buy
panicbuying
panic buying
14DayQuarantine
DuringMy14DayQuarantine
panic shop
panic shopping
panicshop
InMyQuarantineSurvivalKit
panic-buy
panic-shop
coronakindness
quarantinelife
chinese virus
chinesevirus
stayhomechallenge
stay home challenge
sflockdown
DontBeASpreader
lockdown
lock down
shelteringinplace
sheltering in place
staysafestayhome
stay safe stay home
trumppandemic
trump pandemic
flattenthecurve
flatten the curve
china virus
chinavirus
quarentinelife
PPEshortage
saferathome
stayathome
stay at home
stay home
stayhome
GetMePPE
covidiot
epitwitter
pandemie
wear a mask
wearamask
kung flu
covididiot
COVID__19
omicron
variant
vaccine
travel ban
corona
coronavirus
sarscov2
sars cov2
sars cov 2
covid_19
ncov
ncov2019
2019-ncov
pandemic 2019ncov
2019ncov
quarantine
flattening the curve
flatteningthecurve
flattenthecurve
hand sanitizer
handsanitizer
social distancing
socialdistancing
work from home
workfromhome
working from home
workingfromhome
ppe
n95
covidiots
herd immunity
herdimmunity
pneumonia
wuhan virus
wuhanvirus
kungflu
vaccines
corona vaccine
corona vaccines
coronavaccine
coronavaccines
face shield
faceshield
face shields
faceshields
health worker
healthworker
health workers
healthworkers
stayhomestaysafe
coronaupdate
frontlineheroes
coronawarriors
homeschool
homeschooling
hometasking
masks4all
wfh
wash ur hands
wash your hands
washurhands
washyourhands
selfisolating
self isolating

View File

@@ -1,24 +0,0 @@
/ALL-SENATORS-LONG-LONG.csv
/ALL-SENATORS.csv
/CoryGardner-LONG.csv
/CoryGardner.csv
/DavidPerdueGA-LONG.csv
/DavidPerdueGA.csv
/DougJones-LONG.csv
/DougJones.csv
/KLoeffler-LONG.csv
/KLoeffler.csv
/MarthaMcSallyAZ-LONG.csv
/MarthaMcSallyAZ.csv
/SenAlexander-LONG.csv
/SenAlexander.csv
/SenPatRoberts-LONG.csv
/SenPatRoberts.csv
/SenatorEnzi-LONG.csv
/SenatorEnzi.csv
/SenatorIsakson-LONG.csv
/SenatorIsakson.csv
/SenatorTomUdall-LONG.csv
/SenatorTomUdall.csv
/VP-LONG.csv
/VP.csv

View File

@@ -3,13 +3,22 @@ import time
import pandas as pd
import snscrape.modules.twitter as sntwitter
def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5000):
def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix, maxTweets = 5000):
"""Scrapes tweets from a specific account in a specific time span using snscrape.modules.twitter.
Args:
handle (str): twitter handle of account to be scraped
keywords (list): list of strings containing the keywords that the tweets shall be searched for
td (str): tweet file output path
tweetDFColumns (list): Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet
        ts_beg (str): scrape from this point in time; format YYYY-MM-DDTHH:MM:SSZ, i.e. %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
        ts_end (str): scrape until this point in time; same format as ts_beg
        suffix (str): suffix added to the filename after the handle. Example: suffix "-slice1" with handle "handle" produces the file "Tweets-handle-slice1.csv"
maxTweets (int, optional): Maximum number of tweets to be scraped. Defaults to 5000.
"""
i = 0
currentTime = datetime.now()
ts_beg = slice_data['beg_time']
ts_end = slice_data['end_time']
suffix = slice_data['suffix']
tweetDataFilePath = td + f"Tweets-{handle}{suffix}.csv"
    # create empty tweet list that will be filled with the tweets of the current senator
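    # Illustrative sketch of a typical snscrape search loop (the query operators
    # "from:"/"since:"/"until:" used here are an assumption, not taken from this
    # function):
    #   query = f"from:{handle} since:{ts_beg} until:{ts_end}"
    #   for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
    #       if i >= maxTweets:
    #           break
    #       ...  # collect the values listed in tweetDFColumns for this tweet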

View File

@@ -8,6 +8,16 @@ Created on Wed Jun 21 13:58:42 2023
# create slices
def get_Tslices(ts_beg, ts_end, no_slices):
"""Splits the time-period between two points in time into #no_slices and returns start and end time of each slice period.
Args:
        ts_beg (str): start of the overall period to be sliced, format %Y-%m-%dT%H:%M:%SZ.
        ts_end (str): end of the overall period to be sliced, same format.
        no_slices (int): number of slices; e.g. 24 produces 24 start and end dates.
    Returns:
        list[dict[str, str]]: one dict per slice with the keys 'beg_time', 'end_time' and 'suffix' (e.g. '-slice1').
"""
from datetime import datetime
from datetime import timedelta
ts_beg = datetime.strptime(ts_beg, '%Y-%m-%dT%H:%M:%SZ')
@@ -25,6 +35,16 @@ def get_Tslices(ts_beg, ts_end, no_slices):
# For log time conversions (seconds to days, hours, minutes)
def convertTime(duration):
"""Converts seconds to hours, minutes and seconds.
Args:
duration (int): seconds
Returns:
int: hours
int: minutes
int: seconds
"""
days, seconds = duration.days, duration.seconds
hours = days * 24 + seconds // 3600
minutes = (seconds % 3600) // 60