3 Commits
0.1.0 ... 0.1.2

7 changed files with 111 additions and 216 deletions

View File

@@ -4,9 +4,9 @@ Created on Thu Jun 8 01:08:21 2023
 @author: Michael
+# https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
 Following files are necessary:
+config.py
+    Used to configure everything that's needed for this script.
 funs/TimeSlice.py
     Function get_Tslices slices the defined timespan in config.py into N
     slices. Is necessary due to possible blocking of requests by twitter.
@@ -15,6 +15,8 @@ Following files are necessary:
     Function deDupe reads each line of inFile and removes duplicate lines.
     A file outFile is saved without the duplicate lines. Generates
     "keywords.txt".
+funs/Scrape.py
+    scrapes using snscrape.modules.twitter. See docstring.
 data/keywords-raw.txt
     Contains all keywords that are used to detect whether a tweet contains
     information about Covid19.
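
funs/ClearDupes.py itself is not among the files shown in this diff; based purely on the description above (read inFile line by line, drop duplicate lines, write the rest to outFile and use it as "keywords.txt"), a minimal sketch of deDupe might look like this — the body is an assumption, only the name and the (inFile, outFile) call pattern come from the repo:

def deDupe(inFile, outFile):
    # assumed implementation, not the repository's actual code:
    # keep the first occurrence of every line, drop later duplicates
    seen = set()
    with open(inFile, "r") as fin, open(outFile, "w") as fout:
        for line in fin:
            if line not in seen:
                seen.add(line)
                fout.write(line)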
@@ -60,21 +62,30 @@ import sys
 from datetime import datetime
 import concurrent.futures
-## Setup directories
+###################
+# Setup directories
 # WD Michael
 wd = "/home/michael/Documents/PS/Data/collectTweets/"
 # WD Server
 # wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+# datafile input directory
+di = "data/IN/"
 # Tweet-datafile output directory
-td = "data/tweets/"
+td = "data/OUT/"
 # Name of file that all tweets will be written to
 file_alltweets = "ALL-SENATORS-TWEETS.csv"
+# don't change this one
 path_to_tweetdfs = wd + td
-## Define Timespan
+# Name of logfile
+logfile = wd+"log/log_"
+###################
+# Define Timespan & time-format
 # Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
 ts_beg = "2020-01-01T00:00:00Z" # start of scraping
 ts_end = "2023-01-03T00:00:00Z" # end of scraping
@@ -86,10 +97,8 @@ fTimeFormat = "%Y-%m-%d_%H-%M-%S"
 # Maximum tweets to be scraped by snscrape. Can be left untouched.
 maxTweets = 5000
-# Name of logfile
-logfile = wd+"log/log_"
-## Install snscrape from local git repo to make sure that it fits the used version.
+###################
+# Install snscrape from local git repo to make sure that it fits the used version.
 # If snscrape is already installed, uncomment the following lines:
 """
 import subprocess
@@ -98,7 +107,9 @@ subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
 os.chdir(wd)
 """
-# Columns for tweet dataframe
+# Columns for tweet dataframe. Parameters for snscrape.modules.twitter.Tweet:
+# https://thetechrobo.ca/snscrape-docs/_autosummary/snscrape.modules.twitter.Tweet.html
+# sub-parameters use dot notation, e.g. the user id is accessed via user.id
 tweetDFColumns = [
     "id",
     "user.id",
@@ -135,18 +146,22 @@ tweetDFColumns = [
     "source",
 ]
-## Import other files
+## Import functions
 from funs.TimeSlice import *
 from funs.ClearDupes import deDupe
+from funs.Scrape import scrapeTweets
-# create logfile & log all outputs
-logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".txt"
-logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".txt"
+###################
+# Create logfile & log all outputs
+# there are three logfile types to be found in /log.
+# should be self explanatory.
+logfilen = logfile + datetime.now().strftime(fTimeFormat) + ".log"
+logfileErrors = logfile + datetime.now().strftime(fTimeFormat) + "_err" + ".log"
 sys.stderr = open(logfileErrors, "w")
 sys.stdout = open(logfilen, "w")
-## Create List of time-period-slices
+###################
+# Create List of time-period-slices
 time_slices = get_Tslices(ts_beg, ts_end, no_slices)
 # Print slices
 print("Time-period-slices:")
@@ -154,19 +169,22 @@ for slice in time_slices:
     print(slice["suffix"] + ": " + slice["beg_time"] + " - " + slice["end_time"])
 print("---")
-## Keywords
+###################
+# Keywords
+# read keywords from a file and write to list.
 keywords = []
 # Remove duplicate Keywords and save all non-duplicates to 'data/keywords.txt'
-deDupe("data/keywords-raw.txt", "data/keywords.txt")
+deDupe(f"{di}keywords-raw.txt", f"{di}keywords.txt")
 # Read the keywords from a file
-with open("data/keywords.txt", "r") as file:
+with open(f"{di}keywords.txt", "r") as file:
     lines = file.readlines()
     for line in lines:
         keyword = line.strip() # Remove the newline character
         keywords.append(keyword)
 print("---")
-## Senator Accounts
+###################
+# Senator Accounts
 # Get accounts & alt-accounts from Senators-Datafile
 accounts = pd.read_csv("data/senators-raw.csv")["twitter_handle"].tolist()
 alt_accounts = pd.read_csv("data/senators-raw.csv")["alt_handle"].tolist()
@@ -181,52 +199,61 @@ for i, acc in enumerate(accounts): # print 5 accounts per line
     print("\n")
 print(f"\n{i} accounts in total.\n---")
-## Scraping
+###################
+# Scraping
+# report time:
 timeStartScrape = datetime.now()
 print("Starting scraping at:")
 print(timeStartScrape.strftime(fTimeFormat))
 print("---")
 # Iterate over each Twitter account using multiprocessing
-# with concurrent.futures.ProcessPoolExecutor() as executor:
-# # List to store the scraping tasks
-# tasks = []
-# for handle in accounts:
-# # Iterate over each time slice
-# for slice_data in time_slices:
-# # ... Code to prepare the slice_data ...
-# # Schedule the scraping task
-# task = executor.submit(
-# scrapeTweets, handle, slice_data, keywords, td, tweetDFColumns
-# )
-# # Store the handle and slice_data as attributes of the task
-# # Wait for all tasks to complete
-# concurrent.futures.wait(tasks)
+with concurrent.futures.ProcessPoolExecutor() as executor:
+    # List to store the scraping tasks
+    tasks = []
+    for handle in accounts:
+        # Iterate over each time slice
+        for slice_data in time_slices:
+            # ... Code to prepare the slice_data ...
+            # Schedule the scraping task
+            task = executor.submit(
+                scrapeTweets, handle, keywords, td, tweetDFColumns, slice_data['beg_time'], slice_data['end_time'], slice_data['suffix']
+            )
+            # Store the handle and slice_data as attributes of the task
+    # Wait for all tasks to complete
+    concurrent.futures.wait(tasks)
+# report time:
 timeEndScrape = datetime.now()
 print("---")
 print("End of scraping at:")
 print(timeEndScrape.strftime(fTimeFormat))
-## Merge CSV-Files to file_alltweets.
-# fastest way is to save the slices seperately and then add every file to the output instead of using pandas or anything else.
-os.chdir(path_to_tweetdfs)
-# At first check, whether all slices are present.
-tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv")
+###################
+# Merge CSV-Files to file_alltweets.
+# fastest way is to save the slices separately and then add every file to the
+# output instead of using pandas or anything else.
+os.chdir(path_to_tweetdfs) # change dir to use glob to get list of csv-files in dir
+## At first check, whether all slices are present.
+tweetfiles = glob.glob("*.csv") # get list of all csv files in folder - before: "*.{}".format("csv") ???
+# Create list of all files that should be in the folder:
 AllFilesList = []
 for handle in accounts:
     for tslice in time_slices:
         suffix = tslice['suffix']
         AllFilesList.append(f"Tweets-{handle}{suffix}.csv")
-with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.txt", "w") as fout:
+# report missing files to "log_*_missing.txt"
+with open(f"{logfile}"+timeStartScrape.strftime(fTimeFormat)+"_missing.log", "w") as fout:
     for file in AllFilesList:
         if file not in tweetfiles:
             fout.write(f'Missing: {file}.\n') # if file is not in tweetfiles, print error message.
         else:
             fout.write('all slices scraped.')
-## Merge .csv files.
-# check if file_alltweets (previously scraped tweets that have been merged into one file) exists, if it exists, remove from list to not include it in the following merge
+# check if file_alltweets (previously scraped tweets that have been merged
+# into one file) exists in tweetfiles list, if it exists, remove from list
+# to not include it in the following merge
 if file_alltweets in tweetfiles:
     tweetfiles.remove(file_alltweets)
 # Go through all csv files and merge them into file_alltweets
@@ -240,21 +267,24 @@ if tweetfiles:
         with open(file, "rb") as f:
             next(f) # skip the header
             fout.write(f.read())
-os.chdir(wd)
+os.chdir(wd) # go back to wd
+# Report timing info.
 timeEndMerge = datetime.now()
 print("---")
 print("End of scraping at:")
 print(timeEndMerge.strftime(fTimeFormat))
 print("---")
-tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape)
-tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape)
-tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape)
+# calculate times:
+tThours, tTminutes, tTseconds = convertTime(timeEndMerge - timeStartScrape) # total execution time
+tShours, tSminutes, tSseconds = convertTime(timeEndScrape - timeStartScrape) # scraping time
+tMhours, tMminutes, tMseconds = convertTime(timeEndMerge - timeEndScrape) # merge time
 print(
     f"Total execution time: {tThours} hours, {tTminutes} minutes and {tTseconds} seconds"
 )
 print(f"Scraping time: {tShours} hours, {tSminutes} minutes and {tSseconds} seconds")
 print(f"Time merging: {tMhours} hours, {tMminutes} minutes and {tMseconds} seconds")
+# close connection to logfiles.
 sys.stdout.close()
 sys.stderr.close()

View File

@@ -1,140 +0,0 @@
Coronavirus
Koronavirus
Corona
CDC
Wuhancoronavirus
Wuhanlockdown
Ncov
Wuhan
N95
Kungflu
Epidemic
outbreak
Sinophobia
China
covid-19
corona virus
covid
covid19
sars-cov-2
COVIDー19
COVD
pandemic
coronapocalypse
canceleverything
Coronials
SocialDistancingNow
Social Distancing
SocialDistancing
panicbuy
panic buy
panicbuying
panic buying
14DayQuarantine
DuringMy14DayQuarantine
panic shop
panic shopping
panicshop
InMyQuarantineSurvivalKit
panic-buy
panic-shop
coronakindness
quarantinelife
chinese virus
chinesevirus
stayhomechallenge
stay home challenge
sflockdown
DontBeASpreader
lockdown
lock down
shelteringinplace
sheltering in place
staysafestayhome
stay safe stay home
trumppandemic
trump pandemic
flattenthecurve
flatten the curve
china virus
chinavirus
quarentinelife
PPEshortage
saferathome
stayathome
stay at home
stay home
stayhome
GetMePPE
covidiot
epitwitter
pandemie
wear a mask
wearamask
kung flu
covididiot
COVID__19
omicron
variant
vaccine
travel ban
corona
coronavirus
sarscov2
sars cov2
sars cov 2
covid_19
ncov
ncov2019
2019-ncov
pandemic 2019ncov
2019ncov
quarantine
flattening the curve
flatteningthecurve
flattenthecurve
hand sanitizer
handsanitizer
social distancing
socialdistancing
work from home
workfromhome
working from home
workingfromhome
ppe
n95
covidiots
herd immunity
herdimmunity
pneumonia
wuhan virus
wuhanvirus
kungflu
vaccines
corona vaccine
corona vaccines
coronavaccine
coronavaccines
face shield
faceshield
face shields
faceshields
health worker
healthworker
health workers
healthworkers
stayhomestaysafe
coronaupdate
frontlineheroes
coronawarriors
homeschool
homeschooling
hometasking
masks4all
wfh
wash ur hands
wash your hands
washurhands
washyourhands
selfisolating
self isolating

View File

@@ -1,24 +0,0 @@
/ALL-SENATORS-LONG-LONG.csv
/ALL-SENATORS.csv
/CoryGardner-LONG.csv
/CoryGardner.csv
/DavidPerdueGA-LONG.csv
/DavidPerdueGA.csv
/DougJones-LONG.csv
/DougJones.csv
/KLoeffler-LONG.csv
/KLoeffler.csv
/MarthaMcSallyAZ-LONG.csv
/MarthaMcSallyAZ.csv
/SenAlexander-LONG.csv
/SenAlexander.csv
/SenPatRoberts-LONG.csv
/SenPatRoberts.csv
/SenatorEnzi-LONG.csv
/SenatorEnzi.csv
/SenatorIsakson-LONG.csv
/SenatorIsakson.csv
/SenatorTomUdall-LONG.csv
/SenatorTomUdall.csv
/VP-LONG.csv
/VP.csv

View File

@@ -3,13 +3,22 @@ import time
 import pandas as pd
 import snscrape.modules.twitter as sntwitter
-def scrapeTweets(handle, slice_data, keywords, td, tweetDFColumns, maxTweets = 5000):
+def scrapeTweets(handle, keywords, td, tweetDFColumns, ts_beg, ts_end, suffix, maxTweets = 5000):
+    """Scrapes tweets from a specific account in a specific time span using snscrape.modules.twitter.
+
+    Args:
+        handle (str): twitter handle of the account to be scraped
+        keywords (list): list of strings containing the keywords that the tweets shall be searched for
+        td (str): tweet file output path
+        tweetDFColumns (list): columns for the tweet dataframe; attributes of snscrape.modules.twitter.Tweet
+        ts_beg (str): scrape from ... in the format %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
+        ts_end (str): scrape until ... in the format %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
+        suffix (str): suffix added to the filename after the handle. Example: suffix "-slice1" with handle "handle" produces the file "Tweets-handle-slice1.csv"
+        maxTweets (int, optional): maximum number of tweets to be scraped. Defaults to 5000.
+    """
     i = 0
     currentTime = datetime.now()
-    ts_beg = slice_data['beg_time']
-    ts_end = slice_data['end_time']
-    suffix = slice_data['suffix']
     tweetDataFilePath = td + f"Tweets-{handle}{suffix}.csv"
     # create empty tweetlist that will be filled with tweets of current sen
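
For reference, a single direct call with the new signature would look roughly like this; the handle, time span and suffix are illustrative values, and the main script passes the same arguments via executor.submit:

scrapeTweets(
    "SenAlexander",            # handle (taken from the senators file list above)
    keywords,                  # list of keyword strings
    "data/OUT/",               # td: output directory
    tweetDFColumns,            # columns / Tweet attributes
    "2020-01-01T00:00:00Z",    # ts_beg
    "2020-02-15T00:00:00Z",    # ts_end
    "-slice1",                 # suffix
)
# expected output file: data/OUT/Tweets-SenAlexander-slice1.csv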

View File

@@ -8,6 +8,16 @@ Created on Wed Jun 21 13:58:42 2023
 # create slices
 def get_Tslices(ts_beg, ts_end, no_slices):
+    """Splits the time period between two points in time into no_slices slices and returns start and end time of each slice period.
+
+    Args:
+        ts_beg (str): start of the overall period to be sliced, format %Y-%m-%dT%H:%M:%SZ
+        ts_end (str): end of the overall period to be sliced, format %Y-%m-%dT%H:%M:%SZ
+        no_slices (int): number of slices; e.g. 24 will produce 24 start and end dates each.
+
+    Returns:
+        list[dict[str, str]]: one dict per slice containing 'beg_time', 'end_time' and 'suffix' (e.g. "-slice1")
+    """
     from datetime import datetime
     from datetime import timedelta
     ts_beg = datetime.strptime(ts_beg, '%Y-%m-%dT%H:%M:%SZ')
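
Going by the docstring and by how the main script consumes the result, a call and its return value might look like the following; the exact timestamp formatting of 'beg_time' and 'end_time' is an assumption, not shown in this diff:

time_slices = get_Tslices("2020-01-01T00:00:00Z", "2023-01-03T00:00:00Z", 24)
# time_slices[0] would then be something like:
# {'beg_time': '2020-01-01T00:00:00Z', 'end_time': '2020-02-15T18:30:00Z', 'suffix': '-slice1'}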
@@ -25,6 +35,16 @@ def get_Tslices(ts_beg, ts_end, no_slices):
 # For log time conversions (seconds to days, hours, minutes)
 def convertTime(duration):
+    """Converts a duration to hours, minutes and seconds.
+
+    Args:
+        duration (datetime.timedelta): duration to be converted
+
+    Returns:
+        int: hours
+        int: minutes
+        int: seconds
+    """
     days, seconds = duration.days, duration.seconds
     hours = days * 24 + seconds // 3600
     minutes = (seconds % 3600) // 60
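
convertTime receives a timedelta in the main script (e.g. timeEndMerge - timeStartScrape) and its result is unpacked into three values there, so a usage example looks like:

from datetime import timedelta

hours, minutes, seconds = convertTime(timedelta(days=1, seconds=3725))
# 1 day + 3725 s -> 25 hours, 2 minutes, 5 seconds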