From 244fb264377b18ec193fc16439e1072553088a7e Mon Sep 17 00:00:00 2001
From: Nathan Tarr <nmtarr@ncsu.edu>
Date: Fri, 16 Feb 2024 14:57:15 -0500
Subject: [PATCH] Parse eventDate to allow 2023-05-02T11:56

---
 wrangler_functions.py | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/wrangler_functions.py b/wrangler_functions.py
index 290dcfa..a5fcad8 100644
--- a/wrangler_functions.py
+++ b/wrangler_functions.py
@@ -831,11 +831,6 @@ def process_records(ebird_data: pd.DataFrame, gbif_data: pd.DataFrame, filter_se
     print("Prepared data frames for processing: "
           + str(datetime.now() - timestamp))
 
-    # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<  STANDARDIZE DATE FORMAT
-    # EventDate could be formatted in several ways: month/day/year, 
-    #   year-month-day, or "%Y-%m-%dT%H:%M:%S", or maybe others.
-
-
     # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<  SUMMARIZE VALUES
     timestamp = datetime.now()
     # Make a list of columns to summarize values from
@@ -897,10 +892,22 @@ def process_records(ebird_data: pd.DataFrame, gbif_data: pd.DataFrame, filter_se
     df_unfiltered["detection_distance_m"] = taxon_info["detection_distance_m"]
     df_unfiltered["filter_set_name"] = filter_set["name"]
 
+    # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<  PARSE EVENT DATE COLUMN
+    # Note: This may drop some records with unsupported eventDate formats
+    timestamp = datetime.now()
+    df_unfiltered["eventDate"] = [parse_eventDate(x) 
+                                  for x in df_unfiltered["eventDate"]]
+
+    # Drop records with pd.NA in eventDate
+    df_unfiltered = df_unfiltered.copy().dropna(subset=["eventDate"])
+
+    print("Parsed event dates: " + str(datetime.now() - timestamp))
+
     # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<  COORDINATE PRECISION
     '''In WGS84, coordinate precision is limited by longitude and varies across
     latitudes and number of digits provided.  Thus, coordinates have a nominal
     precision that may limit values.   Populate a column for this...'''
+    timestamp = datetime.now()
 
     # Trim decimal length to 5 digits (lat and long).
     #    Anything more is false precision.
@@ -931,6 +938,8 @@ def process_records(ebird_data: pd.DataFrame, gbif_data: pd.DataFrame, filter_se
     df_unfiltered.drop(["temp", "temp2", "digits_latitude", "digits_longitude",
                         "nominal_x_precision", "nominal_y_precision"], axis=1,
                        inplace=True)
+    print("Calculated nominal coordinate precision: "
+          + str(datetime.now() - timestamp))
 
     # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< BUFFER RADIUS
     '''
@@ -950,14 +959,14 @@ def process_records(ebird_data: pd.DataFrame, gbif_data: pd.DataFrame, filter_se
     georef = df_unfiltered[df_unfiltered["coordinateUncertaintyInMeters"] > 0.0].copy()
     if georef.empty == False:
         #georef.fillna({"coordinatePrecision": 0.00001}, inplace=True)
-        georef["gps_accuracy_m"] = np.where(georef["eventDate"].apply(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S").year) < 2000, 100, 30)
+        georef["gps_accuracy_m"] = np.where(georef["eventDate"].apply(lambda x: datetime.strptime(x, "%Y-%m-%d").year) < 2000, 100, 30)
         georef["radius_m"] = georef["coordinateUncertaintyInMeters"]
         print("Number of georeferenced GBIF records: " + str(len(georef)))
 
     # Records from GBIF without coordinate uncertainty
     gbif_nogeo = df_unfiltered[(df_unfiltered["coordinateUncertaintyInMeters"] == 0.0) & (df_unfiltered["collectionCode"].str.contains("EBIRD*") == False)].copy()
     if gbif_nogeo.empty == False:
-        gbif_nogeo["gps_accuracy_m"] = np.where(gbif_nogeo["eventDate"].apply(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S").year) < 2000, 100, 30)
+        gbif_nogeo["gps_accuracy_m"] = np.where(gbif_nogeo["eventDate"].apply(lambda x: datetime.strptime(x, "%Y-%m-%d").year) < 2000, 100, 30)
         if filter_set["default_coordUncertainty"] is not None:
             print("Applying default coordinate uncertainties for GBIF records")
             #gbif_nogeo.fillna({"coordinatePrecision": 0.00001}, inplace=True)
@@ -1008,6 +1017,7 @@ def process_records(ebird_data: pd.DataFrame, gbif_data: pd.DataFrame, filter_se
         print("Prepared records and calculated radii:" + str(datetime.now() - timestamp))
 
     # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<  FILTER
+    # Note: Some records may have been dropped above during date parsing.
     timestamp = datetime.now()
     # Some filters to be prepped for use
     for x in ['bases_omit', 'collection_codes_omit', 'datasets_omit',
@@ -1484,10 +1494,11 @@ def parse_eventDate(x: str) -> str:
 
     Function to reformat dates from GBIF to "YYYY-MM-DD".  Returns pd.NA if the
     date is not in one of the following recognized formats: "YYYY-MM-DD",
-    "YYYY-MM-DDTHH:MM:SS", "YYYY-MM-DDTHH:MM:SSZ", "MM/DD/YYYY".  The function
-    will also print a message to the console if it encounters a date in an
-    unsupported format.  Records assigned to datetime ranges are assessed
-    and will return pd.NA unless the range was within one day.  
+    "YYYY-MM-DDTHH:MM:SS", "YYYY-MM-DDTHH:MM:SSZ", "YYYY-MM-DDTHH:MM", 
+    "YYYY-MM-DDTHH:MMZ", "MM/DD/YYYY".  The function will also print a message 
+    to the console if it encounters a date in an unsupported format.  Dates
+    given as datetime ranges are assessed, and pd.NA is returned unless the
+    range falls within one day.
     
     See "https://techdocs.gbif.org/en/data-processing/temporal-interpretation"
 
@@ -1515,6 +1526,12 @@ def parse_eventDate(x: str) -> str:
         # If the date is in the format "YYYY-MM-DDTHH:MM:SSZ", return just the date
         elif re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z', x) and len(x) == 20:
             return x.split('T')[0]
+        # If the date is in the format "YYYY-MM-DDTHH:MM", return just the date
+        elif re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}', x) and len(x) == 16:
+            return x.split('T')[0]
+        # If the date is in the format "YYYY-MM-DDTHH:MMZ", return just the date
+        elif re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}Z', x) and len(x) == 17:
+            return x.split('T')[0]
         # If the date is in the format "MM/DD/YYYY", return it in the format 
         # "YYYY-MM-DD"
         elif re.match(r'\d{2}/\d{2}/\d{4}', x) and len(x) == 10:
-- 
GitLab