From 244fb264377b18ec193fc16439e1072553088a7e Mon Sep 17 00:00:00 2001 From: Nathan Tarr <nmtarr@ncsu.edu> Date: Fri, 16 Feb 2024 14:57:15 -0500 Subject: [PATCH] Parse eventDate to allow 2023-05-02T11:56 --- wrangler_functions.py | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/wrangler_functions.py b/wrangler_functions.py index 290dcfa..a5fcad8 100644 --- a/wrangler_functions.py +++ b/wrangler_functions.py @@ -831,11 +831,6 @@ def process_records(ebird_data: pd.DataFrame, gbif_data: pd.DataFrame, filter_se print("Prepared data frames for processing: " + str(datetime.now() - timestamp)) - # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< STANDARDIZE DATE FORMAT - # EventDate could be formatted in several ways: month/day/year, - # year-month-day, or "%Y-%m-%dT%H:%M:%S", or maybe others. - - # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< SUMMARIZE VALUES timestamp = datetime.now() # Make a list of columns to summarize values from @@ -897,10 +892,22 @@ def process_records(ebird_data: pd.DataFrame, gbif_data: pd.DataFrame, filter_se df_unfiltered["detection_distance_m"] = taxon_info["detection_distance_m"] df_unfiltered["filter_set_name"] = filter_set["name"] + # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< PARSE EVENT DATE COLUMN + # Note: This may drop some records with unsupported eventDate formats + timestamp = datetime.now() + df_unfiltered["eventDate"] = [parse_eventDate(x) + for x in df_unfiltered["eventDate"]] + + # Drop records with pd.NA in eventDate + df_unfiltered = df_unfiltered.copy().dropna(subset=["eventDate"]) + + print("Parsed event dates: " + str(datetime.now() - timestamp)) + # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< COORDINATE PRECISION '''In WGS84, coordinate precision is limited by longitude and varies across latitudes and number of digits provided. Thus, coordinates have a nominal precision that may limit values. Populate a column for this...''' + timestamp = datetime.now() # Trim decimal length to 5 digits (lat and long). # Anything more is false precision. @@ -931,6 +938,8 @@ def process_records(ebird_data: pd.DataFrame, gbif_data: pd.DataFrame, filter_se df_unfiltered.drop(["temp", "temp2", "digits_latitude", "digits_longitude", "nominal_x_precision", "nominal_y_precision"], axis=1, inplace=True) + print("Calculated nominal coordinate precision: " + + str(datetime.now() - timestamp)) # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< BUFFER RADIUS ''' @@ -950,14 +959,14 @@ def process_records(ebird_data: pd.DataFrame, gbif_data: pd.DataFrame, filter_se georef = df_unfiltered[df_unfiltered["coordinateUncertaintyInMeters"] > 0.0].copy() if georef.empty == False: #georef.fillna({"coordinatePrecision": 0.00001}, inplace=True) - georef["gps_accuracy_m"] = np.where(georef["eventDate"].apply(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S").year) < 2000, 100, 30) + georef["gps_accuracy_m"] = np.where(georef["eventDate"].apply(lambda x: datetime.strptime(x, "%Y-%m-%d").year) < 2000, 100, 30) georef["radius_m"] = georef["coordinateUncertaintyInMeters"] print("Number of georeferenced GBIF records: " + str(len(georef))) # Records from GBIF without coordinate uncertainty gbif_nogeo = df_unfiltered[(df_unfiltered["coordinateUncertaintyInMeters"] == 0.0) & (df_unfiltered["collectionCode"].str.contains("EBIRD*") == False)].copy() if gbif_nogeo.empty == False: - gbif_nogeo["gps_accuracy_m"] = np.where(gbif_nogeo["eventDate"].apply(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S").year) < 2000, 100, 30) + gbif_nogeo["gps_accuracy_m"] = np.where(gbif_nogeo["eventDate"].apply(lambda x: datetime.strptime(x, "%Y-%m-%d").year) < 2000, 100, 30) if filter_set["default_coordUncertainty"] is not None: print("Applying default coordinate uncertainties for GBIF records") #gbif_nogeo.fillna({"coordinatePrecision": 0.00001}, inplace=True) @@ -1008,6 +1017,7 @@ def process_records(ebird_data: pd.DataFrame, gbif_data: pd.DataFrame, filter_se print("Prepared records and calculated radii:" + str(datetime.now() - timestamp)) # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< FILTER + # Note: Some records may have been dropped above during date parsing. timestamp = datetime.now() # Some filters to be prepped for use for x in ['bases_omit', 'collection_codes_omit', 'datasets_omit', @@ -1484,10 +1494,11 @@ def parse_eventDate(x: str) -> str: Function to reformat dates from GBIF to "YYYY-MM-DD". Returns pd.NA if the date is not in one of the following recognized formats: "YYYY-MM-DD", - "YYYY-MM-DDTHH:MM:SS", "YYYY-MM-DDTHH:MM:SSZ", "MM/DD/YYYY". The function - will also print a message to the console if it encounters a date in an - unsupported format. Records assigned to datetime ranges are assessed - and will return pd.NA unless the range was within one day. + "YYYY-MM-DDTHH:MM:SS", "YYYY-MM-DDTHH:MM:SSZ", "YYYY-MM-DDTHH:MM", + "YYYY-MM-DDTHH:MMZ", "MM/DD/YYYY". The function will also print a message + to the console if it encounters a date in an unsupported format. Records + assigned to datetime ranges are assessed and will return pd.NA unless the + range was within one day. See "https://techdocs.gbif.org/en/data-processing/temporal-interpretation" @@ -1515,6 +1526,12 @@ def parse_eventDate(x: str) -> str: # If the date is in the format "YYYY-MM-DDTHH:MM:SSZ", return just the date elif re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z', x) and len(x) == 20: return x.split('T')[0] + # If the date is in the format "YYYY-MM-DDTHH:MM", return just the date + elif re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}', x) and len(x) == 16: + return x.split('T')[0] + # If the date is in the format "YYYY-MM-DDTHH:MMZ", return just the date + elif re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}Z', x) and len(x) == 17: + return x.split('T')[0] # If the date is in the format "MM/DD/YYYY", return it in the format # "YYYY-MM-DD" elif re.match(r'\d{2}/\d{2}/\d{4}', x) and len(x) == 10: -- GitLab