From cfd231ce361bfdd7f78d30c6745dca0f03c6dc34 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Tue, 3 Dec 2024 13:49:18 -0800 Subject: [PATCH 01/30] poc --- geomagio/ImagCDFFactory.py | 753 +++++++++++++++++++++++++++++++++++++ poetry.lock | 19 + pyproject.toml | 1 + 3 files changed, 773 insertions(+) create mode 100644 geomagio/ImagCDFFactory.py diff --git a/geomagio/ImagCDFFactory.py b/geomagio/ImagCDFFactory.py new file mode 100644 index 00000000..5d104acd --- /dev/null +++ b/geomagio/ImagCDFFactory.py @@ -0,0 +1,753 @@ +"""ImagCDFFactory Implementation Using cdflib + +This module provides the ImagCDFFactory class for creating and writing +geomagnetic time series data in the ImagCDF format using the cdflib library. +The ImagCDF format is based on NASA's Common Data Format (CDF) and is designed +to store geomagnetic data with high precision. + +References: +- ImagCDF Format Documentation: https://intermagnet.org/docs/technical/im_tn_8_ImagCDF.pdf +- CDF Library: http://cdf.gsfc.nasa.gov/ +""" + +from __future__ import absolute_import, print_function +from io import BytesIO +import os +import sys +from typing import List, Optional, Union + +import numpy as np +from obspy import Stream, Trace, UTCDateTime + +from geomagio.TimeseriesFactory import TimeseriesFactory + +from .geomag_types import DataInterval, DataType +from .TimeseriesFactoryException import TimeseriesFactoryException +from . import TimeseriesUtility +from . import Util + +import cdflib +import tempfile + + +class IMCDFPublicationLevel: + """Handles publication levels and mapping between data types and levels. + + The ImagCDF format uses publication levels to describe the processing + level of the data. This class maps data types (e.g., 'variation', 'definitive') + to their corresponding publication levels as defined in the ImagCDF documentation. + + Publication Levels: + 1: Raw data with no processing. + 2: Edited data with preliminary baselines. 
+ 3: Data suitable for initial bulletins or quasi-definitive publication. + 4: Definitive data with no further changes expected. + + Reference: + - ImagCDF Documentation Section 4.2: Attributes that Uniquely Identify the Data + """ + + class PublicationLevel: + LEVEL_1 = "1" + LEVEL_2 = "2" + LEVEL_3 = "3" + LEVEL_4 = "4" + + TYPE_TO_LEVEL = { + "none": PublicationLevel.LEVEL_1, + "variation": PublicationLevel.LEVEL_1, + "reported": PublicationLevel.LEVEL_1, + "provisional": PublicationLevel.LEVEL_2, + "adjusted": PublicationLevel.LEVEL_2, + "quasi-definitive": PublicationLevel.LEVEL_3, + "quasidefinitive": PublicationLevel.LEVEL_3, + "definitive": PublicationLevel.LEVEL_4, + } + + def __init__(self, data_type: Optional[str] = None): + """Initialize with a data type to determine the publication level.""" + if data_type: + self.level = self.TYPE_TO_LEVEL.get(data_type.lower()) + else: + raise ValueError("data_type must be provided.") + + if not self.level: + raise ValueError(f"Unsupported data_type: {data_type}") + + def to_string(self) -> str: + """Return the publication level as a string.""" + return self.level + + +class ImagCDFFactory(TimeseriesFactory): + """Factory for creating and writing ImagCDF formatted CDF files. + + This class extends the TimeseriesFactory to support writing geomagnetic + time series data to files in the ImagCDF format using the cdflib library. + """ + + def __init__( + self, + observatory: Optional[str] = None, + channels: List[str] = ("H", "D", "Z", "F"), + type: DataType = "variation", + interval: DataInterval = "minute", + urlTemplate="file://{obs}_{dt}_{t}.cdf", + urlInterval: int = -1, + ): + """ + Initialize the ImagCDFFactory with default parameters. + + Parameters: + - observatory: IAGA code of the observatory (e.g., 'BOU'). + - channels: List of geomagnetic elements (e.g., ['H', 'D', 'Z', 'F']). + - type: Data type indicating the processing level (e.g., 'variation', 'definitive'). 
+ - interval: Data interval (e.g., 'minute', 'second'). + - urlTemplate: Template for generating file URLs or paths. + - urlInterval: Interval size for splitting data into multiple files. + """ + super().__init__( + observatory=observatory, + channels=channels, + type=type, + interval=interval, + urlTemplate=urlTemplate, + urlInterval=urlInterval, + ) + + def parse_string(self, data: str, **kwargs): + """Parse ImagCDF formatted string data into a Stream. + + Note: Parsing from strings is not implemented in this factory. + """ + raise NotImplementedError('"parse_string" not implemented') + + def write_file(self, fh, timeseries: Stream, channels: List[str]): + """Write the timeseries data to a file handle in ImagCDF format. + + Parameters: + - fh: File handle to write the data. + - timeseries: ObsPy Stream containing the geomagnetic data. + - channels: List of channels to include in the output file. + """ + # Create a temporary file to write the CDF data + with tempfile.NamedTemporaryFile(delete=False) as tmp_file: + tmp_file_path = tmp_file.name + ".cdf" + + try: + # Initialize the CDF writer + cdf_writer = cdflib.cdfwrite.CDF(tmp_file_path, cdf_spec=None) + + # Write global attributes (metadata that applies to the entire file) + global_attrs = self._create_global_attributes(timeseries, channels) + cdf_writer.write_globalattrs(global_attrs) + + # Write time variables for each channel + time_vars = self._create_time_stamp_variables(timeseries) + for ts_name, ts_data in time_vars.items(): + var_spec = { + "Variable": ts_name, + "Data_Type": 33, # CDF TT2000 data type + "Num_Elements": 1, + "Rec_Vary": True, + "Dim_Sizes": [], + "Var_Type": "zVariable", + } + print(f"Writing time variable {ts_name} with data length: {len(ts_data)}") + cdf_writer.write_var( + var_spec=var_spec, + var_attrs=self._create_time_var_attrs(ts_name), + var_data=ts_data, + ) + + # Write geomagnetic data variables + for trace in timeseries: + channel = trace.stats.channel + var_name = 
f"GeomagneticField{channel}" + var_spec = { + "Variable": var_name, + "Data_Type": self._get_cdf_data_type(trace), + "Num_Elements": 1, + "Rec_Vary": True, + "Dim_Sizes": [], + "Var_Type": "zVariable", + } + print(f"Writing data variable {var_name} with data shape: {trace.data.shape}") + cdf_writer.write_var( + var_spec=var_spec, + var_attrs=self._create_var_attrs(trace), + var_data=trace.data, + ) + + # Copy the temporary CDF file to the final file handle + with open(tmp_file_path, "rb") as tmp: + cdf_data = tmp.read() + fh.write(cdf_data) + + cdf_writer.close() + + finally: + # Cleanup the temporary file + print(tmp_file_path) + + def put_timeseries( + self, + timeseries: Stream, + starttime: Optional[UTCDateTime] = None, + endtime: Optional[UTCDateTime] = None, + channels: Optional[List[str]] = None, + type: Optional[DataType] = None, + interval: Optional[DataInterval] = None, + ): + """ + Store timeseries data in ImagCDF format using cdflib. + + This method writes the timeseries data to one or more files, depending + on the specified urlInterval. + + Parameters: + - timeseries: ObsPy Stream containing the geomagnetic data. + - starttime: Start time of the data to be written. + - endtime: End time of the data to be written. + - channels: List of channels to include in the output file. + - type: Data type indicating the processing level. + - interval: Data interval of the data. 
+ """ + if len(timeseries) == 0: + # No data to store + return + + channels = channels or self.channels + type = type or self.type + interval = interval or self.interval + + # Extract metadata from the first trace + stats = timeseries[0].stats + delta = stats.delta # Sample rate + observatory = stats.station + starttime = starttime or stats.starttime + endtime = endtime or stats.endtime + + # Split data into intervals if necessary + urlIntervals = Util.get_intervals( + starttime=starttime, endtime=endtime, size=self.urlInterval + ) + for urlInterval in urlIntervals: + interval_start = urlInterval["start"] + interval_end = urlInterval["end"] + if interval_start != interval_end: + interval_end = interval_end - delta + url = self._get_url( + observatory=observatory, + date=interval_start, + type=type, + interval=interval, + channels=channels, + ) + + # Handle 'stdout' output + if url == 'stdout': + # Write directly to stdout + fh = sys.stdout.buffer + url_data = timeseries.slice( + starttime=interval_start, + endtime=interval_end, + ) + self.write_file(fh, url_data, channels) + continue # Proceed to next interval if any + + # Handle 'file://' output + elif url.startswith('file://'): + # Get the file path from the URL + url_file = Util.get_file_from_url(url, createParentDirectory=False) + url_data = timeseries.slice( + starttime=interval_start, + endtime=interval_end, + ) + + # Check if the file already exists to merge data + if os.path.isfile(url_file): + try: + # Read existing data to merge with new data + existing_cdf = cdflib.cdfread.CDF(url_file) + existing_stream = self._read_cdf(existing_cdf) + existing_cdf.close() + existing_data = existing_stream + + # Merge existing data with new data + for trace in existing_data: + new_trace = url_data.select( + network=trace.stats.network, + station=trace.stats.station, + channel=trace.stats.channel, + ) + if new_trace: + trace.data = np.concatenate((trace.data, new_trace[0].data)) + url_data = existing_data + url_data + 
except Exception as e: + # Log the exception if needed + print(f"Warning: Could not read existing CDF file '{url_file}': {e}", file=sys.stderr) + # Proceed with new data + + # Pad the data with NaNs to ensure it fits the interval + url_data.trim( + starttime=interval_start, + endtime=interval_end, + nearest_sample=False, + pad=True, + fill_value=np.nan, + ) + + # Write the data to the CDF file + with open(url_file, "wb") as fh: + self.write_file(fh, url_data, channels) + + else: + # Unsupported URL scheme encountered + raise TimeseriesFactoryException("Unsupported URL scheme in urlTemplate") + + def _create_global_attributes(self, timeseries: Stream, channels: List[str]) -> dict: + """ + Create a dictionary of global attributes for the ImagCDF file. + + These attributes apply to all the data in the file and include metadata + such as observatory information, data publication level, and format + descriptions. + + References: + - ImagCDF Documentation Section 4: ImagCDF Global Attributes + """ + stats = timeseries[0].stats if len(timeseries) > 0 else None + + # Extract metadata from stats or fallback to defaults + observatory_name = getattr(stats, 'station_name', None) or self.observatory or "Unknown Observatory" + station = getattr(stats, 'station', None) or "Unknown Iaga Code" + institution = getattr(stats, 'agency_name', None) or "Unknown Institution" + latitude = getattr(stats, 'geodetic_latitude', None) or 0.0 + longitude = getattr(stats, 'geodetic_longitude', None) or 0.0 + elevation = getattr(stats, 'elevation', None) or 99_999.0 + vector_orientation = getattr(stats, 'sensor_orientation', None) or "" + data_interval_type = getattr(stats, 'data_interval_type', None) or self.interval + publication_level = IMCDFPublicationLevel(data_type=self.type).to_string() + global_attrs = { + 'FormatDescription': {0: 'INTERMAGNET CDF Format'}, + 'FormatVersion': {0: '1.2'}, + 'Title': {0: 'Geomagnetic time series data'}, + 'IagaCode': {0: station}, + 'ElementsRecorded': {0: 
''.join(channels)}, + 'PublicationLevel': {0: publication_level}, + 'PublicationDate': {0: UTCDateTime.now().strftime("%Y-%m-%dT%H:%M:%SZ")}, + 'ObservatoryName': {0: observatory_name}, + 'Latitude': {0: latitude}, + 'Longitude': {0: longitude}, + 'Elevation': {0: elevation}, + 'Institution': {0: institution}, + 'VectorSensOrient': {0: vector_orientation}, #remove F - because its a calculation, not an element? + 'StandardLevel': {0: 'None'}, # Set to 'None' + # Temporarily Omit 'StandardName', 'StandardVersion', 'PartialStandDesc' + 'Source': {0: 'institute'}, # "institute" - if the named institution provided the data, “INTERMAGNET†- if the data file has been created by INTERMAGNET from another data source, “WDC†- if the World Data Centre has created the file from another data source + # 'TermsOfUse': {0: self.getINTERMAGNETTermsOfUse()}, + # 'UniqueIdentifier': {0: ''}, + # 'ParentIdentifiers': {0: ''}, + # 'ReferenceLinks': {0: ''}, #links to /ws, plots, USGS.gov + } + return global_attrs + + def _create_time_stamp_variables(self, timeseries: Stream) -> dict: + vector_times = None + scalar_times = None + + for trace in timeseries: + channel = trace.stats.channel + times = [ + (trace.stats.starttime + trace.stats.delta * i).datetime + for i in range(trace.stats.npts) + ] + # Convert timestamps to TT2000 format required by CDF + tt2000_times = cdflib.cdfepoch.timestamp_to_tt2000([time.timestamp() for time in times]) + # tt2000_times = cdflib.cdfepoch.compute_tt2000(times) #this does not work + + if channel in self._get_vector_elements(): + if vector_times is None: + vector_times = tt2000_times + else: + if not np.array_equal(vector_times, tt2000_times): + raise ValueError("Time stamps for vector channels are not the same.") + elif channel in self._get_scalar_elements(): + if scalar_times is None: + scalar_times = tt2000_times + else: + if not np.array_equal(scalar_times, tt2000_times): + raise ValueError("Time stamps for scalar channels are not the same.") + else: 
+ # Handle other channels if necessary + pass + + time_vars = {} + if vector_times is not None: + time_vars['GeomagneticVectorTimes'] = vector_times + if scalar_times is not None: + time_vars['GeomagneticScalarTimes'] = scalar_times + + return time_vars + + + def _create_var_spec( + self, + var_name: str, + data_type: str, + num_elements: int, + var_type: str, + dim_sizes: List[int], + sparse: bool, + compress: int, + pad: Optional[Union[str, np.ndarray]], + ) -> dict: + """ + Create a variable specification dictionary for cdflib. + + This is used to define the properties of a variable when writing it to + the CDF file. + + Parameters: + - var_name: Name of the variable. + - data_type: CDF data type. + - num_elements: Number of elements per record. + - var_type: Variable type ('zVariable' or 'rVariable'). + - dim_sizes: Dimensions of the variable (empty list for 0D). + - sparse: Whether the variable uses sparse records. + - compress: Compression level. + - pad: Pad value for sparse records. + + Reference: + - CDF User's Guide: Variable Specification + """ + var_spec = { + 'Variable': var_name, + 'Data_Type': data_type, + 'Num_Elements': num_elements, + 'Rec_Vary': True, + 'Var_Type': var_type, + 'Dim_Sizes': dim_sizes, + 'Sparse': 'no_sparse' if not sparse else 'pad_sparse', + 'Compress': compress, + 'Pad': pad, + } + return var_spec + + def _create_var_attrs(self, trace: Trace) -> dict: + print(trace.stats) + channel = trace.stats.channel + fieldnam = f"Geomagnetic Field Element {channel}" # “Geomagnetic Field Element †+ the element code or “Temperature †+ the name of the location where the temperature was recorded. 
+ units = '' # Must be one of “nTâ€, “Degrees of arc†or “Celsius†+ if channel == 'D': + units = 'Degrees of arc' + validmin = -360.0 + validmax = 360.0 # A full circle representation + elif channel == 'I': + units = 'Degrees of arc' + validmin = -90.0 + validmax = 90.0 #The magnetic field vector can point straight down (+90°), horizontal (0°), or straight up (-90°). + elif 'Temperature' in channel: + units = 'Celsius' + fieldnam = f"Temperature {trace.stats.location}" + elif channel == 'F': + units = 'nT' + validmin = 0.0 # negative magnetic field intestity not physically meaningful. + validmax = 79_999.0 + elif channel in ['X', 'Y', 'Z', 'H', 'E', 'V', 'G']: + units = 'nT' + validmin = -79_999.0 + validmax = 79_999.0 + + if channel in self._get_vector_elements(): + depend_0 = 'GeomagneticVectorTimes' + elif channel in self._get_scalar_elements(): + depend_0 = 'GeomagneticScalarTimes' + else: + depend_0 = None # Handle other cases if necessary + + + var_attrs = { + 'FIELDNAM': fieldnam, + 'UNITS': units, + 'FILLVAL': 99999.0, + 'VALIDMIN': validmin, + 'VALIDMAX': validmax, + 'DEPEND_0': depend_0, + 'DISPLAY_TYPE': 'time_series', + 'LABLAXIS': channel, + } + return var_attrs + + + def _create_time_var_attrs(self, ts_name: str) -> dict: + """ + Create a dictionary of time variable attributes. + + These attributes provide metadata for time variables. + Note: None of these attributes are required for the time stamp variables GeomagneticVectorTimes and GeomagneticScalarTimes. + Reference: + - ImagCDF Documentation Section 3: ImagCDF Data + """ + # var_attrs = { + # 'UNITS': 'TT2000', + # 'DISPLAY_TYPE': 'time_series', + # 'LABLAXIS': 'Time', + # } + # return var_attrs + return {} + + def _get_cdf_data_type(self, trace: Trace) -> int: + """ + Map ObsPy trace data type to CDF data type. + + Determines the appropriate CDF data type based on the NumPy data type + of the trace data. + + Returns: + - CDF_DOUBLE (45) for floating-point data. + - CDF_INT4 (41) for integer data. 
+ + Reference: + - CDF Data Types: http://cdf.gsfc.nasa.gov/html/cdfdatatypes.html + """ + # CDF data type constants + CDF_DOUBLE = 45 # CDF_DOUBLE corresponds to 64-bit float + CDF_INT4 = 41 # CDF_INT4 corresponds to 32-bit int + + if trace.data.dtype in [np.float32, np.float64]: + return CDF_DOUBLE + elif trace.data.dtype in [np.int32, np.int64]: + return CDF_INT4 + else: + # Default to double precision float + return CDF_DOUBLE + + def _read_cdf(self, cdf: cdflib.cdfread.CDF) -> Stream: + """ + Read CDF data into an ObsPy Stream. + + This method reads the data variables and their corresponding time + variables from a CDF file and constructs an ObsPy Stream. + + Parameters: + - cdf: cdflib CDF object representing the open CDF file. + + Returns: + - An ObsPy Stream containing the data from the CDF file. + """ + stream = Stream() + # Read time variables + time_vars = {} + for var in cdf.cdf_info()['zVariables']: + if var.endswith('Time'): + time_data = cdf.varget(var) + # Convert TT2000 to UTCDateTime + utc_times = [UTCDateTime(t) for t in cdflib.cdfepoch.to_datetime(time_data)] + time_vars[var] = utc_times + + # Read data variables + for var in cdf.cdf_info()['zVariables']: + if not var.endswith('Time'): + data = cdf.varget(var) + attrs = cdf.varattsget(var) + if 'DEPEND_0' in attrs: + ts_name = attrs['DEPEND_0'] + if ts_name in time_vars: + times = time_vars[ts_name] + if len(times) > 1: + delta = times[1] - times[0] # Calculate sample interval + else: + delta = 60 if self.interval == 'minute' else 1 + trace = Trace( + data=data, + header={ + 'station': self.observatory, + 'channel': var, + 'starttime': times[0], + 'delta': delta, + } + ) + stream += trace + return stream + + @staticmethod + def getINTERMAGNETTermsOfUse() -> str: + """ + Return the INTERMAGNET Terms of Use. + + These terms should be included in the 'TermsOfUse' global attribute + as per the ImagCDF specification. 
+ + Reference: + - ImagCDF Documentation Section 4.5: Attributes that Relate to Publication of the Data + """ + return ( + "CONDITIONS OF USE FOR DATA PROVIDED THROUGH INTERMAGNET:\n" + "The data made available through INTERMAGNET are provided for\n" + "your use and are not for commercial use or sale or distribution\n" + "to third parties without the written permission of the institute\n" + "(http://www.intermagnet.org/Institutes_e.html) operating\n" + "the observatory. Publications making use of the data\n" + "should include an acknowledgment statement of the form given below.\n" + "A citation reference should be sent to the INTERMAGNET Secretary\n" + "(secretary@intermagnet.org) for inclusion in a publications list\n" + "on the INTERMAGNET website.\n" + "\n" + " ACKNOWLEDGEMENT OF DATA FROM OBSERVATORIES\n" + " PARTICIPATING IN INTERMAGNET\n" + "We offer two acknowledgement templates. The first is for cases\n" + "where data from many observatories have been used and it is not\n" + "practical to list them all, or each of their operating institutes.\n" + "The second is for cases where research results have been produced\n" + "using a smaller set of observatories.\n" + "\n" + " Suggested Acknowledgement Text (template 1)\n" + "The results presented in this paper rely on data collected\n" + "at magnetic observatories. We thank the national institutes that\n" + "support them and INTERMAGNET for promoting high standards of\n" + "magnetic observatory practice (www.intermagnet.org).\n" + "\n" + " Suggested Acknowledgement Text (template 2)\n" + "The results presented in this paper rely on the data\n" + "collected at <observatory name>. 
We thank <institute name>,\n" + "for supporting its operation and INTERMAGNET for promoting high\n" + "standards of magnetic observatory practice (www.intermagnet.org).\n" + ) + + def _get_url( + self, + observatory: str, + date: UTCDateTime, + type: DataType = "variation", + interval: DataInterval = "minute", + channels: Optional[List[str]] = None, + ) -> str: + """ + Generate the file URL specific to ImagCDF conventions. + + This method constructs the filename based on the ImagCDF naming + conventions, which include the observatory code, date-time formatted + according to the data interval, and the publication level. + + Parameters: + - observatory: IAGA code of the observatory. + - date: Start date for the file. + - type: Data type indicating the processing level. + - interval: Data interval (e.g., 'minute', 'second'). + - channels: List of channels (optional). + + Returns: + - The formatted file URL or path. + + Reference: + - ImagCDF Documentation Section 5: ImagCDF File Names + """ + # Get the publication level for the type + publication_level = IMCDFPublicationLevel(data_type=type).to_string() + + # Determine filename date format based on interval + if interval == "year": + date_format = date.strftime("%Y") + elif interval == "month": + date_format = date.strftime("%Y%m") + elif interval == "day": + date_format = date.strftime("%Y%m%d") + elif interval == "hour": + date_format = date.strftime("%Y%m%d_%H") + elif interval == "minute": + date_format = date.strftime("%Y%m%d_%H%M") + elif interval == "second": + date_format = date.strftime("%Y%m%d_%H%M%S") + else: + raise ValueError(f"Unsupported interval: {interval}") + + # Default filename following ImagCDF convention + # Filename format: [iaga-code]_[date-time]_[publication-level].cdf + filename = f"{observatory.lower()}_{date_format}_{publication_level}.cdf" + + # If the urlTemplate explicitly specifies 'stdout', return 'stdout' + if self.urlTemplate.lower() == "stdout": + return "stdout" + + # Prepare 
parameters for templating + params = { + "date": date.datetime, + "i": self._get_interval_abbreviation(interval), + "interval": self._get_interval_name(interval), + "minute": date.hour * 60 + date.minute, + "month": date.strftime("%b").lower(), + "MONTH": date.strftime("%b").upper(), + "obs": observatory.lower(), + "OBS": observatory.upper(), + "t": publication_level, + "type": self._get_type_name(type), + "julian": date.strftime("%j"), + "year": date.strftime("%Y"), + "ymd": date.strftime("%Y%m%d"), + "dt": date_format, # Add the date-time formatted string + } + + # Attempt to use the template provided in urlTemplate + if "{" in self.urlTemplate and "}" in self.urlTemplate: + try: + return self.urlTemplate.format(**params) + except KeyError as e: + raise TimeseriesFactoryException(f"Invalid placeholder in urlTemplate: {e}") + + # If the urlTemplate doesn't support placeholders, assume 'file://' scheme + if self.urlTemplate.startswith("file://"): + base_path = self.urlTemplate[7:] # Strip "file://" + if not base_path or base_path == "{obs}_{dt}_{t}.cdf": + base_path = os.getcwd() # Default to current working directory + return os.path.join(base_path, filename) + + # Unsupported URL scheme + raise TimeseriesFactoryException( + f"Unsupported URL scheme in urlTemplate: {self.urlTemplate}" + ) + + # Placeholder methods for interval and type names/abbreviations + def _get_interval_abbreviation(self, interval: DataInterval) -> str: + """Get the abbreviation for the data interval.""" + abbreviations = { + "year": "yr", + "month": "mon", + "day": "day", + "hour": "hr", + "minute": "min", + "second": "sec", + } + return abbreviations.get(interval, "min") + + def _get_interval_name(self, interval: DataInterval) -> str: + """Get the full name for the data interval.""" + names = { + "year": "yearly", + "month": "monthly", + "day": "daily", + "hour": "hourly", + "minute": "minute", + "second": "second", + } + return names.get(interval, "minute") + + def _get_type_name(self, 
type: DataType) -> str: + """Get the full name for the data type.""" + type_names = { + "variation": "variation", + "definitive": "definitive", + "quasi-definitive": "quasi-definitive", + "provisional": "provisional", + "adjusted": "adjusted", + "none": "none", + } + return type_names.get(type, "variation") + + + def _get_vector_elements(self): + return {'X', 'Y', 'Z', 'H', 'D', 'E', 'V', 'I', 'F'} + + def _get_scalar_elements(self): + return {'S', 'G'} \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 3d7add0e..90b76110 100644 --- a/poetry.lock +++ b/poetry.lock @@ -153,6 +153,25 @@ d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] +[[package]] +name = "cdflib" +version = "1.3.2" +description = "A python CDF reader toolkit" +optional = false +python-versions = ">=3.8" +files = [ + {file = "cdflib-1.3.2-py3-none-any.whl", hash = "sha256:49af97acc328c586ac5b7c27fd8e67bccf24af82c5bd8c37d8cfe048a0c1752a"}, + {file = "cdflib-1.3.2.tar.gz", hash = "sha256:97f27ac629e4c0ac1367eb8f4edd7a1d184190272ab98a6401e999f3a2e05687"}, +] + +[package.dependencies] +numpy = ">=1.21" + +[package.extras] +dev = ["ipython", "pre-commit"] +docs = ["astropy", "netcdf4", "sphinx", "sphinx-automodapi", "sphinx-copybutton", "sphinx-rtd-theme", "xarray"] +tests = ["astropy", "h5netcdf", "hypothesis", "netcdf4", "pytest (>=3.9)", "pytest-cov", "pytest-remotedata", "xarray"] + [[package]] name = "certifi" version = "2024.12.14" diff --git a/pyproject.toml b/pyproject.toml index c163d9b0..51a5ee31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ SQLAlchemy = "1.4.41" SQLAlchemy-Utc = "^0.14.0" uvicorn = {extras = ["standard"], version = "^0.22.0"} netcdf4 = "^1.7.2" +cdflib = "^1.3.2" [tool.poetry.dev-dependencies] -- GitLab From b94e4eeb111d2394126896622fc570990ec4e17f Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Wed, 4 Dec 
2024 15:50:52 -0800 Subject: [PATCH 02/30] poc with working cdf write --- geomagio/ImagCDFFactory.py | 96 ++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 45 deletions(-) diff --git a/geomagio/ImagCDFFactory.py b/geomagio/ImagCDFFactory.py index 5d104acd..3189d15a 100644 --- a/geomagio/ImagCDFFactory.py +++ b/geomagio/ImagCDFFactory.py @@ -15,7 +15,7 @@ from io import BytesIO import os import sys from typing import List, Optional, Union - +from datetime import datetime, timezone import numpy as np from obspy import Stream, Trace, UTCDateTime @@ -121,74 +121,80 @@ class ImagCDFFactory(TimeseriesFactory): Note: Parsing from strings is not implemented in this factory. """ raise NotImplementedError('"parse_string" not implemented') - def write_file(self, fh, timeseries: Stream, channels: List[str]): - """Write the timeseries data to a file handle in ImagCDF format. - - Parameters: - - fh: File handle to write the data. - - timeseries: ObsPy Stream containing the geomagnetic data. - - channels: List of channels to include in the output file. 
- """ # Create a temporary file to write the CDF data with tempfile.NamedTemporaryFile(delete=False) as tmp_file: tmp_file_path = tmp_file.name + ".cdf" try: # Initialize the CDF writer - cdf_writer = cdflib.cdfwrite.CDF(tmp_file_path, cdf_spec=None) + cdf_writer = cdflib.cdfwrite.CDF(tmp_file_path) - # Write global attributes (metadata that applies to the entire file) + # Write global attributes global_attrs = self._create_global_attributes(timeseries, channels) cdf_writer.write_globalattrs(global_attrs) - # Write time variables for each channel + # Time variables time_vars = self._create_time_stamp_variables(timeseries) for ts_name, ts_data in time_vars.items(): + # Define time variable specification var_spec = { - "Variable": ts_name, - "Data_Type": 33, # CDF TT2000 data type - "Num_Elements": 1, - "Rec_Vary": True, - "Dim_Sizes": [], - "Var_Type": "zVariable", + 'Variable': ts_name, + 'Data_Type': 33, # CDF_TIME_TT2000 + 'Num_Elements': 1, + 'Rec_Vary': True, + 'Var_Type': 'zVariable', + 'Dim_Sizes': [], + 'Sparse': 'no_sparse', + 'Compress': 6, + 'Pad': None, } - print(f"Writing time variable {ts_name} with data length: {len(ts_data)}") - cdf_writer.write_var( - var_spec=var_spec, - var_attrs=self._create_time_var_attrs(ts_name), - var_data=ts_data, - ) + # Define time variable attributes + var_attrs = self._create_time_var_attrs(ts_name) + + # Write time variable + cdf_writer.write_var(var_spec, var_attrs, ts_data) - # Write geomagnetic data variables + # Data variables for trace in timeseries: channel = trace.stats.channel var_name = f"GeomagneticField{channel}" + data_type = self._get_cdf_data_type(trace) + num_elements = 1 + CDF_CHAR = 51 + CDF_UCHAR = 52 + if data_type in [CDF_CHAR, CDF_UCHAR]: # Handle string types + num_elements = len(trace.data[0]) if len(trace.data) > 0 else 1 + var_spec = { - "Variable": var_name, - "Data_Type": self._get_cdf_data_type(trace), - "Num_Elements": 1, - "Rec_Vary": True, - "Dim_Sizes": [], - "Var_Type": "zVariable", + 
'Variable': var_name, + 'Data_Type': data_type, + 'Num_Elements': num_elements, + 'Rec_Vary': True, + 'Var_Type': 'zVariable', + 'Dim_Sizes': [], + 'Sparse': 'no_sparse', + 'Compress': 6, + 'Pad': None, } - print(f"Writing data variable {var_name} with data shape: {trace.data.shape}") - cdf_writer.write_var( - var_spec=var_spec, - var_attrs=self._create_var_attrs(trace), - var_data=trace.data, - ) + + var_attrs = self._create_var_attrs(trace) + + # Write data variable + cdf_writer.write_var(var_spec, var_attrs, trace.data) + + # Close the CDF writer + cdf_writer.close() # Copy the temporary CDF file to the final file handle with open(tmp_file_path, "rb") as tmp: cdf_data = tmp.read() fh.write(cdf_data) - cdf_writer.close() - finally: # Cleanup the temporary file - print(tmp_file_path) + os.remove(tmp_file_path) + def put_timeseries( self, @@ -336,11 +342,11 @@ class ImagCDFFactory(TimeseriesFactory): 'IagaCode': {0: station}, 'ElementsRecorded': {0: ''.join(channels)}, 'PublicationLevel': {0: publication_level}, - 'PublicationDate': {0: UTCDateTime.now().strftime("%Y-%m-%dT%H:%M:%SZ")}, + 'PublicationDate': {0: [cdflib.cdfepoch.timestamp_to_tt2000(datetime.timestamp(datetime.now(timezone.utc))), "cdf_time_tt2000"]}, 'ObservatoryName': {0: observatory_name}, - 'Latitude': {0: latitude}, - 'Longitude': {0: longitude}, - 'Elevation': {0: elevation}, + 'Latitude': {0: np.array([latitude], dtype=np.float64)}, + 'Longitude': {0: np.array([longitude], dtype=np.float64)}, + 'Elevation': {0: np.array([elevation], dtype=np.float64)}, 'Institution': {0: institution}, 'VectorSensOrient': {0: vector_orientation}, #remove F - because its a calculation, not an element? 
'StandardLevel': {0: 'None'}, # Set to 'None' @@ -436,7 +442,7 @@ class ImagCDFFactory(TimeseriesFactory): return var_spec def _create_var_attrs(self, trace: Trace) -> dict: - print(trace.stats) + # print(trace.stats) channel = trace.stats.channel fieldnam = f"Geomagnetic Field Element {channel}" # “Geomagnetic Field Element †+ the element code or “Temperature †+ the name of the location where the temperature was recorded. units = '' # Must be one of “nTâ€, “Degrees of arc†or “Celsius†-- GitLab From dc9cc949873e649788e69a21dd835b8888b4fdce Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Thu, 5 Dec 2024 11:25:57 -0800 Subject: [PATCH 03/30] minor adjustments for spec. temperature channel compatibility. --- geomagio/Controller.py | 5 ++ geomagio/ImagCDFFactory.py | 112 +++++++++++++++---------------------- geomagio/api/ws/Element.py | 3 + 3 files changed, 53 insertions(+), 67 deletions(-) diff --git a/geomagio/Controller.py b/geomagio/Controller.py index 564b54c9..40e6b173 100644 --- a/geomagio/Controller.py +++ b/geomagio/Controller.py @@ -7,6 +7,7 @@ from typing import List, Optional, Tuple, Union from obspy.core import Stream, UTCDateTime +from geomagio.ImagCDFFactory import ImagCDFFactory from .algorithm import Algorithm, algorithms, AlgorithmException, FilterAlgorithm from .DerivedTimeseriesFactory import DerivedTimeseriesFactory @@ -27,6 +28,7 @@ from . import vbf from . import xml from . import covjson from . 
import netcdf +from geomagio.ImagCDFFactory import ImagCDFFactory class Controller(object): @@ -630,6 +632,8 @@ def get_output_factory(args): ) elif output_type == "plot": output_factory = PlotTimeseriesFactory() + elif output_type == "imagcdf": + output_factory = ImagCDFFactory() else: # stream compatible factories if output_type == "binlog": @@ -1040,6 +1044,7 @@ def parse_args(args): "xml", "covjson", "netcdf", + "imagcdf", ), # TODO: set default to 'iaga2002' help="Output format", diff --git a/geomagio/ImagCDFFactory.py b/geomagio/ImagCDFFactory.py index 3189d15a..9644c23e 100644 --- a/geomagio/ImagCDFFactory.py +++ b/geomagio/ImagCDFFactory.py @@ -6,8 +6,9 @@ The ImagCDF format is based on NASA's Common Data Format (CDF) and is designed to store geomagnetic data with high precision. References: -- ImagCDF Format Documentation: https://intermagnet.org/docs/technical/im_tn_8_ImagCDF.pdf -- CDF Library: http://cdf.gsfc.nasa.gov/ +- ImagCDF Technical Note: https://intermagnet.org/docs/technical/im_tn_8_ImagCDF.pdf +- ImagCDF Official Documentation: https://tech-man.intermagnet.org/latest/appendices/dataformats.html#imagcdfv1-2-intermagnet-exchange-format +- CDF: http://cdf.gsfc.nasa.gov/ """ from __future__ import absolute_import, print_function @@ -20,6 +21,7 @@ import numpy as np from obspy import Stream, Trace, UTCDateTime from geomagio.TimeseriesFactory import TimeseriesFactory +from geomagio.api.ws.Element import TEMPERATURE_ELEMENTS_ID from .geomag_types import DataInterval, DataType from .TimeseriesFactoryException import TimeseriesFactoryException @@ -121,6 +123,7 @@ class ImagCDFFactory(TimeseriesFactory): Note: Parsing from strings is not implemented in this factory. 
""" raise NotImplementedError('"parse_string" not implemented') + def write_file(self, fh, timeseries: Stream, channels: List[str]): # Create a temporary file to write the CDF data with tempfile.NamedTemporaryFile(delete=False) as tmp_file: @@ -154,18 +157,24 @@ class ImagCDFFactory(TimeseriesFactory): # Write time variable cdf_writer.write_var(var_spec, var_attrs, ts_data) - + + # Data variables + temperature_index = 0 for trace in timeseries: channel = trace.stats.channel - var_name = f"GeomagneticField{channel}" + if channel in TEMPERATURE_ELEMENTS_ID: + temperature_index += 1 #MUST INCREMENT INDEX BEFORE USING + var_name = f"Temperature{temperature_index}" + else: + var_name = f"GeomagneticField{channel}" data_type = self._get_cdf_data_type(trace) num_elements = 1 CDF_CHAR = 51 CDF_UCHAR = 52 if data_type in [CDF_CHAR, CDF_UCHAR]: # Handle string types num_elements = len(trace.data[0]) if len(trace.data) > 0 else 1 - + var_spec = { 'Variable': var_name, 'Data_Type': data_type, @@ -178,7 +187,7 @@ class ImagCDFFactory(TimeseriesFactory): 'Pad': None, } - var_attrs = self._create_var_attrs(trace) + var_attrs = self._create_var_attrs(trace, temperature_index) # Write data variable cdf_writer.write_var(var_spec, var_attrs, trace.data) @@ -362,7 +371,8 @@ class ImagCDFFactory(TimeseriesFactory): def _create_time_stamp_variables(self, timeseries: Stream) -> dict: vector_times = None scalar_times = None - + temperature_times = {} + temperature_index = 1 for trace in timeseries: channel = trace.stats.channel times = [ @@ -371,7 +381,6 @@ class ImagCDFFactory(TimeseriesFactory): ] # Convert timestamps to TT2000 format required by CDF tt2000_times = cdflib.cdfepoch.timestamp_to_tt2000([time.timestamp() for time in times]) - # tt2000_times = cdflib.cdfepoch.compute_tt2000(times) #this does not work if channel in self._get_vector_elements(): if vector_times is None: @@ -385,16 +394,21 @@ class ImagCDFFactory(TimeseriesFactory): else: if not np.array_equal(scalar_times, 
tt2000_times): raise ValueError("Time stamps for scalar channels are not the same.") - else: - # Handle other channels if necessary - pass + elif channel in TEMPERATURE_ELEMENTS_ID: + ts_key = f"Temperature{temperature_index}Times" + if ts_key not in temperature_times: + temperature_times[ts_key] = tt2000_times + temperature_index += 1 + else: + temperature_times[ts_key] = tt2000_times time_vars = {} if vector_times is not None: time_vars['GeomagneticVectorTimes'] = vector_times if scalar_times is not None: time_vars['GeomagneticScalarTimes'] = scalar_times - + if temperature_times: + time_vars.update(temperature_times) return time_vars @@ -441,8 +455,7 @@ class ImagCDFFactory(TimeseriesFactory): } return var_spec - def _create_var_attrs(self, trace: Trace) -> dict: - # print(trace.stats) + def _create_var_attrs(self, trace: Trace, temperature_index: Optional[int] = None) -> dict: channel = trace.stats.channel fieldnam = f"Geomagnetic Field Element {channel}" # “Geomagnetic Field Element †+ the element code or “Temperature †+ the name of the location where the temperature was recorded. units = '' # Must be one of “nTâ€, “Degrees of arc†or “Celsius†@@ -454,10 +467,12 @@ class ImagCDFFactory(TimeseriesFactory): units = 'Degrees of arc' validmin = -90.0 validmax = 90.0 #The magnetic field vector can point straight down (+90°), horizontal (0°), or straight up (-90°). - elif 'Temperature' in channel: + elif channel in TEMPERATURE_ELEMENTS_ID: units = 'Celsius' - fieldnam = f"Temperature {trace.stats.location}" - elif channel == 'F': + fieldnam = f"Temperature {temperature_index} {trace.stats.location}" + validmin = -273.15 + validmax = 79_999 + elif channel in ['F','S']: units = 'nT' validmin = 0.0 # negative magnetic field intestity not physically meaningful. 
validmax = 79_999.0 @@ -466,12 +481,13 @@ class ImagCDFFactory(TimeseriesFactory): validmin = -79_999.0 validmax = 79_999.0 + # Determine DEPEND_0 based on channel type if channel in self._get_vector_elements(): depend_0 = 'GeomagneticVectorTimes' elif channel in self._get_scalar_elements(): depend_0 = 'GeomagneticScalarTimes' - else: - depend_0 = None # Handle other cases if necessary + elif channel in TEMPERATURE_ELEMENTS_ID: + depend_0 = f"Temperature{temperature_index}Times" var_attrs = { @@ -486,7 +502,6 @@ class ImagCDFFactory(TimeseriesFactory): } return var_attrs - def _create_time_var_attrs(self, ts_name: str) -> dict: """ Create a dictionary of time variable attributes. @@ -546,7 +561,7 @@ class ImagCDFFactory(TimeseriesFactory): stream = Stream() # Read time variables time_vars = {} - for var in cdf.cdf_info()['zVariables']: + for var in cdf.cdf_info().zVariables: if var.endswith('Time'): time_data = cdf.varget(var) # Convert TT2000 to UTCDateTime @@ -554,7 +569,7 @@ class ImagCDFFactory(TimeseriesFactory): time_vars[var] = utc_times # Read data variables - for var in cdf.cdf_info()['zVariables']: + for var in cdf.cdf_info().zVariables: if not var.endswith('Time'): data = cdf.varget(var) attrs = cdf.varattsget(var) @@ -635,14 +650,16 @@ class ImagCDFFactory(TimeseriesFactory): This method constructs the filename based on the ImagCDF naming conventions, which include the observatory code, date-time formatted - according to the data interval, and the publication level. + according to the data interval, and the publication level. + + [iaga-code]_[date-time]_[publication-level].cdf Parameters: - observatory: IAGA code of the observatory. - date: Start date for the file. - type: Data type indicating the processing level. - interval: Data interval (e.g., 'minute', 'second'). - - channels: List of channels (optional). + - channels: List of channels (optional). Not Implemented Returns: - The formatted file URL or path. 
@@ -653,7 +670,7 @@ class ImagCDFFactory(TimeseriesFactory): # Get the publication level for the type publication_level = IMCDFPublicationLevel(data_type=type).to_string() - # Determine filename date format based on interval + # Format of Date/Time Portion of Filename based on interval see reference: https://tech-man.intermagnet.org/latest/appendices/dataformats.html#example-data-file:~:text=Format%20of%20Date,%EF%83%81 if interval == "year": date_format = date.strftime("%Y") elif interval == "month": @@ -667,10 +684,9 @@ class ImagCDFFactory(TimeseriesFactory): elif interval == "second": date_format = date.strftime("%Y%m%d_%H%M%S") else: - raise ValueError(f"Unsupported interval: {interval}") + raise ValueError(f"Unsupported interval: {interval}") #tenhertz currently not supported - # Default filename following ImagCDF convention - # Filename format: [iaga-code]_[date-time]_[publication-level].cdf + # Filename following ImagCDF convention, see reference: https://tech-man.intermagnet.org/latest/appendices/dataformats.html#imagcdf-file-names filename = f"{observatory.lower()}_{date_format}_{publication_level}.cdf" # If the urlTemplate explicitly specifies 'stdout', return 'stdout' @@ -696,7 +712,7 @@ class ImagCDFFactory(TimeseriesFactory): } # Attempt to use the template provided in urlTemplate - if "{" in self.urlTemplate and "}" in self.urlTemplate: + if "{" in self.urlTemplate and "}" in self.urlTemplate: try: return self.urlTemplate.format(**params) except KeyError as e: @@ -714,44 +730,6 @@ class ImagCDFFactory(TimeseriesFactory): f"Unsupported URL scheme in urlTemplate: {self.urlTemplate}" ) - # Placeholder methods for interval and type names/abbreviations - def _get_interval_abbreviation(self, interval: DataInterval) -> str: - """Get the abbreviation for the data interval.""" - abbreviations = { - "year": "yr", - "month": "mon", - "day": "day", - "hour": "hr", - "minute": "min", - "second": "sec", - } - return abbreviations.get(interval, "min") - - def 
_get_interval_name(self, interval: DataInterval) -> str: - """Get the full name for the data interval.""" - names = { - "year": "yearly", - "month": "monthly", - "day": "daily", - "hour": "hourly", - "minute": "minute", - "second": "second", - } - return names.get(interval, "minute") - - def _get_type_name(self, type: DataType) -> str: - """Get the full name for the data type.""" - type_names = { - "variation": "variation", - "definitive": "definitive", - "quasi-definitive": "quasi-definitive", - "provisional": "provisional", - "adjusted": "adjusted", - "none": "none", - } - return type_names.get(type, "variation") - - def _get_vector_elements(self): return {'X', 'Y', 'Z', 'H', 'D', 'E', 'V', 'I', 'F'} diff --git a/geomagio/api/ws/Element.py b/geomagio/api/ws/Element.py index ae5c4ead..25b876c5 100644 --- a/geomagio/api/ws/Element.py +++ b/geomagio/api/ws/Element.py @@ -45,3 +45,6 @@ ELEMENTS = [ ] ELEMENT_INDEX = {e.id: e for e in ELEMENTS} + + +TEMPERATURE_ELEMENTS_ID = [element.id for element in ELEMENTS if "Temperature" in element.name] \ No newline at end of file -- GitLab From 41e88066399371437ef80b805a7e4b3e8ce4bc74 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Thu, 5 Dec 2024 13:05:31 -0800 Subject: [PATCH 04/30] use DataTimes variable name on non-unique time sets --- geomagio/ImagCDFFactory.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/geomagio/ImagCDFFactory.py b/geomagio/ImagCDFFactory.py index 9644c23e..157ad0cf 100644 --- a/geomagio/ImagCDFFactory.py +++ b/geomagio/ImagCDFFactory.py @@ -19,6 +19,7 @@ from typing import List, Optional, Union from datetime import datetime, timezone import numpy as np from obspy import Stream, Trace, UTCDateTime +from sqlalchemy import true from geomagio.TimeseriesFactory import TimeseriesFactory from geomagio.api.ws.Element import TEMPERATURE_ELEMENTS_ID @@ -87,6 +88,8 @@ class ImagCDFFactory(TimeseriesFactory): This class extends the 
TimeseriesFactory to support writing geomagnetic time series data to files in the ImagCDF format using the cdflib library. """ + + isUniqueTimes=True #used to determine depend_0 and CDF Time Variable Name def __init__( self, @@ -138,7 +141,7 @@ class ImagCDFFactory(TimeseriesFactory): cdf_writer.write_globalattrs(global_attrs) # Time variables - time_vars = self._create_time_stamp_variables(timeseries) + time_vars = self._create_time_stamp_variables(timeseries) #modifies self.isUniqueTimes for ts_name, ts_data in time_vars.items(): # Define time variable specification var_spec = { @@ -186,8 +189,7 @@ class ImagCDFFactory(TimeseriesFactory): 'Compress': 6, 'Pad': None, } - - var_attrs = self._create_var_attrs(trace, temperature_index) + var_attrs = self._create_var_attrs(trace, temperature_index, self.isUniqueTimes) # Write data variable cdf_writer.write_var(var_spec, var_attrs, trace.data) @@ -409,7 +411,15 @@ class ImagCDFFactory(TimeseriesFactory): time_vars['GeomagneticScalarTimes'] = scalar_times if temperature_times: time_vars.update(temperature_times) - return time_vars + + last_times = [] + self.isUniqueTimes = len(time_vars) == 1 #true if only one set of times, else default to false. + for index, times in enumerate(time_vars.values()): + if index > 0: + self.isUniqueTimes = not np.array_equal(last_times, times) + last_times = times + + return time_vars if self.isUniqueTimes else {"DataTimes": last_times} def _create_var_spec( @@ -455,7 +465,7 @@ class ImagCDFFactory(TimeseriesFactory): } return var_spec - def _create_var_attrs(self, trace: Trace, temperature_index: Optional[int] = None) -> dict: + def _create_var_attrs(self, trace: Trace, temperature_index: Optional[int] = None, isUniqueTimes: Optional[bool] = True) -> dict: channel = trace.stats.channel fieldnam = f"Geomagnetic Field Element {channel}" # “Geomagnetic Field Element †+ the element code or “Temperature †+ the name of the location where the temperature was recorded. 
units = '' # Must be one of “nTâ€, “Degrees of arc†or “Celsius†@@ -496,7 +506,7 @@ class ImagCDFFactory(TimeseriesFactory): 'FILLVAL': 99999.0, 'VALIDMIN': validmin, 'VALIDMAX': validmax, - 'DEPEND_0': depend_0, + 'DEPEND_0': depend_0 if isUniqueTimes else "DataTimes", 'DISPLAY_TYPE': 'time_series', 'LABLAXIS': channel, } -- GitLab From 68798d807806a6d681af5b93d770fd4e23e9ce92 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Thu, 5 Dec 2024 15:51:48 -0800 Subject: [PATCH 05/30] with compression at the file level --- geomagio/ImagCDFFactory.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/geomagio/ImagCDFFactory.py b/geomagio/ImagCDFFactory.py index 157ad0cf..0acf6ece 100644 --- a/geomagio/ImagCDFFactory.py +++ b/geomagio/ImagCDFFactory.py @@ -129,12 +129,20 @@ class ImagCDFFactory(TimeseriesFactory): def write_file(self, fh, timeseries: Stream, channels: List[str]): # Create a temporary file to write the CDF data - with tempfile.NamedTemporaryFile(delete=False) as tmp_file: - tmp_file_path = tmp_file.name + ".cdf" + with tempfile.NamedTemporaryFile(delete=False, suffix='.cdf') as tmp_file: + tmp_file_path = tmp_file.name try: # Initialize the CDF writer - cdf_writer = cdflib.cdfwrite.CDF(tmp_file_path) + cdf_spec = { + 'Compressed': 9, # Enable compression (0-9) + # 'Majority': 'row_major', # Data layout - gets set automatically + # 'Encoding': 'ibmpc', # Corrupts CDF - gets set automatically + 'Checksum': True, # Disable checksum for faster writes (optional) + 'rDim_sizes': [], # Applicable only if using rVariables - CDF protocol recommends only using zVariables. 
+ } + + cdf_writer = cdflib.cdfwrite.CDF(path=tmp_file_path, cdf_spec=cdf_spec, delete=True) # Write global attributes global_attrs = self._create_global_attributes(timeseries, channels) @@ -206,7 +214,6 @@ class ImagCDFFactory(TimeseriesFactory): # Cleanup the temporary file os.remove(tmp_file_path) - def put_timeseries( self, timeseries: Stream, @@ -288,7 +295,7 @@ class ImagCDFFactory(TimeseriesFactory): # Read existing data to merge with new data existing_cdf = cdflib.cdfread.CDF(url_file) existing_stream = self._read_cdf(existing_cdf) - existing_cdf.close() + # existing_cdf.close() #no close method? existing_data = existing_stream # Merge existing data with new data @@ -480,7 +487,7 @@ class ImagCDFFactory(TimeseriesFactory): elif channel in TEMPERATURE_ELEMENTS_ID: units = 'Celsius' fieldnam = f"Temperature {temperature_index} {trace.stats.location}" - validmin = -273.15 + validmin = -273.15 #absolute zero validmax = 79_999 elif channel in ['F','S']: units = 'nT' -- GitLab From a0403695353a53a9be004df2c4b2e6ef1f63424e Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Fri, 6 Dec 2024 15:43:01 -0800 Subject: [PATCH 06/30] poc for reading cdf into obspy stream. 
working for imag to iaga --- geomagio/Controller.py | 3 + geomagio/ImagCDFFactory.py | 296 +++++++++++++++++++++++++++---------- 2 files changed, 219 insertions(+), 80 deletions(-) diff --git a/geomagio/Controller.py b/geomagio/Controller.py index 40e6b173..96ebb28c 100644 --- a/geomagio/Controller.py +++ b/geomagio/Controller.py @@ -547,6 +547,8 @@ def get_input_factory(args): locationCode=args.locationcode, **input_factory_args, ) + elif input_type == "imagcdf": + input_factory = ImagCDFFactory(**input_factory_args) else: # stream compatible factories if input_type == "iaga2002": @@ -837,6 +839,7 @@ def parse_args(args): "xml", "covjson", "netcdf", + "imagcdf" ), default="edge", help='Input format (Default "edge")', diff --git a/geomagio/ImagCDFFactory.py b/geomagio/ImagCDFFactory.py index 0acf6ece..6ba44f3e 100644 --- a/geomagio/ImagCDFFactory.py +++ b/geomagio/ImagCDFFactory.py @@ -8,7 +8,8 @@ to store geomagnetic data with high precision. References: - ImagCDF Technical Note: https://intermagnet.org/docs/technical/im_tn_8_ImagCDF.pdf - ImagCDF Official Documentation: https://tech-man.intermagnet.org/latest/appendices/dataformats.html#imagcdfv1-2-intermagnet-exchange-format -- CDF: http://cdf.gsfc.nasa.gov/ +- CDF User Guide: https://spdf.gsfc.nasa.gov/pub/software/cdf/doc/cdf_User_Guide.pdf +- CDFLib Docs: [https://cdflib.readthedocs.io/en/latest/#, https://cdflib.readthedocs.io/en/stable/modules/index.html] """ from __future__ import absolute_import, print_function @@ -16,7 +17,7 @@ from io import BytesIO import os import sys from typing import List, Optional, Union -from datetime import datetime, timezone +from datetime import datetime, timezone, tzinfo import numpy as np from obspy import Stream, Trace, UTCDateTime from sqlalchemy import true @@ -330,6 +331,84 @@ class ImagCDFFactory(TimeseriesFactory): # Unsupported URL scheme encountered raise TimeseriesFactoryException("Unsupported URL scheme in urlTemplate") + def get_timeseries( + self, + starttime: 
UTCDateTime, + endtime: UTCDateTime, + add_empty_channels: bool = True, + observatory: Optional[str] = None, + channels: Optional[List[str]] = None, + type: Optional[DataType] = None, + interval: Optional[DataInterval] = None, + ) -> Stream: + observatory = observatory or self.observatory + channels = channels or self.channels + type = type or self.type + interval = interval or self.interval + + timeseries = Stream() + urlIntervals = Util.get_intervals( + starttime=starttime, endtime=endtime, size=self.urlInterval + ) + + for urlInterval in urlIntervals: + url = self._get_url( + observatory=observatory, + date=urlInterval["start"], + type=type, + interval=interval, + channels=channels, + ) + if url == 'stdout': + continue # stdout is not a valid input source + if not url.startswith("file://"): + raise TimeseriesFactoryException("Only file urls are supported for reading ImagCDF") + + url_file = Util.get_file_from_url(url, createParentDirectory=False) + if not os.path.isfile(url_file): + # If file doesn't exist, skip + continue + + try: + # Read CDF data and merge + cdf = cdflib.cdfread.CDF(url_file) + file_stream = self._read_cdf(cdf) + # Attempt to select only requested channels + selected = Stream() + for ch in channels: + selected += file_stream.select(channel=ch) + timeseries += selected + except Exception as e: + print(f"Error reading CDF file '{url_file}': {e}", file=sys.stderr) + + # After reading all intervals, merge and trim + timeseries.merge(fill_value=np.nan) + timeseries.trim( + starttime=starttime, + endtime=endtime, + nearest_sample=False, + pad=True, + fill_value=np.nan, + ) + + # If requested, add empty channels not present in data + if add_empty_channels: + present_channels = {tr.stats.channel for tr in timeseries} + missing_channels = set(channels) - present_channels + for ch in missing_channels: + empty_trace = self._get_empty_trace( + starttime=starttime, + endtime=endtime, + observatory=observatory, + channel=ch, + data_type=type, + 
interval=interval, + ) + timeseries += empty_trace + + timeseries.sort() + return timeseries + def _create_global_attributes(self, timeseries: Stream, channels: List[str]) -> dict: """ Create a dictionary of global attributes for the ImagCDF file. @@ -344,14 +423,19 @@ class ImagCDFFactory(TimeseriesFactory): stats = timeseries[0].stats if len(timeseries) > 0 else None # Extract metadata from stats or fallback to defaults - observatory_name = getattr(stats, 'station_name', None) or self.observatory or "Unknown Observatory" - station = getattr(stats, 'station', None) or "Unknown Iaga Code" - institution = getattr(stats, 'agency_name', None) or "Unknown Institution" + observatory_name = getattr(stats, 'station_name', None) or self.observatory or "" + station = getattr(stats, 'station', None) or "" + institution = getattr(stats, 'agency_name', None) or "" latitude = getattr(stats, 'geodetic_latitude', None) or 0.0 longitude = getattr(stats, 'geodetic_longitude', None) or 0.0 elevation = getattr(stats, 'elevation', None) or 99_999.0 + conditions_of_use = getattr(stats, 'conditions_of_use', None) or "" vector_orientation = getattr(stats, 'sensor_orientation', None) or "" data_interval_type = getattr(stats, 'data_interval_type', None) or self.interval + data_type = getattr(stats, 'data_type', None) or "variation" + sensor_sampling_rate = getattr(stats, 'sensor_sampling_rate', None) or 0.0 + comments = getattr(stats, 'filter_comments', None) or [''] + declination_base = getattr(stats, 'declination_base', None) or 0.0 publication_level = IMCDFPublicationLevel(data_type=self.type).to_string() global_attrs = { 'FormatDescription': {0: 'INTERMAGNET CDF Format'}, @@ -370,10 +454,14 @@ class ImagCDFFactory(TimeseriesFactory): 'StandardLevel': {0: 'None'}, # Set to 'None' # Temporarily Omit 'StandardName', 'StandardVersion', 'PartialStandDesc' 'Source': {0: 'institute'}, # "institute" - if the named institution provided the data, “INTERMAGNET†- if the data file has been created 
by INTERMAGNET from another data source, “WDC†- if the World Data Centre has created the file from another data source - # 'TermsOfUse': {0: self.getINTERMAGNETTermsOfUse()}, + 'TermsOfUse': {0: conditions_of_use}, # 'UniqueIdentifier': {0: ''}, # 'ParentIdentifiers': {0: ''}, # 'ReferenceLinks': {0: ''}, #links to /ws, plots, USGS.gov + 'SensorSamplingRate': {0: sensor_sampling_rate}, + 'DataType': {0: data_type}, + 'Comments': {0: comments}, + 'DeclinationBase': {0: declination_base}, } return global_attrs @@ -382,10 +470,12 @@ class ImagCDFFactory(TimeseriesFactory): scalar_times = None temperature_times = {} temperature_index = 1 + print(timeseries[0].stats) + for trace in timeseries: channel = trace.stats.channel times = [ - (trace.stats.starttime + trace.stats.delta * i).datetime + (trace.stats.starttime + trace.stats.delta * i).datetime.replace(tzinfo=timezone.utc) for i in range(trace.stats.npts) ] # Convert timestamps to TT2000 format required by CDF @@ -473,7 +563,7 @@ class ImagCDFFactory(TimeseriesFactory): return var_spec def _create_var_attrs(self, trace: Trace, temperature_index: Optional[int] = None, isUniqueTimes: Optional[bool] = True) -> dict: - channel = trace.stats.channel + channel = trace.stats.channel.upper() fieldnam = f"Geomagnetic Field Element {channel}" # “Geomagnetic Field Element †+ the element code or “Temperature †+ the name of the location where the temperature was recorded. units = '' # Must be one of “nTâ€, “Degrees of arc†or “Celsius†if channel == 'D': @@ -516,6 +606,7 @@ class ImagCDFFactory(TimeseriesFactory): 'DEPEND_0': depend_0 if isUniqueTimes else "DataTimes", 'DISPLAY_TYPE': 'time_series', 'LABLAXIS': channel, + 'DATA_INTERVAL_TYPE': trace.stats.data_interval_type } return var_attrs @@ -576,83 +667,126 @@ class ImagCDFFactory(TimeseriesFactory): - An ObsPy Stream containing the data from the CDF file. 
""" stream = Stream() - # Read time variables + + # Extract global attributes + global_attrs = cdf.globalattsget() + + # Map global attributes to Stream-level metadata + observatory = global_attrs.get('IagaCode', [''])[0] + station_name = global_attrs.get('ObservatoryName', [''])[0] + institution = global_attrs.get('Institution', [''])[0] + latitude = global_attrs.get('Latitude', [0.0])[0] + longitude = global_attrs.get('Longitude', [0.0])[0] + elevation = global_attrs.get('Elevation', [99_999.0])[0] + sensor_sampling_rate = global_attrs.get('SensorSamplingRate', [0.0])[0] + sensor_orientation = global_attrs.get('VectorSensOrient', [''])[0] + data_type = global_attrs.get('DataType', ['variation'])[0] + publication_level = global_attrs.get('PublicationLevel', ['1'])[0] + comments = global_attrs.get('Comments', ['']) + terms_of_use = global_attrs.get('TermsOfUse', [''])[0] + declination_base = global_attrs.get('DeclinationBase', [0.0])[0] + + # Identify all time variables time_vars = {} for var in cdf.cdf_info().zVariables: - if var.endswith('Time'): + if var.lower().endswith('times'): time_data = cdf.varget(var) - # Convert TT2000 to UTCDateTime - utc_times = [UTCDateTime(t) for t in cdflib.cdfepoch.to_datetime(time_data)] + unix_times = cdflib.cdfepoch.unixtime(time_data) + utc_times = [UTCDateTime(t) for t in unix_times] time_vars[var] = utc_times - # Read data variables + # Read data variables and associate them with time variables for var in cdf.cdf_info().zVariables: - if not var.endswith('Time'): - data = cdf.varget(var) - attrs = cdf.varattsget(var) - if 'DEPEND_0' in attrs: - ts_name = attrs['DEPEND_0'] - if ts_name in time_vars: - times = time_vars[ts_name] - if len(times) > 1: - delta = times[1] - times[0] # Calculate sample interval - else: - delta = 60 if self.interval == 'minute' else 1 - trace = Trace( - data=data, - header={ - 'station': self.observatory, - 'channel': var, - 'starttime': times[0], - 'delta': delta, - } - ) - stream += trace - return 
stream - - @staticmethod - def getINTERMAGNETTermsOfUse() -> str: - """ - Return the INTERMAGNET Terms of Use. - - These terms should be included in the 'TermsOfUse' global attribute - as per the ImagCDF specification. + # Skip time variables + if var.lower().endswith('times'): + continue + + data = cdf.varget(var) + attrs = cdf.varattsget(var) + + # Determine DEPEND_0 (the time variable name) and validate + ts_name = attrs.get('DEPEND_0') + if not ts_name: + # If no DEPEND_0, skip this variable as we cannot map times + continue + + # If we used a DataTimes fallback or similar, ensure we handle it case-insensitively + # and also confirm that time_vars keys are checked properly. + # The ImagCDF can have DataTimes, GeomagneticVectorTimes, etc. + # So try exact match first, if not found, attempt case-insensitive. + matched_time_key = None + for tkey in time_vars.keys(): + if tkey == ts_name: + matched_time_key = tkey + break + if tkey.lower() == ts_name.lower(): + matched_time_key = tkey + break + + if matched_time_key not in time_vars: + # If we cannot find the matching time variable, skip this variable + continue + + times = time_vars[matched_time_key] + + # Determine delta (sample interval) + if len(times) > 1: + # delta as a float of seconds + delta = (times[1].timestamp - times[0].timestamp) + else: + # if only one sample, use default based on interval + # convert minute, second, etc. to numeric delta + if self.interval == 'minute': + delta = 60.0 + elif self.interval == 'second': + delta = 1.0 + elif self.interval == 'hour': + delta = 3600.0 + else: + # fallback, set delta to 60 + delta = 60.0 + + # Map the variable name back to a standard channel code + # Geomagnetic fields are named like GeomagneticFieldH, GeomagneticFieldD, etc. + # Temperatures are named like Temperature1, Temperature2, ... 
+ # Extract channel name by removing known prefixes + if var.startswith("GeomagneticField"): + channel = var.replace("GeomagneticField", "") + elif var.startswith("Temperature"): + # Temperature variables may not map directly to a geomagnetic channel + # but to temperature sensors. We can just use the label from LABLAXIS if needed + channel = attrs.get('LABLAXIS', var) + else: + # fallback if naming doesn't match expected patterns + channel = var + + time_attrs =cdf.varattsget(var) + data_interval = time_attrs.get('DATA_INTERVAL_TYPE', ['']) + # Create a trace + trace = Trace( + data=data, + header={ + 'station': observatory, + 'channel': channel, + 'starttime': times[0], + 'delta': delta, + 'geodetic_latitude': latitude, + 'geodetic_longitude': longitude, + 'elevation': elevation, + 'sensor_orientation': "".join(sensor_orientation), + 'data_type': data_type, + 'station_name': station_name, + 'agency_name': institution, + 'conditions_of_use': terms_of_use, + 'sensor_sampling_rate': sensor_sampling_rate, + 'data_interval_type': data_interval, + 'declination_base': declination_base, + 'filter_comments': comments + } + ) + stream += trace - Reference: - - ImagCDF Documentation Section 4.5: Attributes that Relate to Publication of the Data - """ - return ( - "CONDITIONS OF USE FOR DATA PROVIDED THROUGH INTERMAGNET:\n" - "The data made available through INTERMAGNET are provided for\n" - "your use and are not for commercial use or sale or distribution\n" - "to third parties without the written permission of the institute\n" - "(http://www.intermagnet.org/Institutes_e.html) operating\n" - "the observatory. 
Publications making use of the data\n" - "should include an acknowledgment statement of the form given below.\n" - "A citation reference should be sent to the INTERMAGNET Secretary\n" - "(secretary@intermagnet.org) for inclusion in a publications list\n" - "on the INTERMAGNET website.\n" - "\n" - " ACKNOWLEDGEMENT OF DATA FROM OBSERVATORIES\n" - " PARTICIPATING IN INTERMAGNET\n" - "We offer two acknowledgement templates. The first is for cases\n" - "where data from many observatories have been used and it is not\n" - "practical to list them all, or each of their operating institutes.\n" - "The second is for cases where research results have been produced\n" - "using a smaller set of observatories.\n" - "\n" - " Suggested Acknowledgement Text (template 1)\n" - "The results presented in this paper rely on data collected\n" - "at magnetic observatories. We thank the national institutes that\n" - "support them and INTERMAGNET for promoting high standards of\n" - "magnetic observatory practice (www.intermagnet.org).\n" - "\n" - " Suggested Acknowledgement Text (template 2)\n" - "The results presented in this paper rely on the data\n" - "collected at <observatory name>. 
We thank <institute name>,\n" - "for supporting its operation and INTERMAGNET for promoting high\n" - "standards of magnetic observatory practice (www.intermagnet.org).\n" - ) + return stream def _get_url( self, @@ -725,7 +859,7 @@ class ImagCDFFactory(TimeseriesFactory): "julian": date.strftime("%j"), "year": date.strftime("%Y"), "ymd": date.strftime("%Y%m%d"), - "dt": date_format, # Add the date-time formatted string + "dt": date_format, } # Attempt to use the template provided in urlTemplate @@ -751,4 +885,6 @@ class ImagCDFFactory(TimeseriesFactory): return {'X', 'Y', 'Z', 'H', 'D', 'E', 'V', 'I', 'F'} def _get_scalar_elements(self): - return {'S', 'G'} \ No newline at end of file + return {'S', 'G'} + + -- GitLab From ee606ebb6445d443f368c12d3b931893ecbc7d16 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Mon, 9 Dec 2024 13:10:09 -0800 Subject: [PATCH 07/30] general clean up --- geomagio/ImagCDFFactory.py | 56 ++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/geomagio/ImagCDFFactory.py b/geomagio/ImagCDFFactory.py index 6ba44f3e..3d8ba136 100644 --- a/geomagio/ImagCDFFactory.py +++ b/geomagio/ImagCDFFactory.py @@ -31,6 +31,8 @@ from . import TimeseriesUtility from . import Util import cdflib +from cdflib.cdfwrite import CDF as CDFWriter +from cdflib.cdfread import CDF as CDFReader import tempfile @@ -48,7 +50,7 @@ class IMCDFPublicationLevel: 4: Definitive data with no further changes expected. 
Reference: - - ImagCDF Documentation Section 4.2: Attributes that Uniquely Identify the Data + - ImagCDF Technical Documentation: Attributes that Uniquely Identify the Data """ class PublicationLevel: @@ -137,13 +139,13 @@ class ImagCDFFactory(TimeseriesFactory): # Initialize the CDF writer cdf_spec = { 'Compressed': 9, # Enable compression (0-9) - # 'Majority': 'row_major', # Data layout - gets set automatically - # 'Encoding': 'ibmpc', # Corrupts CDF - gets set automatically + 'Majority': CDFWriter.ROW_MAJOR, # Data layout - gets set automatically + 'Encoding': CDFWriter.IBMPC_ENCODING, # gets set automatically 'Checksum': True, # Disable checksum for faster writes (optional) 'rDim_sizes': [], # Applicable only if using rVariables - CDF protocol recommends only using zVariables. } - cdf_writer = cdflib.cdfwrite.CDF(path=tmp_file_path, cdf_spec=cdf_spec, delete=True) + cdf_writer = CDFWriter(path=tmp_file_path, cdf_spec=cdf_spec, delete=True) # Write global attributes global_attrs = self._create_global_attributes(timeseries, channels) @@ -155,13 +157,13 @@ class ImagCDFFactory(TimeseriesFactory): # Define time variable specification var_spec = { 'Variable': ts_name, - 'Data_Type': 33, # CDF_TIME_TT2000 + 'Data_Type': CDFWriter.CDF_TIME_TT2000, # CDF_TIME_TT2000 'Num_Elements': 1, 'Rec_Vary': True, 'Var_Type': 'zVariable', 'Dim_Sizes': [], 'Sparse': 'no_sparse', - 'Compress': 6, + 'Compress': 9, 'Pad': None, } # Define time variable attributes @@ -182,9 +184,7 @@ class ImagCDFFactory(TimeseriesFactory): var_name = f"GeomagneticField{channel}" data_type = self._get_cdf_data_type(trace) num_elements = 1 - CDF_CHAR = 51 - CDF_UCHAR = 52 - if data_type in [CDF_CHAR, CDF_UCHAR]: # Handle string types + if data_type in [CDFWriter.CDF_CHAR, CDFWriter.CDF_UCHAR]: # Handle string types num_elements = len(trace.data[0]) if len(trace.data) > 0 else 1 var_spec = { @@ -195,7 +195,7 @@ class ImagCDFFactory(TimeseriesFactory): 'Var_Type': 'zVariable', 'Dim_Sizes': [], 'Sparse': 
'no_sparse', - 'Compress': 6, + 'Compress': 9, 'Pad': None, } var_attrs = self._create_var_attrs(trace, temperature_index, self.isUniqueTimes) @@ -294,7 +294,7 @@ class ImagCDFFactory(TimeseriesFactory): if os.path.isfile(url_file): try: # Read existing data to merge with new data - existing_cdf = cdflib.cdfread.CDF(url_file) + existing_cdf = CDFReader(url_file) existing_stream = self._read_cdf(existing_cdf) # existing_cdf.close() #no close method? existing_data = existing_stream @@ -371,7 +371,7 @@ class ImagCDFFactory(TimeseriesFactory): try: # Read CDF data and merge - cdf = cdflib.cdfread.CDF(url_file) + cdf = CDFReader(url_file) file_stream = self._read_cdf(cdf) # Attempt to select only requested channels selected = Stream() @@ -418,7 +418,7 @@ class ImagCDFFactory(TimeseriesFactory): descriptions. References: - - ImagCDF Documentation Section 4: ImagCDF Global Attributes + - ImagCDF Technical Documentation: ImagCDF Global Attributes """ stats = timeseries[0].stats if len(timeseries) > 0 else None @@ -458,10 +458,10 @@ class ImagCDFFactory(TimeseriesFactory): # 'UniqueIdentifier': {0: ''}, # 'ParentIdentifiers': {0: ''}, # 'ReferenceLinks': {0: ''}, #links to /ws, plots, USGS.gov - 'SensorSamplingRate': {0: sensor_sampling_rate}, - 'DataType': {0: data_type}, - 'Comments': {0: comments}, - 'DeclinationBase': {0: declination_base}, + 'SensorSamplingRate': {0: sensor_sampling_rate}, #Optional + 'DataType': {0: data_type},#Optional + 'Comments': {0: comments}, #Optional + 'DeclinationBase': {0: declination_base}, #Optional } return global_attrs @@ -470,7 +470,6 @@ class ImagCDFFactory(TimeseriesFactory): scalar_times = None temperature_times = {} temperature_index = 1 - print(timeseries[0].stats) for trace in timeseries: channel = trace.stats.channel @@ -615,9 +614,9 @@ class ImagCDFFactory(TimeseriesFactory): Create a dictionary of time variable attributes. These attributes provide metadata for time variables. 
- Note: None of these attributes are required for the time stamp variables GeomagneticVectorTimes and GeomagneticScalarTimes. + Note: None of these attributes are required for the time stamp variables. Reference: - - ImagCDF Documentation Section 3: ImagCDF Data + - ImagCDF Technical Documentation: ImagCDF Data """ # var_attrs = { # 'UNITS': 'TT2000', @@ -639,21 +638,18 @@ class ImagCDFFactory(TimeseriesFactory): - CDF_INT4 (41) for integer data. Reference: - - CDF Data Types: http://cdf.gsfc.nasa.gov/html/cdfdatatypes.html + - See CDF for more data types """ - # CDF data type constants - CDF_DOUBLE = 45 # CDF_DOUBLE corresponds to 64-bit float - CDF_INT4 = 41 # CDF_INT4 corresponds to 32-bit int if trace.data.dtype in [np.float32, np.float64]: - return CDF_DOUBLE + return CDFWriter.CDF_DOUBLE elif trace.data.dtype in [np.int32, np.int64]: - return CDF_INT4 + return CDFWriter.CDF_INT4 else: # Default to double precision float - return CDF_DOUBLE + return CDFWriter.CDF_DOUBLE - def _read_cdf(self, cdf: cdflib.cdfread.CDF) -> Stream: + def _read_cdf(self, cdf: CDFReader) -> Stream: """ Read CDF data into an ObsPy Stream. @@ -677,7 +673,7 @@ class ImagCDFFactory(TimeseriesFactory): institution = global_attrs.get('Institution', [''])[0] latitude = global_attrs.get('Latitude', [0.0])[0] longitude = global_attrs.get('Longitude', [0.0])[0] - elevation = global_attrs.get('Elevation', [99_999.0])[0] + elevation = global_attrs.get('Elevation', [99_999.0])[0] #default to 99_999 per technical documents. sensor_sampling_rate = global_attrs.get('SensorSamplingRate', [0.0])[0] sensor_orientation = global_attrs.get('VectorSensOrient', [''])[0] data_type = global_attrs.get('DataType', ['variation'])[0] @@ -816,7 +812,7 @@ class ImagCDFFactory(TimeseriesFactory): - The formatted file URL or path. 
Reference: - - ImagCDF Documentation Section 5: ImagCDF File Names + - ImagCDF Technical Documentation: ImagCDF File Names """ # Get the publication level for the type publication_level = IMCDFPublicationLevel(data_type=type).to_string() -- GitLab From efa2c0dba84474303572fee39926c03575049b85 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Mon, 9 Dec 2024 13:15:17 -0800 Subject: [PATCH 08/30] mvp for reading writing imagcdf --- geomagio/ImagCDFFactory.py | 407 +++++++++++++++++++++---------------- 1 file changed, 228 insertions(+), 179 deletions(-) diff --git a/geomagio/ImagCDFFactory.py b/geomagio/ImagCDFFactory.py index 3d8ba136..ea947903 100644 --- a/geomagio/ImagCDFFactory.py +++ b/geomagio/ImagCDFFactory.py @@ -91,8 +91,8 @@ class ImagCDFFactory(TimeseriesFactory): This class extends the TimeseriesFactory to support writing geomagnetic time series data to files in the ImagCDF format using the cdflib library. """ - - isUniqueTimes=True #used to determine depend_0 and CDF Time Variable Name + + isUniqueTimes = True # used to determine depend_0 and CDF Time Variable Name def __init__( self, @@ -129,20 +129,20 @@ class ImagCDFFactory(TimeseriesFactory): Note: Parsing from strings is not implemented in this factory. 
""" raise NotImplementedError('"parse_string" not implemented') - + def write_file(self, fh, timeseries: Stream, channels: List[str]): # Create a temporary file to write the CDF data - with tempfile.NamedTemporaryFile(delete=False, suffix='.cdf') as tmp_file: + with tempfile.NamedTemporaryFile(delete=False, suffix=".cdf") as tmp_file: tmp_file_path = tmp_file.name try: # Initialize the CDF writer cdf_spec = { - 'Compressed': 9, # Enable compression (0-9) - 'Majority': CDFWriter.ROW_MAJOR, # Data layout - gets set automatically - 'Encoding': CDFWriter.IBMPC_ENCODING, # gets set automatically - 'Checksum': True, # Disable checksum for faster writes (optional) - 'rDim_sizes': [], # Applicable only if using rVariables - CDF protocol recommends only using zVariables. + "Compressed": 9, # Enable compression (0-9) + "Majority": CDFWriter.ROW_MAJOR, # Data layout - gets set automatically + "Encoding": CDFWriter.IBMPC_ENCODING, # gets set automatically + "Checksum": True, # Disable checksum for faster writes (optional) + "rDim_sizes": [], # Applicable only if using rVariables - CDF protocol recommends only using zVariables. 
} cdf_writer = CDFWriter(path=tmp_file_path, cdf_spec=cdf_spec, delete=True) @@ -152,53 +152,59 @@ class ImagCDFFactory(TimeseriesFactory): cdf_writer.write_globalattrs(global_attrs) # Time variables - time_vars = self._create_time_stamp_variables(timeseries) #modifies self.isUniqueTimes + time_vars = self._create_time_stamp_variables( + timeseries + ) # modifies self.isUniqueTimes for ts_name, ts_data in time_vars.items(): # Define time variable specification var_spec = { - 'Variable': ts_name, - 'Data_Type': CDFWriter.CDF_TIME_TT2000, # CDF_TIME_TT2000 - 'Num_Elements': 1, - 'Rec_Vary': True, - 'Var_Type': 'zVariable', - 'Dim_Sizes': [], - 'Sparse': 'no_sparse', - 'Compress': 9, - 'Pad': None, + "Variable": ts_name, + "Data_Type": CDFWriter.CDF_TIME_TT2000, # CDF_TIME_TT2000 + "Num_Elements": 1, + "Rec_Vary": True, + "Var_Type": "zVariable", + "Dim_Sizes": [], + "Sparse": "no_sparse", + "Compress": 9, + "Pad": None, } # Define time variable attributes var_attrs = self._create_time_var_attrs(ts_name) # Write time variable cdf_writer.write_var(var_spec, var_attrs, ts_data) - - + # Data variables temperature_index = 0 for trace in timeseries: channel = trace.stats.channel if channel in TEMPERATURE_ELEMENTS_ID: - temperature_index += 1 #MUST INCREMENT INDEX BEFORE USING + temperature_index += 1 # MUST INCREMENT INDEX BEFORE USING var_name = f"Temperature{temperature_index}" else: var_name = f"GeomagneticField{channel}" data_type = self._get_cdf_data_type(trace) num_elements = 1 - if data_type in [CDFWriter.CDF_CHAR, CDFWriter.CDF_UCHAR]: # Handle string types + if data_type in [ + CDFWriter.CDF_CHAR, + CDFWriter.CDF_UCHAR, + ]: # Handle string types num_elements = len(trace.data[0]) if len(trace.data) > 0 else 1 - + var_spec = { - 'Variable': var_name, - 'Data_Type': data_type, - 'Num_Elements': num_elements, - 'Rec_Vary': True, - 'Var_Type': 'zVariable', - 'Dim_Sizes': [], - 'Sparse': 'no_sparse', - 'Compress': 9, - 'Pad': None, + "Variable": var_name, + 
"Data_Type": data_type, + "Num_Elements": num_elements, + "Rec_Vary": True, + "Var_Type": "zVariable", + "Dim_Sizes": [], + "Sparse": "no_sparse", + "Compress": 9, + "Pad": None, } - var_attrs = self._create_var_attrs(trace, temperature_index, self.isUniqueTimes) + var_attrs = self._create_var_attrs( + trace, temperature_index, self.isUniqueTimes + ) # Write data variable cdf_writer.write_var(var_spec, var_attrs, trace.data) @@ -271,7 +277,7 @@ class ImagCDFFactory(TimeseriesFactory): ) # Handle 'stdout' output - if url == 'stdout': + if url == "stdout": # Write directly to stdout fh = sys.stdout.buffer url_data = timeseries.slice( @@ -282,7 +288,7 @@ class ImagCDFFactory(TimeseriesFactory): continue # Proceed to next interval if any # Handle 'file://' output - elif url.startswith('file://'): + elif url.startswith("file://"): # Get the file path from the URL url_file = Util.get_file_from_url(url, createParentDirectory=False) url_data = timeseries.slice( @@ -307,11 +313,16 @@ class ImagCDFFactory(TimeseriesFactory): channel=trace.stats.channel, ) if new_trace: - trace.data = np.concatenate((trace.data, new_trace[0].data)) + trace.data = np.concatenate( + (trace.data, new_trace[0].data) + ) url_data = existing_data + url_data except Exception as e: # Log the exception if needed - print(f"Warning: Could not read existing CDF file '{url_file}': {e}", file=sys.stderr) + print( + f"Warning: Could not read existing CDF file '{url_file}': {e}", + file=sys.stderr, + ) # Proceed with new data # Pad the data with NaNs to ensure it fits the interval @@ -329,7 +340,9 @@ class ImagCDFFactory(TimeseriesFactory): else: # Unsupported URL scheme encountered - raise TimeseriesFactoryException("Unsupported URL scheme in urlTemplate") + raise TimeseriesFactoryException( + "Unsupported URL scheme in urlTemplate" + ) def get_timeseries( self, @@ -359,10 +372,12 @@ class ImagCDFFactory(TimeseriesFactory): interval=interval, channels=channels, ) - if url == 'stdout': + if url == "stdout": 
continue # stdout is not a valid input source if not url.startswith("file://"): - raise TimeseriesFactoryException("Only file urls are supported for reading ImagCDF") + raise TimeseriesFactoryException( + "Only file urls are supported for reading ImagCDF" + ) url_file = Util.get_file_from_url(url, createParentDirectory=False) if not os.path.isfile(url_file): @@ -409,7 +424,9 @@ class ImagCDFFactory(TimeseriesFactory): timeseries.sort() return timeseries - def _create_global_attributes(self, timeseries: Stream, channels: List[str]) -> dict: + def _create_global_attributes( + self, timeseries: Stream, channels: List[str] + ) -> dict: """ Create a dictionary of global attributes for the ImagCDF file. @@ -423,45 +440,58 @@ class ImagCDFFactory(TimeseriesFactory): stats = timeseries[0].stats if len(timeseries) > 0 else None # Extract metadata from stats or fallback to defaults - observatory_name = getattr(stats, 'station_name', None) or self.observatory or "" - station = getattr(stats, 'station', None) or "" - institution = getattr(stats, 'agency_name', None) or "" - latitude = getattr(stats, 'geodetic_latitude', None) or 0.0 - longitude = getattr(stats, 'geodetic_longitude', None) or 0.0 - elevation = getattr(stats, 'elevation', None) or 99_999.0 - conditions_of_use = getattr(stats, 'conditions_of_use', None) or "" - vector_orientation = getattr(stats, 'sensor_orientation', None) or "" - data_interval_type = getattr(stats, 'data_interval_type', None) or self.interval - data_type = getattr(stats, 'data_type', None) or "variation" - sensor_sampling_rate = getattr(stats, 'sensor_sampling_rate', None) or 0.0 - comments = getattr(stats, 'filter_comments', None) or [''] - declination_base = getattr(stats, 'declination_base', None) or 0.0 + observatory_name = ( + getattr(stats, "station_name", None) or self.observatory or "" + ) + station = getattr(stats, "station", None) or "" + institution = getattr(stats, "agency_name", None) or "" + latitude = getattr(stats, 
"geodetic_latitude", None) or 0.0 + longitude = getattr(stats, "geodetic_longitude", None) or 0.0 + elevation = getattr(stats, "elevation", None) or 99_999.0 + conditions_of_use = getattr(stats, "conditions_of_use", None) or "" + vector_orientation = getattr(stats, "sensor_orientation", None) or "" + data_interval_type = getattr(stats, "data_interval_type", None) or self.interval + data_type = getattr(stats, "data_type", None) or "variation" + sensor_sampling_rate = getattr(stats, "sensor_sampling_rate", None) or 0.0 + comments = getattr(stats, "filter_comments", None) or [""] + declination_base = getattr(stats, "declination_base", None) or 0.0 publication_level = IMCDFPublicationLevel(data_type=self.type).to_string() global_attrs = { - 'FormatDescription': {0: 'INTERMAGNET CDF Format'}, - 'FormatVersion': {0: '1.2'}, - 'Title': {0: 'Geomagnetic time series data'}, - 'IagaCode': {0: station}, - 'ElementsRecorded': {0: ''.join(channels)}, - 'PublicationLevel': {0: publication_level}, - 'PublicationDate': {0: [cdflib.cdfepoch.timestamp_to_tt2000(datetime.timestamp(datetime.now(timezone.utc))), "cdf_time_tt2000"]}, - 'ObservatoryName': {0: observatory_name}, - 'Latitude': {0: np.array([latitude], dtype=np.float64)}, - 'Longitude': {0: np.array([longitude], dtype=np.float64)}, - 'Elevation': {0: np.array([elevation], dtype=np.float64)}, - 'Institution': {0: institution}, - 'VectorSensOrient': {0: vector_orientation}, #remove F - because its a calculation, not an element? 
- 'StandardLevel': {0: 'None'}, # Set to 'None' + "FormatDescription": {0: "INTERMAGNET CDF Format"}, + "FormatVersion": {0: "1.2"}, + "Title": {0: "Geomagnetic time series data"}, + "IagaCode": {0: station}, + "ElementsRecorded": {0: "".join(channels)}, + "PublicationLevel": {0: publication_level}, + "PublicationDate": { + 0: [ + cdflib.cdfepoch.timestamp_to_tt2000( + datetime.timestamp(datetime.now(timezone.utc)) + ), + "cdf_time_tt2000", + ] + }, + "ObservatoryName": {0: observatory_name}, + "Latitude": {0: np.array([latitude], dtype=np.float64)}, + "Longitude": {0: np.array([longitude], dtype=np.float64)}, + "Elevation": {0: np.array([elevation], dtype=np.float64)}, + "Institution": {0: institution}, + "VectorSensOrient": { + 0: vector_orientation + }, # remove F - because its a calculation, not an element? + "StandardLevel": {0: "None"}, # Set to 'None' # Temporarily Omit 'StandardName', 'StandardVersion', 'PartialStandDesc' - 'Source': {0: 'institute'}, # "institute" - if the named institution provided the data, “INTERMAGNET†- if the data file has been created by INTERMAGNET from another data source, “WDC†- if the World Data Centre has created the file from another data source - 'TermsOfUse': {0: conditions_of_use}, + "Source": { + 0: "institute" + }, # "institute" - if the named institution provided the data, “INTERMAGNET†- if the data file has been created by INTERMAGNET from another data source, “WDC†- if the World Data Centre has created the file from another data source + "TermsOfUse": {0: conditions_of_use}, # 'UniqueIdentifier': {0: ''}, # 'ParentIdentifiers': {0: ''}, - # 'ReferenceLinks': {0: ''}, #links to /ws, plots, USGS.gov - 'SensorSamplingRate': {0: sensor_sampling_rate}, #Optional - 'DataType': {0: data_type},#Optional - 'Comments': {0: comments}, #Optional - 'DeclinationBase': {0: declination_base}, #Optional + # 'ReferenceLinks': {0: ''}, #links to /ws, plots, USGS.gov + "SensorSamplingRate": {0: sensor_sampling_rate}, # Optional + 
"DataType": {0: data_type}, # Optional + "Comments": {0: comments}, # Optional + "DeclinationBase": {0: declination_base}, # Optional } return global_attrs @@ -474,24 +504,32 @@ class ImagCDFFactory(TimeseriesFactory): for trace in timeseries: channel = trace.stats.channel times = [ - (trace.stats.starttime + trace.stats.delta * i).datetime.replace(tzinfo=timezone.utc) + (trace.stats.starttime + trace.stats.delta * i).datetime.replace( + tzinfo=timezone.utc + ) for i in range(trace.stats.npts) ] # Convert timestamps to TT2000 format required by CDF - tt2000_times = cdflib.cdfepoch.timestamp_to_tt2000([time.timestamp() for time in times]) + tt2000_times = cdflib.cdfepoch.timestamp_to_tt2000( + [time.timestamp() for time in times] + ) if channel in self._get_vector_elements(): if vector_times is None: vector_times = tt2000_times else: if not np.array_equal(vector_times, tt2000_times): - raise ValueError("Time stamps for vector channels are not the same.") + raise ValueError( + "Time stamps for vector channels are not the same." + ) elif channel in self._get_scalar_elements(): if scalar_times is None: scalar_times = tt2000_times else: if not np.array_equal(scalar_times, tt2000_times): - raise ValueError("Time stamps for scalar channels are not the same.") + raise ValueError( + "Time stamps for scalar channels are not the same." + ) elif channel in TEMPERATURE_ELEMENTS_ID: ts_key = f"Temperature{temperature_index}Times" if ts_key not in temperature_times: @@ -502,21 +540,22 @@ class ImagCDFFactory(TimeseriesFactory): time_vars = {} if vector_times is not None: - time_vars['GeomagneticVectorTimes'] = vector_times + time_vars["GeomagneticVectorTimes"] = vector_times if scalar_times is not None: - time_vars['GeomagneticScalarTimes'] = scalar_times + time_vars["GeomagneticScalarTimes"] = scalar_times if temperature_times: time_vars.update(temperature_times) - + last_times = [] - self.isUniqueTimes = len(time_vars) == 1 #true if only one set of times, else default to false. 
+ self.isUniqueTimes = ( + len(time_vars) == 1 + ) # true if only one set of times, else default to false. for index, times in enumerate(time_vars.values()): if index > 0: self.isUniqueTimes = not np.array_equal(last_times, times) last_times = times - - return time_vars if self.isUniqueTimes else {"DataTimes": last_times} + return time_vars if self.isUniqueTimes else {"DataTimes": last_times} def _create_var_spec( self, @@ -549,63 +588,69 @@ class ImagCDFFactory(TimeseriesFactory): - CDF User's Guide: Variable Specification """ var_spec = { - 'Variable': var_name, - 'Data_Type': data_type, - 'Num_Elements': num_elements, - 'Rec_Vary': True, - 'Var_Type': var_type, - 'Dim_Sizes': dim_sizes, - 'Sparse': 'no_sparse' if not sparse else 'pad_sparse', - 'Compress': compress, - 'Pad': pad, + "Variable": var_name, + "Data_Type": data_type, + "Num_Elements": num_elements, + "Rec_Vary": True, + "Var_Type": var_type, + "Dim_Sizes": dim_sizes, + "Sparse": "no_sparse" if not sparse else "pad_sparse", + "Compress": compress, + "Pad": pad, } return var_spec - def _create_var_attrs(self, trace: Trace, temperature_index: Optional[int] = None, isUniqueTimes: Optional[bool] = True) -> dict: + def _create_var_attrs( + self, + trace: Trace, + temperature_index: Optional[int] = None, + isUniqueTimes: Optional[bool] = True, + ) -> dict: channel = trace.stats.channel.upper() - fieldnam = f"Geomagnetic Field Element {channel}" # “Geomagnetic Field Element †+ the element code or “Temperature †+ the name of the location where the temperature was recorded. - units = '' # Must be one of “nTâ€, “Degrees of arc†or “Celsius†- if channel == 'D': - units = 'Degrees of arc' - validmin = -360.0 - validmax = 360.0 # A full circle representation - elif channel == 'I': - units = 'Degrees of arc' - validmin = -90.0 - validmax = 90.0 #The magnetic field vector can point straight down (+90°), horizontal (0°), or straight up (-90°). 
+ fieldnam = f"Geomagnetic Field Element {channel}" # “Geomagnetic Field Element †+ the element code or “Temperature †+ the name of the location where the temperature was recorded. + units = "" # Must be one of “nTâ€, “Degrees of arc†or “Celsius†+ if channel == "D": + units = "Degrees of arc" + validmin = -360.0 + validmax = 360.0 # A full circle representation + elif channel == "I": + units = "Degrees of arc" + validmin = -90.0 + validmax = 90.0 # The magnetic field vector can point straight down (+90°), horizontal (0°), or straight up (-90°). elif channel in TEMPERATURE_ELEMENTS_ID: - units = 'Celsius' + units = "Celsius" fieldnam = f"Temperature {temperature_index} {trace.stats.location}" - validmin = -273.15 #absolute zero + validmin = -273.15 # absolute zero validmax = 79_999 - elif channel in ['F','S']: - units = 'nT' - validmin = 0.0 # negative magnetic field intestity not physically meaningful. + elif channel in ["F", "S"]: + units = "nT" + validmin = ( + 0.0 # negative magnetic field intestity not physically meaningful. 
+ ) validmax = 79_999.0 - elif channel in ['X', 'Y', 'Z', 'H', 'E', 'V', 'G']: - units = 'nT' + elif channel in ["X", "Y", "Z", "H", "E", "V", "G"]: + units = "nT" validmin = -79_999.0 validmax = 79_999.0 # Determine DEPEND_0 based on channel type if channel in self._get_vector_elements(): - depend_0 = 'GeomagneticVectorTimes' + depend_0 = "GeomagneticVectorTimes" elif channel in self._get_scalar_elements(): - depend_0 = 'GeomagneticScalarTimes' + depend_0 = "GeomagneticScalarTimes" elif channel in TEMPERATURE_ELEMENTS_ID: depend_0 = f"Temperature{temperature_index}Times" - var_attrs = { - 'FIELDNAM': fieldnam, - 'UNITS': units, - 'FILLVAL': 99999.0, - 'VALIDMIN': validmin, - 'VALIDMAX': validmax, - 'DEPEND_0': depend_0 if isUniqueTimes else "DataTimes", - 'DISPLAY_TYPE': 'time_series', - 'LABLAXIS': channel, - 'DATA_INTERVAL_TYPE': trace.stats.data_interval_type + "FIELDNAM": fieldnam, + "UNITS": units, + "FILLVAL": 99999.0, + "VALIDMIN": validmin, + "VALIDMAX": validmax, + "DEPEND_0": depend_0 if isUniqueTimes else "DataTimes", + "DISPLAY_TYPE": "time_series", + "LABLAXIS": channel, + "DATA_INTERVAL_TYPE": trace.stats.data_interval_type, } return var_attrs @@ -619,9 +664,9 @@ class ImagCDFFactory(TimeseriesFactory): - ImagCDF Technical Documentation: ImagCDF Data """ # var_attrs = { - # 'UNITS': 'TT2000', - # 'DISPLAY_TYPE': 'time_series', - # 'LABLAXIS': 'Time', + # 'UNITS': 'TT2000', + # 'DISPLAY_TYPE': 'time_series', + # 'LABLAXIS': 'Time', # } # return var_attrs return {} @@ -666,26 +711,28 @@ class ImagCDFFactory(TimeseriesFactory): # Extract global attributes global_attrs = cdf.globalattsget() - + # Map global attributes to Stream-level metadata - observatory = global_attrs.get('IagaCode', [''])[0] - station_name = global_attrs.get('ObservatoryName', [''])[0] - institution = global_attrs.get('Institution', [''])[0] - latitude = global_attrs.get('Latitude', [0.0])[0] - longitude = global_attrs.get('Longitude', [0.0])[0] - elevation = 
global_attrs.get('Elevation', [99_999.0])[0] #default to 99_999 per technical documents. - sensor_sampling_rate = global_attrs.get('SensorSamplingRate', [0.0])[0] - sensor_orientation = global_attrs.get('VectorSensOrient', [''])[0] - data_type = global_attrs.get('DataType', ['variation'])[0] - publication_level = global_attrs.get('PublicationLevel', ['1'])[0] - comments = global_attrs.get('Comments', ['']) - terms_of_use = global_attrs.get('TermsOfUse', [''])[0] - declination_base = global_attrs.get('DeclinationBase', [0.0])[0] + observatory = global_attrs.get("IagaCode", [""])[0] + station_name = global_attrs.get("ObservatoryName", [""])[0] + institution = global_attrs.get("Institution", [""])[0] + latitude = global_attrs.get("Latitude", [0.0])[0] + longitude = global_attrs.get("Longitude", [0.0])[0] + elevation = global_attrs.get("Elevation", [99_999.0])[ + 0 + ] # default to 99_999 per technical documents. + sensor_sampling_rate = global_attrs.get("SensorSamplingRate", [0.0])[0] + sensor_orientation = global_attrs.get("VectorSensOrient", [""])[0] + data_type = global_attrs.get("DataType", ["variation"])[0] + publication_level = global_attrs.get("PublicationLevel", ["1"])[0] + comments = global_attrs.get("Comments", [""]) + terms_of_use = global_attrs.get("TermsOfUse", [""])[0] + declination_base = global_attrs.get("DeclinationBase", [0.0])[0] # Identify all time variables time_vars = {} for var in cdf.cdf_info().zVariables: - if var.lower().endswith('times'): + if var.lower().endswith("times"): time_data = cdf.varget(var) unix_times = cdflib.cdfepoch.unixtime(time_data) utc_times = [UTCDateTime(t) for t in unix_times] @@ -694,14 +741,14 @@ class ImagCDFFactory(TimeseriesFactory): # Read data variables and associate them with time variables for var in cdf.cdf_info().zVariables: # Skip time variables - if var.lower().endswith('times'): + if var.lower().endswith("times"): continue data = cdf.varget(var) attrs = cdf.varattsget(var) # Determine DEPEND_0 (the time 
variable name) and validate - ts_name = attrs.get('DEPEND_0') + ts_name = attrs.get("DEPEND_0") if not ts_name: # If no DEPEND_0, skip this variable as we cannot map times continue @@ -728,15 +775,15 @@ class ImagCDFFactory(TimeseriesFactory): # Determine delta (sample interval) if len(times) > 1: # delta as a float of seconds - delta = (times[1].timestamp - times[0].timestamp) + delta = times[1].timestamp - times[0].timestamp else: # if only one sample, use default based on interval # convert minute, second, etc. to numeric delta - if self.interval == 'minute': + if self.interval == "minute": delta = 60.0 - elif self.interval == 'second': + elif self.interval == "second": delta = 1.0 - elif self.interval == 'hour': + elif self.interval == "hour": delta = 3600.0 else: # fallback, set delta to 60 @@ -751,34 +798,34 @@ class ImagCDFFactory(TimeseriesFactory): elif var.startswith("Temperature"): # Temperature variables may not map directly to a geomagnetic channel # but to temperature sensors. 
We can just use the label from LABLAXIS if needed - channel = attrs.get('LABLAXIS', var) + channel = attrs.get("LABLAXIS", var) else: # fallback if naming doesn't match expected patterns channel = var - time_attrs =cdf.varattsget(var) - data_interval = time_attrs.get('DATA_INTERVAL_TYPE', ['']) + time_attrs = cdf.varattsget(var) + data_interval = time_attrs.get("DATA_INTERVAL_TYPE", [""]) # Create a trace trace = Trace( data=data, header={ - 'station': observatory, - 'channel': channel, - 'starttime': times[0], - 'delta': delta, - 'geodetic_latitude': latitude, - 'geodetic_longitude': longitude, - 'elevation': elevation, - 'sensor_orientation': "".join(sensor_orientation), - 'data_type': data_type, - 'station_name': station_name, - 'agency_name': institution, - 'conditions_of_use': terms_of_use, - 'sensor_sampling_rate': sensor_sampling_rate, - 'data_interval_type': data_interval, - 'declination_base': declination_base, - 'filter_comments': comments - } + "station": observatory, + "channel": channel, + "starttime": times[0], + "delta": delta, + "geodetic_latitude": latitude, + "geodetic_longitude": longitude, + "elevation": elevation, + "sensor_orientation": "".join(sensor_orientation), + "data_type": data_type, + "station_name": station_name, + "agency_name": institution, + "conditions_of_use": terms_of_use, + "sensor_sampling_rate": sensor_sampling_rate, + "data_interval_type": data_interval, + "declination_base": declination_base, + "filter_comments": comments, + }, ) stream += trace @@ -797,8 +844,8 @@ class ImagCDFFactory(TimeseriesFactory): This method constructs the filename based on the ImagCDF naming conventions, which include the observatory code, date-time formatted - according to the data interval, and the publication level. - + according to the data interval, and the publication level. 
+ [iaga-code]_[date-time]_[publication-level].cdf Parameters: @@ -831,7 +878,9 @@ class ImagCDFFactory(TimeseriesFactory): elif interval == "second": date_format = date.strftime("%Y%m%d_%H%M%S") else: - raise ValueError(f"Unsupported interval: {interval}") #tenhertz currently not supported + raise ValueError( + f"Unsupported interval: {interval}" + ) # tenhertz currently not supported # Filename following ImagCDF convention, see reference: https://tech-man.intermagnet.org/latest/appendices/dataformats.html#imagcdf-file-names filename = f"{observatory.lower()}_{date_format}_{publication_level}.cdf" @@ -859,11 +908,13 @@ class ImagCDFFactory(TimeseriesFactory): } # Attempt to use the template provided in urlTemplate - if "{" in self.urlTemplate and "}" in self.urlTemplate: + if "{" in self.urlTemplate and "}" in self.urlTemplate: try: return self.urlTemplate.format(**params) except KeyError as e: - raise TimeseriesFactoryException(f"Invalid placeholder in urlTemplate: {e}") + raise TimeseriesFactoryException( + f"Invalid placeholder in urlTemplate: {e}" + ) # If the urlTemplate doesn't support placeholders, assume 'file://' scheme if self.urlTemplate.startswith("file://"): @@ -878,9 +929,7 @@ class ImagCDFFactory(TimeseriesFactory): ) def _get_vector_elements(self): - return {'X', 'Y', 'Z', 'H', 'D', 'E', 'V', 'I', 'F'} - - def _get_scalar_elements(self): - return {'S', 'G'} - + return {"X", "Y", "Z", "H", "D", "E", "V", "I", "F"} + def _get_scalar_elements(self): + return {"S", "G"} -- GitLab From 8a4949e890dfc2b561dfbbb927d5192a26a75a3c Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Mon, 9 Dec 2024 13:45:42 -0800 Subject: [PATCH 09/30] lint + poetry clean up --- geomagio/api/ws/Element.py | 4 ++- poetry.lock | 69 +++++++++++++------------------------- 2 files changed, 27 insertions(+), 46 deletions(-) diff --git a/geomagio/api/ws/Element.py b/geomagio/api/ws/Element.py index 25b876c5..e5ab3e62 100644 --- 
a/geomagio/api/ws/Element.py +++ b/geomagio/api/ws/Element.py @@ -47,4 +47,6 @@ ELEMENTS = [ ELEMENT_INDEX = {e.id: e for e in ELEMENTS} -TEMPERATURE_ELEMENTS_ID = [element.id for element in ELEMENTS if "Temperature" in element.name] \ No newline at end of file +TEMPERATURE_ELEMENTS_ID = [ + element.id for element in ELEMENTS if "Temperature" in element.name +] diff --git a/poetry.lock b/poetry.lock index 90b76110..56e23f59 100644 --- a/poetry.lock +++ b/poetry.lock @@ -38,13 +38,13 @@ docs = ["sphinx (==7.2.6)", "sphinx-mdinclude (==0.5.3)"] [[package]] name = "alembic" -version = "1.14.0" +version = "1.13.3" description = "A database migration tool for SQLAlchemy." optional = false python-versions = ">=3.8" files = [ - {file = "alembic-1.14.0-py3-none-any.whl", hash = "sha256:99bd884ca390466db5e27ffccff1d179ec5c05c965cfefc0607e69f9e411cb25"}, - {file = "alembic-1.14.0.tar.gz", hash = "sha256:b00892b53b3642d0b8dbedba234dbf1924b69be83a9a769d5a624b01094e304b"}, + {file = "alembic-1.13.3-py3-none-any.whl", hash = "sha256:908e905976d15235fae59c9ac42c4c5b75cfcefe3d27c0fbf7ae15a37715d80e"}, + {file = "alembic-1.13.3.tar.gz", hash = "sha256:203503117415561e203aa14541740643a611f641517f0209fcae63e9fa09f1a2"}, ] [package.dependencies] @@ -153,25 +153,6 @@ d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] -[[package]] -name = "cdflib" -version = "1.3.2" -description = "A python CDF reader toolkit" -optional = false -python-versions = ">=3.8" -files = [ - {file = "cdflib-1.3.2-py3-none-any.whl", hash = "sha256:49af97acc328c586ac5b7c27fd8e67bccf24af82c5bd8c37d8cfe048a0c1752a"}, - {file = "cdflib-1.3.2.tar.gz", hash = "sha256:97f27ac629e4c0ac1367eb8f4edd7a1d184190272ab98a6401e999f3a2e05687"}, -] - -[package.dependencies] -numpy = ">=1.21" - -[package.extras] -dev = ["ipython", "pre-commit"] -docs = ["astropy", "netcdf4", "sphinx", "sphinx-automodapi", "sphinx-copybutton", 
"sphinx-rtd-theme", "xarray"] -tests = ["astropy", "h5netcdf", "hypothesis", "netcdf4", "pytest (>=3.9)", "pytest-cov", "pytest-remotedata", "xarray"] - [[package]] name = "certifi" version = "2024.12.14" @@ -711,13 +692,13 @@ files = [ [[package]] name = "dparse" -version = "0.6.4" +version = "0.6.3" description = "A parser for Python dependency files" optional = false -python-versions = ">=3.7" +python-versions = ">=3.6" files = [ - {file = "dparse-0.6.4-py3-none-any.whl", hash = "sha256:fbab4d50d54d0e739fbb4dedfc3d92771003a5b9aa8545ca7a7045e3b174af57"}, - {file = "dparse-0.6.4.tar.gz", hash = "sha256:90b29c39e3edc36c6284c82c4132648eaf28a01863eb3c231c2512196132201a"}, + {file = "dparse-0.6.3-py3-none-any.whl", hash = "sha256:0d8fe18714056ca632d98b24fbfc4e9791d4e47065285ab486182288813a5318"}, + {file = "dparse-0.6.3.tar.gz", hash = "sha256:27bb8b4bcaefec3997697ba3f6e06b2447200ba273c0b085c3d012a04571b528"}, ] [package.dependencies] @@ -725,20 +706,18 @@ packaging = "*" tomli = {version = "*", markers = "python_version < \"3.11\""} [package.extras] -all = ["pipenv", "poetry", "pyyaml"] conda = ["pyyaml"] -pipenv = ["pipenv"] -poetry = ["poetry"] +pipenv = ["pipenv (<=2022.12.19)"] [[package]] name = "et-xmlfile" -version = "2.0.0" +version = "1.1.0" description = "An implementation of lxml.xmlfile for the standard library" optional = false -python-versions = ">=3.8" +python-versions = ">=3.6" files = [ - {file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"}, - {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, + {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, + {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, ] [[package]] @@ -1795,13 +1774,13 @@ typing-extensions = ">=3.7.4" 
[[package]] name = "packaging" -version = "24.2" +version = "24.1" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, - {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, + {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, + {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, ] [[package]] @@ -2499,23 +2478,23 @@ test = ["asv", "gmpy2", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeo [[package]] name = "setuptools" -version = "75.3.0" +version = "75.2.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-75.3.0-py3-none-any.whl", hash = "sha256:f2504966861356aa38616760c0f66568e535562374995367b4e69c7143cf6bcd"}, - {file = "setuptools-75.3.0.tar.gz", hash = "sha256:fba5dd4d766e97be1b1681d98712680ae8f2f26d7881245f2ce9e40714f1a686"}, + {file = "setuptools-75.2.0-py3-none-any.whl", hash = "sha256:a7fcb66f68b4d9e8e66b42f9876150a3371558f98fa32222ffaa5bced76406f8"}, + {file = "setuptools-75.2.0.tar.gz", hash = "sha256:753bb6ebf1f465a1912e19ed1d41f403a79173a9acf66a42e7e6aec45c3c16ec"}, ] [package.extras] check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.5.2)"] -core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.collections", "jaraco.functools", "jaraco.text (>=3.7)", "more-itertools", "more-itertools (>=8.8)", "packaging", "packaging (>=24)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.collections", "jaraco.functools", 
"jaraco.text (>=3.7)", "more-itertools", "more-itertools (>=8.8)", "packaging", "packaging (>=24)", "platformdirs (>=2.6.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] enabler = ["pytest-enabler (>=2.2)"] -test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test (>=5.5)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] -type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.12.*)", "pytest-mypy"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.11.*)", "pytest-mypy"] [[package]] name = "six" @@ -2645,13 +2624,13 @@ SQLAlchemy = ">=0.9.0" [[package]] name = "starlette" -version = "0.41.3" +version = "0.41.0" description = "The little ASGI library that shines." 
optional = false python-versions = ">=3.8" files = [ - {file = "starlette-0.41.3-py3-none-any.whl", hash = "sha256:44cedb2b7c77a9de33a8b74b2b90e9f50d11fcf25d8270ea525ad71a25374ff7"}, - {file = "starlette-0.41.3.tar.gz", hash = "sha256:0e4ab3d16522a255be6b28260b938eae2482f98ce5cc934cb08dce8dc3ba5835"}, + {file = "starlette-0.41.0-py3-none-any.whl", hash = "sha256:a0193a3c413ebc9c78bff1c3546a45bb8c8bcb4a84cae8747d650a65bd37210a"}, + {file = "starlette-0.41.0.tar.gz", hash = "sha256:39cbd8768b107d68bfe1ff1672b38a2c38b49777de46d2a592841d58e3bf7c2a"}, ] [package.dependencies] -- GitLab From f2e1ec26e0946a9eba4ed660ce7da82923927764 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Mon, 9 Dec 2024 13:46:21 -0800 Subject: [PATCH 10/30] lint + poetry clean up --- poetry.lock | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/poetry.lock b/poetry.lock index 56e23f59..a4aac078 100644 --- a/poetry.lock +++ b/poetry.lock @@ -153,6 +153,25 @@ d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] +[[package]] +name = "cdflib" +version = "1.3.2" +description = "A python CDF reader toolkit" +optional = false +python-versions = ">=3.8" +files = [ + {file = "cdflib-1.3.2-py3-none-any.whl", hash = "sha256:49af97acc328c586ac5b7c27fd8e67bccf24af82c5bd8c37d8cfe048a0c1752a"}, + {file = "cdflib-1.3.2.tar.gz", hash = "sha256:97f27ac629e4c0ac1367eb8f4edd7a1d184190272ab98a6401e999f3a2e05687"}, +] + +[package.dependencies] +numpy = ">=1.21" + +[package.extras] +dev = ["ipython", "pre-commit"] +docs = ["astropy", "netcdf4", "sphinx", "sphinx-automodapi", "sphinx-copybutton", "sphinx-rtd-theme", "xarray"] +tests = ["astropy", "h5netcdf", "hypothesis", "netcdf4", "pytest (>=3.9)", "pytest-cov", "pytest-remotedata", "xarray"] + [[package]] name = "certifi" version = "2024.12.14" -- GitLab From d6a2066dbb30f65e2f64b325e4694481de0c7a76 Mon Sep 17 00:00:00 
2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Wed, 11 Dec 2024 10:06:53 -0800 Subject: [PATCH 11/30] parse_string method implemented for streaming cdf file via input-file command --- geomagio/Controller.py | 6 ++--- geomagio/ImagCDFFactory.py | 51 ++++++++++++++++++++++++++++++++------ 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/geomagio/Controller.py b/geomagio/Controller.py index 96ebb28c..0985597c 100644 --- a/geomagio/Controller.py +++ b/geomagio/Controller.py @@ -503,7 +503,7 @@ def get_input_factory(args): input_type = args.input # stream/url arguments if args.input_file is not None: - if input_type in ["netcdf", "miniseed"]: + if input_type in ["netcdf", "miniseed", "imagcdf"]: input_stream = open(args.input_file, "rb") else: input_stream = open(args.input_file, "r") @@ -547,8 +547,6 @@ def get_input_factory(args): locationCode=args.locationcode, **input_factory_args, ) - elif input_type == "imagcdf": - input_factory = ImagCDFFactory(**input_factory_args) else: # stream compatible factories if input_type == "iaga2002": @@ -573,6 +571,8 @@ def get_input_factory(args): input_factory = xml.XMLFactory(**input_factory_args) elif input_type == "covjson": input_factory = covjson.CovJSONFactory(**input_factory_args) + elif input_type == "imagcdf": + input_factory = ImagCDFFactory(**input_factory_args) # wrap stream if input_stream is not None: input_factory = StreamTimeseriesFactory( diff --git a/geomagio/ImagCDFFactory.py b/geomagio/ImagCDFFactory.py index ea947903..af603201 100644 --- a/geomagio/ImagCDFFactory.py +++ b/geomagio/ImagCDFFactory.py @@ -164,7 +164,7 @@ class ImagCDFFactory(TimeseriesFactory): "Rec_Vary": True, "Var_Type": "zVariable", "Dim_Sizes": [], - "Sparse": "no_sparse", + "Sparse": "no_sparse", # no_sparse because there should not be time gaps. 
"Compress": 9, "Pad": None, } @@ -424,6 +424,47 @@ class ImagCDFFactory(TimeseriesFactory): timeseries.sort() return timeseries + def parse_string(self, data: str, **kwargs): + """ + Parse ImagCDF binary data into an ObsPy Stream. + + This method writes the provided binary data to a temporary file, + reads the file using `cdflib`, and converts the data into an ObsPy + Stream. + + Parameters + ---------- + data : bytes + Binary data containing ImagCDF content. + + Returns + ------- + Stream + An ObsPy Stream object with the parsed geomagnetic time series data. + + Raises + ------ + TimeseriesFactoryException + If an error occurs while parsing the ImagCDF data. + """ + # Create a temporary file to store the CDF data + with tempfile.NamedTemporaryFile(delete=False, suffix=".cdf") as tmp_file: + tmp_file_name = tmp_file.name + tmp_file.write(data) + + try: + # Read the CDF from the temporary file + cdf = CDFReader(tmp_file_name) + stream = self._read_cdf(cdf) + # no cdf.close() method required + except Exception as e: + raise TimeseriesFactoryException(f"Error parsing ImagCDF data: {e}") + finally: + # Clean up the temporary file + os.remove(tmp_file_name) + + return stream + def _create_global_attributes( self, timeseries: Stream, channels: List[str] ) -> dict: @@ -753,18 +794,12 @@ class ImagCDFFactory(TimeseriesFactory): # If no DEPEND_0, skip this variable as we cannot map times continue - # If we used a DataTimes fallback or similar, ensure we handle it case-insensitively - # and also confirm that time_vars keys are checked properly. - # The ImagCDF can have DataTimes, GeomagneticVectorTimes, etc. - # So try exact match first, if not found, attempt case-insensitive. + # The ImagCDF can have DataTimes, GeomagneticVectorTimes, GeomagneticScalarTimes, TemperatureNTimes (for N > 0), etc. 
matched_time_key = None for tkey in time_vars.keys(): if tkey == ts_name: matched_time_key = tkey break - if tkey.lower() == ts_name.lower(): - matched_time_key = tkey - break if matched_time_key not in time_vars: # If we cannot find the matching time variable, skip this variable -- GitLab From 72bbdf58a81acbb3635366d96602b67fd274f1fa Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Thu, 12 Dec 2024 12:31:37 -0800 Subject: [PATCH 12/30] tweaks to accept output_args. temperature-type elements as a nonstandard geomagnetic field element instead of as Temperature cdf variable --- geomagio/Controller.py | 6 +- geomagio/{ => imagcdf}/ImagCDFFactory.py | 97 ++++++++++++++---------- 2 files changed, 61 insertions(+), 42 deletions(-) rename geomagio/{ => imagcdf}/ImagCDFFactory.py (93%) diff --git a/geomagio/Controller.py b/geomagio/Controller.py index 0985597c..4e212579 100644 --- a/geomagio/Controller.py +++ b/geomagio/Controller.py @@ -7,7 +7,7 @@ from typing import List, Optional, Tuple, Union from obspy.core import Stream, UTCDateTime -from geomagio.ImagCDFFactory import ImagCDFFactory +from geomagio.imagcdf.ImagCDFFactory import ImagCDFFactory from .algorithm import Algorithm, algorithms, AlgorithmException, FilterAlgorithm from .DerivedTimeseriesFactory import DerivedTimeseriesFactory @@ -28,7 +28,7 @@ from . import vbf from . import xml from . import covjson from . 
import netcdf -from geomagio.ImagCDFFactory import ImagCDFFactory +from geomagio.imagcdf.ImagCDFFactory import ImagCDFFactory class Controller(object): @@ -635,7 +635,7 @@ def get_output_factory(args): elif output_type == "plot": output_factory = PlotTimeseriesFactory() elif output_type == "imagcdf": - output_factory = ImagCDFFactory() + output_factory = ImagCDFFactory(**output_factory_args) else: # stream compatible factories if output_type == "binlog": diff --git a/geomagio/ImagCDFFactory.py b/geomagio/imagcdf/ImagCDFFactory.py similarity index 93% rename from geomagio/ImagCDFFactory.py rename to geomagio/imagcdf/ImagCDFFactory.py index af603201..a7838f14 100644 --- a/geomagio/ImagCDFFactory.py +++ b/geomagio/imagcdf/ImagCDFFactory.py @@ -25,10 +25,10 @@ from sqlalchemy import true from geomagio.TimeseriesFactory import TimeseriesFactory from geomagio.api.ws.Element import TEMPERATURE_ELEMENTS_ID -from .geomag_types import DataInterval, DataType -from .TimeseriesFactoryException import TimeseriesFactoryException -from . import TimeseriesUtility -from . import Util +from ..geomag_types import DataInterval, DataType +from ..TimeseriesFactoryException import TimeseriesFactoryException +from .. import TimeseriesUtility +from .. import Util import cdflib from cdflib.cdfwrite import CDF as CDFWriter @@ -93,6 +93,7 @@ class ImagCDFFactory(TimeseriesFactory): """ isUniqueTimes = True # used to determine depend_0 and CDF Time Variable Name + NONSTANDARD_ELEMENTS = TEMPERATURE_ELEMENTS_ID def __init__( self, @@ -123,13 +124,6 @@ class ImagCDFFactory(TimeseriesFactory): urlInterval=urlInterval, ) - def parse_string(self, data: str, **kwargs): - """Parse ImagCDF formatted string data into a Stream. - - Note: Parsing from strings is not implemented in this factory. 
- """ - raise NotImplementedError('"parse_string" not implemented') - def write_file(self, fh, timeseries: Stream, channels: List[str]): # Create a temporary file to write the CDF data with tempfile.NamedTemporaryFile(delete=False, suffix=".cdf") as tmp_file: @@ -164,7 +158,7 @@ class ImagCDFFactory(TimeseriesFactory): "Rec_Vary": True, "Var_Type": "zVariable", "Dim_Sizes": [], - "Sparse": "no_sparse", # no_sparse because there should not be time gaps. + "Sparse": "no_sparse", # no_sparse because there should not be time gaps. (Time stamps must represent a regular time series with no missing values in the series) "Compress": 9, "Pad": None, } @@ -178,11 +172,10 @@ class ImagCDFFactory(TimeseriesFactory): temperature_index = 0 for trace in timeseries: channel = trace.stats.channel - if channel in TEMPERATURE_ELEMENTS_ID: - temperature_index += 1 # MUST INCREMENT INDEX BEFORE USING - var_name = f"Temperature{temperature_index}" - else: - var_name = f"GeomagneticField{channel}" + var_name = f"GeomagneticField{channel}" + # if channel in REAL_TEMPERATURE: + # temperature_index += 1 # MUST INCREMENT INDEX BEFORE USING + # var_name = f"Temperature{temperature_index}" data_type = self._get_cdf_data_type(trace) num_elements = 1 if data_type in [ @@ -258,8 +251,8 @@ class ImagCDFFactory(TimeseriesFactory): observatory = stats.station starttime = starttime or stats.starttime endtime = endtime or stats.endtime - # Split data into intervals if necessary + urlIntervals = Util.get_intervals( starttime=starttime, endtime=endtime, size=self.urlInterval ) @@ -295,7 +288,6 @@ class ImagCDFFactory(TimeseriesFactory): starttime=interval_start, endtime=interval_end, ) - # Check if the file already exists to merge data if os.path.isfile(url_file): try: @@ -325,13 +317,13 @@ class ImagCDFFactory(TimeseriesFactory): ) # Proceed with new data - # Pad the data with NaNs to ensure it fits the interval + # Pad the data to ensure it fits the interval url_data.trim( starttime=interval_start, 
endtime=interval_end, nearest_sample=False, pad=True, - fill_value=np.nan, + fill_value=99_999, # FILLVAL ) # Write the data to the CDF file @@ -539,6 +531,7 @@ class ImagCDFFactory(TimeseriesFactory): def _create_time_stamp_variables(self, timeseries: Stream) -> dict: vector_times = None scalar_times = None + nonstandard_times = None temperature_times = {} temperature_index = 1 @@ -571,14 +564,21 @@ class ImagCDFFactory(TimeseriesFactory): raise ValueError( "Time stamps for scalar channels are not the same." ) - elif channel in TEMPERATURE_ELEMENTS_ID: - ts_key = f"Temperature{temperature_index}Times" - if ts_key not in temperature_times: - temperature_times[ts_key] = tt2000_times - temperature_index += 1 + # elif channel in REAL_TEMPERATURES: + # ts_key = f"Temperature{temperature_index}Times" + # if ts_key not in temperature_times: + # temperature_times[ts_key] = tt2000_times + # temperature_index += 1 + # else: + # temperature_times[ts_key] = tt2000_times + elif channel in self.NONSTANDARD_ELEMENTS: + if scalar_times is None: + nonstandard_times = tt2000_times else: - temperature_times[ts_key] = tt2000_times - + if not np.array_equal(scalar_times, tt2000_times): + raise ValueError( + "Time stamps for nonstandard channels are not the same." + ) time_vars = {} if vector_times is not None: time_vars["GeomagneticVectorTimes"] = vector_times @@ -587,16 +587,28 @@ class ImagCDFFactory(TimeseriesFactory): if temperature_times: time_vars.update(temperature_times) + isNonStandard = nonstandard_times is not None + if isNonStandard: + time_vars["NonstandardTimes"] = nonstandard_times + last_times = [] + self.isUniqueTimes = ( len(time_vars) == 1 - ) # true if only one set of times, else default to false. 
+ ) # (true if is a single time stamp variable in the file) for index, times in enumerate(time_vars.values()): if index > 0: self.isUniqueTimes = not np.array_equal(last_times, times) last_times = times - - return time_vars if self.isUniqueTimes else {"DataTimes": last_times} + if self.isUniqueTimes and isNonStandard and len(time_vars) > 2: + raise ValueError( + f"Time stamps must be the same for all channels when using nonstandard channels: {','.join(self.NONSTANDARD_ELEMENTS)}" + ) + return ( + {"DataTimes": last_times} + if isNonStandard or not self.isUniqueTimes + else time_vars + ) def _create_var_spec( self, @@ -660,9 +672,13 @@ class ImagCDFFactory(TimeseriesFactory): validmax = 90.0 # The magnetic field vector can point straight down (+90°), horizontal (0°), or straight up (-90°). elif channel in TEMPERATURE_ELEMENTS_ID: units = "Celsius" - fieldnam = f"Temperature {temperature_index} {trace.stats.location}" validmin = -273.15 # absolute zero validmax = 79_999 + # elif channel in [REAL_TEMPERATURES]: + # units = "Celsius" + # fieldnam = f"Temperature {temperature_index} {trace.stats.location}" + # validmin = -273.15 # absolute zero + # validmax = 79_999 elif channel in ["F", "S"]: units = "nT" validmin = ( @@ -675,12 +691,14 @@ class ImagCDFFactory(TimeseriesFactory): validmax = 79_999.0 # Determine DEPEND_0 based on channel type - if channel in self._get_vector_elements(): + if not isUniqueTimes: + depend_0 = "DataTimes" + elif channel in self._get_vector_elements(): depend_0 = "GeomagneticVectorTimes" elif channel in self._get_scalar_elements(): depend_0 = "GeomagneticScalarTimes" - elif channel in TEMPERATURE_ELEMENTS_ID: - depend_0 = f"Temperature{temperature_index}Times" + # elif channel in REAL_TEMPERATURES: + # depend_0 = f"Temperature{temperature_index}Times" var_attrs = { "FIELDNAM": fieldnam, @@ -688,10 +706,10 @@ class ImagCDFFactory(TimeseriesFactory): "FILLVAL": 99999.0, "VALIDMIN": validmin, "VALIDMAX": validmax, - "DEPEND_0": depend_0 if 
isUniqueTimes else "DataTimes", + "DEPEND_0": depend_0, "DISPLAY_TYPE": "time_series", "LABLAXIS": channel, - "DATA_INTERVAL_TYPE": trace.stats.data_interval_type, + "DATA_INTERVAL_TYPE": trace.stats.data_interval_type, # optional } return var_attrs @@ -953,10 +971,11 @@ class ImagCDFFactory(TimeseriesFactory): # If the urlTemplate doesn't support placeholders, assume 'file://' scheme if self.urlTemplate.startswith("file://"): - base_path = self.urlTemplate[7:] # Strip "file://" + base_path = self.urlTemplate[7:] if not base_path or base_path == "{obs}_{dt}_{t}.cdf": base_path = os.getcwd() # Default to current working directory - return os.path.join(base_path, filename) + return os.path.join(base_path, filename) + return os.path.join(self.urlTemplate, filename) # Unsupported URL scheme raise TimeseriesFactoryException( -- GitLab From 066e59b4eca583d786acb5d488324d1a6a5e52ce Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Fri, 13 Dec 2024 09:13:41 -0800 Subject: [PATCH 13/30] Nonstand elements implemented. Temperatures un-implemented. no longer extends streamfactory on imagcdf input, parse string removed. basic is valid imagcdf logic. 
--- geomagio/imagcdf/ImagCDFFactory.py | 256 +++++++++++++++++------------ 1 file changed, 148 insertions(+), 108 deletions(-) diff --git a/geomagio/imagcdf/ImagCDFFactory.py b/geomagio/imagcdf/ImagCDFFactory.py index a7838f14..0c63bb70 100644 --- a/geomagio/imagcdf/ImagCDFFactory.py +++ b/geomagio/imagcdf/ImagCDFFactory.py @@ -101,7 +101,7 @@ class ImagCDFFactory(TimeseriesFactory): channels: List[str] = ("H", "D", "Z", "F"), type: DataType = "variation", interval: DataInterval = "minute", - urlTemplate="file://{obs}_{dt}_{t}.cdf", + urlTemplate="file://etc/imagcdf/{obs}_{dt}_{t}.cdf", urlInterval: int = -1, ): """ @@ -134,7 +134,7 @@ class ImagCDFFactory(TimeseriesFactory): cdf_spec = { "Compressed": 9, # Enable compression (0-9) "Majority": CDFWriter.ROW_MAJOR, # Data layout - gets set automatically - "Encoding": CDFWriter.IBMPC_ENCODING, # gets set automatically + "Encoding": CDFWriter.HOST_ENCODING, # gets set automatically "Checksum": True, # Disable checksum for faster writes (optional) "rDim_sizes": [], # Applicable only if using rVariables - CDF protocol recommends only using zVariables. } @@ -259,8 +259,9 @@ class ImagCDFFactory(TimeseriesFactory): for urlInterval in urlIntervals: interval_start = urlInterval["start"] interval_end = urlInterval["end"] - if interval_start != interval_end: - interval_end = interval_end - delta + # Removes last data point ex: if endtime = 02:00:00, this could return 01:59:00 as last data point. + # if interval_start != interval_end: + # interval_end = interval_end - delta url = self._get_url( observatory=observatory, date=interval_start, @@ -290,38 +291,16 @@ class ImagCDFFactory(TimeseriesFactory): ) # Check if the file already exists to merge data if os.path.isfile(url_file): - try: - # Read existing data to merge with new data - existing_cdf = CDFReader(url_file) - existing_stream = self._read_cdf(existing_cdf) - # existing_cdf.close() #no close method? 
- existing_data = existing_stream - - # Merge existing data with new data - for trace in existing_data: - new_trace = url_data.select( - network=trace.stats.network, - station=trace.stats.station, - channel=trace.stats.channel, - ) - if new_trace: - trace.data = np.concatenate( - (trace.data, new_trace[0].data) - ) - url_data = existing_data + url_data - except Exception as e: - # Log the exception if needed - print( - f"Warning: Could not read existing CDF file '{url_file}': {e}", - file=sys.stderr, - ) - # Proceed with new data - + os.remove(url_file) + print(f"Naming conflict. Deleting exisiting file '{url_file}'.") + # raise TimeseriesFactoryException( + # f"Error: File '{url_file}' already exists." + # ) # Pad the data to ensure it fits the interval url_data.trim( starttime=interval_start, endtime=interval_end, - nearest_sample=False, + nearest_sample=True, pad=True, fill_value=99_999, # FILLVAL ) @@ -370,7 +349,6 @@ class ImagCDFFactory(TimeseriesFactory): raise TimeseriesFactoryException( "Only file urls are supported for reading ImagCDF" ) - url_file = Util.get_file_from_url(url, createParentDirectory=False) if not os.path.isfile(url_file): # If file doesn't exist, skip @@ -379,23 +357,24 @@ class ImagCDFFactory(TimeseriesFactory): try: # Read CDF data and merge cdf = CDFReader(url_file) - file_stream = self._read_cdf(cdf) - # Attempt to select only requested channels - selected = Stream() - for ch in channels: - selected += file_stream.select(channel=ch) - timeseries += selected + # file_stream = self._read_cdf(cdf, channels) + timeseries = self._read_cdf(cdf, channels) + # Attempt to select only requested channelws (redundant as read_cdf can more efficiently filter) + # selected = Stream() + # for ch in channels: + # selected += file_stream.select(channel=ch) + # timeseries += selected except Exception as e: print(f"Error reading CDF file '{url_file}': {e}", file=sys.stderr) # After reading all intervals, merge and trim - 
timeseries.merge(fill_value=np.nan) + timeseries.merge(fill_value=99_999) timeseries.trim( starttime=starttime, endtime=endtime, - nearest_sample=False, + nearest_sample=True, pad=True, - fill_value=np.nan, + fill_value=99_999, ) # If requested, add empty channels not present in data @@ -416,46 +395,47 @@ class ImagCDFFactory(TimeseriesFactory): timeseries.sort() return timeseries - def parse_string(self, data: str, **kwargs): - """ - Parse ImagCDF binary data into an ObsPy Stream. - - This method writes the provided binary data to a temporary file, - reads the file using `cdflib`, and converts the data into an ObsPy - Stream. - - Parameters - ---------- - data : bytes - Binary data containing ImagCDF content. - - Returns - ------- - Stream - An ObsPy Stream object with the parsed geomagnetic time series data. - - Raises - ------ - TimeseriesFactoryException - If an error occurs while parsing the ImagCDF data. - """ - # Create a temporary file to store the CDF data - with tempfile.NamedTemporaryFile(delete=False, suffix=".cdf") as tmp_file: - tmp_file_name = tmp_file.name - tmp_file.write(data) - - try: - # Read the CDF from the temporary file - cdf = CDFReader(tmp_file_name) - stream = self._read_cdf(cdf) - # no cdf.close() method required - except Exception as e: - raise TimeseriesFactoryException(f"Error parsing ImagCDF data: {e}") - finally: - # Clean up the temporary file - os.remove(tmp_file_name) - - return stream + # Removed - cdflib takes a file path as an input more efficiently than taking in byte data. + # def parse_string(self, data: str, **kwargs): + # """ + # Parse ImagCDF binary data into an ObsPy Stream. + + # This method writes the provided binary data to a temporary file, + # reads the file using `cdflib`, and converts the data into an ObsPy + # Stream. + + # Parameters + # ---------- + # data : bytes + # Binary data containing ImagCDF content. 
+ + # Returns + # ------- + # Stream + # An ObsPy Stream object with the parsed geomagnetic time series data. + + # Raises + # ------ + # TimeseriesFactoryException + # If an error occurs while parsing the ImagCDF data. + # """ + # # Create a temporary file to store the CDF data + # with tempfile.NamedTemporaryFile(delete=False, suffix=".cdf") as tmp_file: + # tmp_file_name = tmp_file.name + # tmp_file.write(data) + # channels = kwargs.get('channels', []) + # try: + # # Read the CDF from the temporary file + # cdf = CDFReader(tmp_file_name) + # stream = self._read_cdf(cdf, channels) + # # no cdf.close() method required + # except Exception as e: + # raise TimeseriesFactoryException(f"Error parsing ImagCDF data: {e}") + # finally: + # # Clean up the temporary file + # os.remove(tmp_file_name) + + # return stream def _create_global_attributes( self, timeseries: Stream, channels: List[str] @@ -674,6 +654,7 @@ class ImagCDFFactory(TimeseriesFactory): units = "Celsius" validmin = -273.15 # absolute zero validmax = 79_999 + depend_0 = "DataTimes" #can be used for nonstandard element # elif channel in [REAL_TEMPERATURES]: # units = "Celsius" # fieldnam = f"Temperature {temperature_index} {trace.stats.location}" @@ -753,7 +734,7 @@ class ImagCDFFactory(TimeseriesFactory): # Default to double precision float return CDFWriter.CDF_DOUBLE - def _read_cdf(self, cdf: CDFReader) -> Stream: + def _read_cdf(self, cdf: CDFReader, channels: Optional[List[str]]) -> Stream: """ Read CDF data into an ObsPy Stream. 
@@ -771,6 +752,37 @@ class ImagCDFFactory(TimeseriesFactory): # Extract global attributes global_attrs = cdf.globalattsget() + # Define required global attributes based on ImagCDF standards + required_global_attrs = [ + "FormatDescription", + "FormatVersion", + "Title", + "IagaCode", + "ElementsRecorded", + "PublicationLevel", + "PublicationDate", + "ObservatoryName", + "Latitude", + "Longitude", + "Elevation", + "Institution", + "StandardLevel", + "Source", + ] + + # Check for missing required global attributes per ImagCDF techincal document + missing_global_attrs = [] + for attr in required_global_attrs: + if attr not in global_attrs: + missing_global_attrs.append(attr) + + if missing_global_attrs: + error_message = ( + f"The following required global attributes are missing from the CDF: " + f"{', '.join(missing_global_attrs)}" + ) + raise TimeseriesFactoryException(error_message) + # Map global attributes to Stream-level metadata observatory = global_attrs.get("IagaCode", [""])[0] station_name = global_attrs.get("ObservatoryName", [""])[0] @@ -791,7 +803,7 @@ class ImagCDFFactory(TimeseriesFactory): # Identify all time variables time_vars = {} for var in cdf.cdf_info().zVariables: - if var.lower().endswith("times"): + if var.endswith("Times"): time_data = cdf.varget(var) unix_times = cdflib.cdfepoch.unixtime(time_data) utc_times = [UTCDateTime(t) for t in unix_times] @@ -799,18 +811,34 @@ class ImagCDFFactory(TimeseriesFactory): # Read data variables and associate them with time variables for var in cdf.cdf_info().zVariables: + # Skip time variables - if var.lower().endswith("times"): + if var.endswith("Times"): continue + # Map the variable name back to a standard channel code + # Geomagnetic fields are named like GeomagneticFieldH, GeomagneticFieldD, etc. + # Temperatures are named like Temperature1, Temperature2, ... 
+ # Extract channel name by removing known prefixes + if var.startswith("GeomagneticField"): + channel = var.replace("GeomagneticField", "") + elif var.startswith("Temperature"): + # Temperature variables may not map directly to a geomagnetic channel + # but to temperature sensors. We can just use the label from LABLAXIS if needed + channel = attrs.get("LABLAXIS", var) + else: + # fallback if naming doesn't match expected patterns + channel = var + + if channels and channel not in channels: continue data = cdf.varget(var) attrs = cdf.varattsget(var) # Determine DEPEND_0 (the time variable name) and validate ts_name = attrs.get("DEPEND_0") - if not ts_name: - # If no DEPEND_0, skip this variable as we cannot map times - continue + # if not ts_name: + # # If no DEPEND_0, skip this variable as we cannot map times + # continue # The ImagCDF can have DataTimes, GeomagneticVectorTimes, GeomagneticScalarTimes, TemperatureNTimes (for N > 0), etc. matched_time_key = None @@ -819,11 +847,12 @@ class ImagCDFFactory(TimeseriesFactory): matched_time_key = tkey break - if matched_time_key not in time_vars: - # If we cannot find the matching time variable, skip this variable - continue - - times = time_vars[matched_time_key] + # if matched_time_key not in time_vars: + # # If we cannot find the matching time variable, skip this variable + # continue + times = [] + if matched_time_key in time_vars: + times = time_vars[matched_time_key] # Determine delta (sample interval) if len(times) > 1: @@ -842,22 +871,33 @@ class ImagCDFFactory(TimeseriesFactory): # fallback, set delta to 60 delta = 60.0 - # Map the variable name back to a standard channel code - # Geomagnetic fields are named like GeomagneticFieldH, GeomagneticFieldD, etc. - # Temperatures are named like Temperature1, Temperature2, ... 
- # Extract channel name by removing known prefixes - if var.startswith("GeomagneticField"): - channel = var.replace("GeomagneticField", "") - elif var.startswith("Temperature"): - # Temperature variables may not map directly to a geomagnetic channel - # but to temperature sensors. We can just use the label from LABLAXIS if needed - channel = attrs.get("LABLAXIS", var) - else: - # fallback if naming doesn't match expected patterns - channel = var - time_attrs = cdf.varattsget(var) data_interval = time_attrs.get("DATA_INTERVAL_TYPE", [""]) + + # Required variable attributes based on ImagCDF standards + required_variable_attrs = [ + "FIELDNAM", + "UNITS", + "FILLVAL", + "VALIDMIN", + "VALIDMAX", + "DISPLAY_TYPE", + "LABLAXIS", + "DEPEND_0" + ] + # Validate presence of required variable attributes + missing_var_attrs = [] + for var_attr in required_variable_attrs: + if var_attr not in attrs: + missing_var_attrs.append(var_attr) + + if missing_var_attrs or missing_global_attrs: + error_message = ( + f"The variable '{var}' is missing required attributes: " + f"{', '.join(missing_var_attrs)}" + ) + raise TimeseriesFactoryException(error_message) + # Create a trace trace = Trace( data=data, @@ -974,7 +1014,7 @@ class ImagCDFFactory(TimeseriesFactory): base_path = self.urlTemplate[7:] if not base_path or base_path == "{obs}_{dt}_{t}.cdf": base_path = os.getcwd() # Default to current working directory - return os.path.join(base_path, filename) + return os.path.join(base_path, "etc","imagcdf", filename) return os.path.join(self.urlTemplate, filename) # Unsupported URL scheme -- GitLab From b221ec1db777d993dead9def3dfca3f93b9d596a Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Fri, 13 Dec 2024 09:28:18 -0800 Subject: [PATCH 14/30] network encoding for maximum portability. 
code cleanup --- geomagio/imagcdf/ImagCDFFactory.py | 86 ++++++------------------------ 1 file changed, 17 insertions(+), 69 deletions(-) diff --git a/geomagio/imagcdf/ImagCDFFactory.py b/geomagio/imagcdf/ImagCDFFactory.py index 0c63bb70..89c1f6a6 100644 --- a/geomagio/imagcdf/ImagCDFFactory.py +++ b/geomagio/imagcdf/ImagCDFFactory.py @@ -132,9 +132,9 @@ class ImagCDFFactory(TimeseriesFactory): try: # Initialize the CDF writer cdf_spec = { - "Compressed": 9, # Enable compression (0-9) - "Majority": CDFWriter.ROW_MAJOR, # Data layout - gets set automatically - "Encoding": CDFWriter.HOST_ENCODING, # gets set automatically + "Compressed": 9, # Enable compression (1-9) + "Majority": CDFWriter.ROW_MAJOR, + "Encoding": CDFWriter.NETWORK_ENCODING, # XDR Encoding - If a CDF must be portable between two or more different types of computers use network encoded. "Checksum": True, # Disable checksum for faster writes (optional) "rDim_sizes": [], # Applicable only if using rVariables - CDF protocol recommends only using zVariables. } @@ -259,9 +259,6 @@ class ImagCDFFactory(TimeseriesFactory): for urlInterval in urlIntervals: interval_start = urlInterval["start"] interval_end = urlInterval["end"] - # Removes last data point ex: if endtime = 02:00:00, this could return 01:59:00 as last data point. 
- # if interval_start != interval_end: - # interval_end = interval_end - delta url = self._get_url( observatory=observatory, date=interval_start, @@ -357,13 +354,7 @@ class ImagCDFFactory(TimeseriesFactory): try: # Read CDF data and merge cdf = CDFReader(url_file) - # file_stream = self._read_cdf(cdf, channels) timeseries = self._read_cdf(cdf, channels) - # Attempt to select only requested channelws (redundant as read_cdf can more efficiently filter) - # selected = Stream() - # for ch in channels: - # selected += file_stream.select(channel=ch) - # timeseries += selected except Exception as e: print(f"Error reading CDF file '{url_file}': {e}", file=sys.stderr) @@ -395,48 +386,6 @@ class ImagCDFFactory(TimeseriesFactory): timeseries.sort() return timeseries - # Removed - cdflib takes a file path as an input more efficiently than taking in byte data. - # def parse_string(self, data: str, **kwargs): - # """ - # Parse ImagCDF binary data into an ObsPy Stream. - - # This method writes the provided binary data to a temporary file, - # reads the file using `cdflib`, and converts the data into an ObsPy - # Stream. - - # Parameters - # ---------- - # data : bytes - # Binary data containing ImagCDF content. - - # Returns - # ------- - # Stream - # An ObsPy Stream object with the parsed geomagnetic time series data. - - # Raises - # ------ - # TimeseriesFactoryException - # If an error occurs while parsing the ImagCDF data. 
- # """ - # # Create a temporary file to store the CDF data - # with tempfile.NamedTemporaryFile(delete=False, suffix=".cdf") as tmp_file: - # tmp_file_name = tmp_file.name - # tmp_file.write(data) - # channels = kwargs.get('channels', []) - # try: - # # Read the CDF from the temporary file - # cdf = CDFReader(tmp_file_name) - # stream = self._read_cdf(cdf, channels) - # # no cdf.close() method required - # except Exception as e: - # raise TimeseriesFactoryException(f"Error parsing ImagCDF data: {e}") - # finally: - # # Clean up the temporary file - # os.remove(tmp_file_name) - - # return stream - def _create_global_attributes( self, timeseries: Stream, channels: List[str] ) -> dict: @@ -654,7 +603,7 @@ class ImagCDFFactory(TimeseriesFactory): units = "Celsius" validmin = -273.15 # absolute zero validmax = 79_999 - depend_0 = "DataTimes" #can be used for nonstandard element + depend_0 = "DataTimes" # can be used for nonstandard element # elif channel in [REAL_TEMPERATURES]: # units = "Celsius" # fieldnam = f"Temperature {temperature_index} {trace.stats.location}" @@ -782,7 +731,7 @@ class ImagCDFFactory(TimeseriesFactory): f"{', '.join(missing_global_attrs)}" ) raise TimeseriesFactoryException(error_message) - + # Map global attributes to Stream-level metadata observatory = global_attrs.get("IagaCode", [""])[0] station_name = global_attrs.get("ObservatoryName", [""])[0] @@ -811,26 +760,25 @@ class ImagCDFFactory(TimeseriesFactory): # Read data variables and associate them with time variables for var in cdf.cdf_info().zVariables: - + # Skip time variables if var.endswith("Times"): continue - # Map the variable name back to a standard channel code - # Geomagnetic fields are named like GeomagneticFieldH, GeomagneticFieldD, etc. - # Temperatures are named like Temperature1, Temperature2, ... 
- # Extract channel name by removing known prefixes + # Map the variable name back to a standard channel code by removing known prefixes + # Names are like GeomagneticFieldH, GeomagneticFieldD, Temperature1, Temperature2, ... if var.startswith("GeomagneticField"): channel = var.replace("GeomagneticField", "") - elif var.startswith("Temperature"): - # Temperature variables may not map directly to a geomagnetic channel - # but to temperature sensors. We can just use the label from LABLAXIS if needed - channel = attrs.get("LABLAXIS", var) + # elif var.startswith("Temperature"): + # # Temperature variables may not map directly to a geomagnetic channel + # # but to temperature sensors. We can just use the label from LABLAXIS if needed + # channel = attrs.get("LABLAXIS", var) else: # fallback if naming doesn't match expected patterns channel = var - if channels and channel not in channels: continue + if channels and channel not in channels: + continue data = cdf.varget(var) attrs = cdf.varattsget(var) @@ -852,7 +800,7 @@ class ImagCDFFactory(TimeseriesFactory): # continue times = [] if matched_time_key in time_vars: - times = time_vars[matched_time_key] + times = time_vars[matched_time_key] # Determine delta (sample interval) if len(times) > 1: @@ -883,7 +831,7 @@ class ImagCDFFactory(TimeseriesFactory): "VALIDMAX", "DISPLAY_TYPE", "LABLAXIS", - "DEPEND_0" + "DEPEND_0", ] # Validate presence of required variable attributes missing_var_attrs = [] @@ -1014,7 +962,7 @@ class ImagCDFFactory(TimeseriesFactory): base_path = self.urlTemplate[7:] if not base_path or base_path == "{obs}_{dt}_{t}.cdf": base_path = os.getcwd() # Default to current working directory - return os.path.join(base_path, "etc","imagcdf", filename) + return os.path.join(base_path, "etc", "imagcdf", filename) return os.path.join(self.urlTemplate, filename) # Unsupported URL scheme -- GitLab From ba081cc4dffa98e52d6d076e56071ecf1e853844 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers 
<nshavers@contractor.usgs.gov> Date: Fri, 13 Dec 2024 12:43:14 -0800 Subject: [PATCH 15/30] accept and use file path if provided via --input-file argument, else try urlTemplates path. Network as a custom global attribute --- geomagio/imagcdf/ImagCDFFactory.py | 58 ++++++++++++++++++------------ 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/geomagio/imagcdf/ImagCDFFactory.py b/geomagio/imagcdf/ImagCDFFactory.py index 89c1f6a6..c2855045 100644 --- a/geomagio/imagcdf/ImagCDFFactory.py +++ b/geomagio/imagcdf/ImagCDFFactory.py @@ -103,6 +103,7 @@ class ImagCDFFactory(TimeseriesFactory): interval: DataInterval = "minute", urlTemplate="file://etc/imagcdf/{obs}_{dt}_{t}.cdf", urlInterval: int = -1, + inputFile: Optional[str] = None, ): """ Initialize the ImagCDFFactory with default parameters. @@ -114,6 +115,7 @@ class ImagCDFFactory(TimeseriesFactory): - interval: Data interval (e.g., 'minute', 'second'). - urlTemplate: Template for generating file URLs or paths. + - inputFile: An ImagCDF file to read data from. If not provided urlTemplate is assumed path for reads.
""" super().__init__( observatory=observatory, @@ -123,6 +125,7 @@ class ImagCDFFactory(TimeseriesFactory): urlTemplate=urlTemplate, urlInterval=urlInterval, ) + self.inputFile = inputFile def write_file(self, fh, timeseries: Stream, channels: List[str]): # Create a temporary file to write the CDF data @@ -333,24 +336,26 @@ class ImagCDFFactory(TimeseriesFactory): ) for urlInterval in urlIntervals: - url = self._get_url( - observatory=observatory, - date=urlInterval["start"], - type=type, - interval=interval, - channels=channels, - ) - if url == "stdout": - continue # stdout is not a valid input source - if not url.startswith("file://"): - raise TimeseriesFactoryException( - "Only file urls are supported for reading ImagCDF" + if self.inputFile is None: + url = self._get_url( + observatory=observatory, + date=urlInterval["start"], + type=type, + interval=interval, + channels=channels, ) - url_file = Util.get_file_from_url(url, createParentDirectory=False) - if not os.path.isfile(url_file): - # If file doesn't exist, skip - continue - + if url == "stdout": + continue # stdout is not a valid input source + if not url.startswith("file://"): + raise TimeseriesFactoryException( + "Only file urls are supported for reading ImagCDF" + ) + url_file = Util.get_file_from_url(url, createParentDirectory=False) + if not os.path.isfile(url_file): + # If file doesn't exist, skip + continue + else: + url_file = self.inputFile try: # Read CDF data and merge cdf = CDFReader(url_file) @@ -406,6 +411,7 @@ class ImagCDFFactory(TimeseriesFactory): getattr(stats, "station_name", None) or self.observatory or "" ) station = getattr(stats, "station", None) or "" + network = getattr(stats, "network", None) or "" institution = getattr(stats, "agency_name", None) or "" latitude = getattr(stats, "geodetic_latitude", None) or 0.0 longitude = getattr(stats, "geodetic_longitude", None) or 0.0 @@ -450,10 +456,12 @@ class ImagCDFFactory(TimeseriesFactory): # 'UniqueIdentifier': {0: ''}, # 
'ParentIdentifiers': {0: ''}, # 'ReferenceLinks': {0: ''}, #links to /ws, plots, USGS.gov - "SensorSamplingRate": {0: sensor_sampling_rate}, # Optional - "DataType": {0: data_type}, # Optional - "Comments": {0: comments}, # Optional - "DeclinationBase": {0: declination_base}, # Optional + # Custom Attributes Below + "SensorSamplingRate": {0: sensor_sampling_rate}, + "DataType": {0: data_type}, + "Comments": {0: comments}, + "DeclinationBase": {0: declination_base}, + "Network": {0: network}, } return global_attrs @@ -639,7 +647,8 @@ class ImagCDFFactory(TimeseriesFactory): "DEPEND_0": depend_0, "DISPLAY_TYPE": "time_series", "LABLAXIS": channel, - "DATA_INTERVAL_TYPE": trace.stats.data_interval_type, # optional + # custom ImagCDF variable attributes below + "DATA_INTERVAL_TYPE": trace.stats.data_interval_type, } return var_attrs @@ -745,9 +754,10 @@ class ImagCDFFactory(TimeseriesFactory): sensor_orientation = global_attrs.get("VectorSensOrient", [""])[0] data_type = global_attrs.get("DataType", ["variation"])[0] publication_level = global_attrs.get("PublicationLevel", ["1"])[0] - comments = global_attrs.get("Comments", [""]) + comments = global_attrs.get("Comments", [""]) #keep comments as an array terms_of_use = global_attrs.get("TermsOfUse", [""])[0] declination_base = global_attrs.get("DeclinationBase", [0.0])[0] + network = global_attrs.get("Network", [""])[0] # Identify all time variables time_vars = {} @@ -862,10 +872,12 @@ class ImagCDFFactory(TimeseriesFactory): "station_name": station_name, "agency_name": institution, "conditions_of_use": terms_of_use, + # data points not in a traditional ImagCDF "sensor_sampling_rate": sensor_sampling_rate, "data_interval_type": data_interval, "declination_base": declination_base, "filter_comments": comments, + "network": network }, ) stream += trace -- GitLab From 44795c5a7ba7786ab910f8307eef24deb3f67440 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Mon, 16 Dec 2024 13:19:28 -0800 
Subject: [PATCH 16/30] Better mapping of publication level to imag or imf. Compression and Fill Value constants. Code simplified. --- geomagio/Controller.py | 1 + geomagio/imagcdf/IMCDFPublication.py | 108 +++++++++ geomagio/imagcdf/ImagCDFFactory.py | 313 ++++++++------------------- 3 files changed, 205 insertions(+), 217 deletions(-) create mode 100644 geomagio/imagcdf/IMCDFPublication.py diff --git a/geomagio/Controller.py b/geomagio/Controller.py index 4e212579..826683f1 100644 --- a/geomagio/Controller.py +++ b/geomagio/Controller.py @@ -501,6 +501,7 @@ def get_input_factory(args): input_factory_args["observatory"] = args.observatory input_factory_args["type"] = args.type input_type = args.input + input_factory_args["output"] = args.output # stream/url arguments if args.input_file is not None: if input_type in ["netcdf", "miniseed", "imagcdf"]: diff --git a/geomagio/imagcdf/IMCDFPublication.py b/geomagio/imagcdf/IMCDFPublication.py new file mode 100644 index 00000000..36f66be3 --- /dev/null +++ b/geomagio/imagcdf/IMCDFPublication.py @@ -0,0 +1,108 @@ +from typing import Optional, Union + + +class IMCDFPublicationLevel: + """Handles publication levels and mapping between data types and levels. + + The ImagCDF format uses publication levels to describe the processing + level of the data. This class maps data types (e.g., 'variation', 'definitive') + to their corresponding publication levels as defined in the ImagCDF documentation. + + Publication Levels: + 1: Raw data with no processing. + 2: Edited data with preliminary baselines. + 3: Data suitable for initial bulletins or quasi-definitive publication. + 4: Definitive data with no further changes expected. 
+ + Reference: + - ImagCDF Technical Documentation: Attributes that Uniquely Identify the Data + """ + + class PublicationLevel: + LEVEL_1 = "1" + LEVEL_2 = "2" + LEVEL_3 = "3" + LEVEL_4 = "4" + + TYPE_TO_LEVEL = { + "none": PublicationLevel.LEVEL_1, + "variation": PublicationLevel.LEVEL_1, + "reported": PublicationLevel.LEVEL_1, + "provisional": PublicationLevel.LEVEL_2, + "adjusted": PublicationLevel.LEVEL_2, + "quasi-definitive": PublicationLevel.LEVEL_3, + "quasidefinitive": PublicationLevel.LEVEL_3, + "definitive": PublicationLevel.LEVEL_4, + } + + def __init__(self, value: Union[str, int]): + """Initialize with a data type or publication level to determine the publication level. + + Args: + value (Union[str, int]): The data type (str) or publication level (int). + + Raises: + ValueError: If the value is not provided or is unsupported. + """ + if isinstance(value, int) or (isinstance(value, str) and value.isdigit()): + self.level = str(value) + if self.level not in self.PublicationLevel.__dict__.values(): + raise ValueError(f"Unsupported level: {value}") + elif isinstance(value, str): + self.level = self.TYPE_TO_LEVEL.get(value.lower()) + if not self.level: + raise ValueError(f"Unsupported data_type: {value}") + else: + raise ValueError( + "value must be a string or an integer representing data_type or level." + ) + + def get_level(self) -> str: + """Return the publication level as a string. + + Returns: + str: The publication level. + """ + return self.level + + def get_imf_data_type(self, long_form: Optional[bool] = True) -> str: + """Get the IMF data type based on the publication level. + + Args: + long_form (bool): If True, return the full description; otherwise, return the abbreviated form. + + Returns: + str: The IMF data type. 
+ + Reference: + https://tech-man.intermagnet.org/latest/appendices/dataformats.html#intermagnet-satellite-transmission-format-imfv2-83 + """ + if self.level == self.PublicationLevel.LEVEL_4: + return "Definitive" if long_form else "D" + elif self.level == self.PublicationLevel.LEVEL_3: + return "Quasi-definitive" if long_form else "Q" + elif self.level == self.PublicationLevel.LEVEL_2: + return "Adjusted" if long_form else "A" + else: + return "Reported" if long_form else "R" + + def get_iaga2002_data_type(self, long_form: Optional[bool] = True) -> str: + """Get the IAGA-2002 data type based on the publication level. + + Args: + long_form (bool): If True, return the full description; otherwise, return the abbreviated form. + + Returns: + str: The IAGA-2002 data type. + + Reference: + https://tech-man.intermagnet.org/latest/appendices/dataformats.html#iaga2002-intermagnet-exchange-format-spreadsheet-compatible + """ + if self.level == self.PublicationLevel.LEVEL_4: + return "Definitive" if long_form else "D" + elif self.level == self.PublicationLevel.LEVEL_3: + return "Quasi-definitive" if long_form else "Q" + elif self.level == self.PublicationLevel.LEVEL_2: + return "Provisional" if long_form else "P" + else: + return "Variation" if long_form else "V" diff --git a/geomagio/imagcdf/ImagCDFFactory.py b/geomagio/imagcdf/ImagCDFFactory.py index c2855045..de58ced1 100644 --- a/geomagio/imagcdf/ImagCDFFactory.py +++ b/geomagio/imagcdf/ImagCDFFactory.py @@ -12,77 +12,23 @@ References: - CDFLib Docs: [https://cdflib.readthedocs.io/en/latest/#, https://cdflib.readthedocs.io/en/stable/modules/index.html] """ -from __future__ import absolute_import, print_function -from io import BytesIO import os import sys -from typing import List, Optional, Union -from datetime import datetime, timezone, tzinfo +from typing import List, Optional +from datetime import datetime, timezone import numpy as np from obspy import Stream, Trace, UTCDateTime -from sqlalchemy import true from 
geomagio.TimeseriesFactory import TimeseriesFactory from geomagio.api.ws.Element import TEMPERATURE_ELEMENTS_ID - from ..geomag_types import DataInterval, DataType from ..TimeseriesFactoryException import TimeseriesFactoryException -from .. import TimeseriesUtility from .. import Util - -import cdflib +from cdflib import cdfepoch from cdflib.cdfwrite import CDF as CDFWriter from cdflib.cdfread import CDF as CDFReader import tempfile - - -class IMCDFPublicationLevel: - """Handles publication levels and mapping between data types and levels. - - The ImagCDF format uses publication levels to describe the processing - level of the data. This class maps data types (e.g., 'variation', 'definitive') - to their corresponding publication levels as defined in the ImagCDF documentation. - - Publication Levels: - 1: Raw data with no processing. - 2: Edited data with preliminary baselines. - 3: Data suitable for initial bulletins or quasi-definitive publication. - 4: Definitive data with no further changes expected. 
- - Reference: - - ImagCDF Technical Documentation: Attributes that Uniquely Identify the Data - """ - - class PublicationLevel: - LEVEL_1 = "1" - LEVEL_2 = "2" - LEVEL_3 = "3" - LEVEL_4 = "4" - - TYPE_TO_LEVEL = { - "none": PublicationLevel.LEVEL_1, - "variation": PublicationLevel.LEVEL_1, - "reported": PublicationLevel.LEVEL_1, - "provisional": PublicationLevel.LEVEL_2, - "adjusted": PublicationLevel.LEVEL_2, - "quasi-definitive": PublicationLevel.LEVEL_3, - "quasidefinitive": PublicationLevel.LEVEL_3, - "definitive": PublicationLevel.LEVEL_4, - } - - def __init__(self, data_type: Optional[str] = None): - """Initialize with a data type to determine the publication level.""" - if data_type: - self.level = self.TYPE_TO_LEVEL.get(data_type.lower()) - else: - raise ValueError("data_type must be provided.") - - if not self.level: - raise ValueError(f"Unsupported data_type: {data_type}") - - def to_string(self) -> str: - """Return the publication level as a string.""" - return self.level +from .IMCDFPublication import IMCDFPublicationLevel class ImagCDFFactory(TimeseriesFactory): @@ -93,7 +39,11 @@ class ImagCDFFactory(TimeseriesFactory): """ isUniqueTimes = True # used to determine depend_0 and CDF Time Variable Name - NONSTANDARD_ELEMENTS = TEMPERATURE_ELEMENTS_ID + NONSTANDARD_ELEMENTS = ( + TEMPERATURE_ELEMENTS_ID # for using non standard elements such as UK1 + ) + __FILL_VALUE = 99_999 + __MAX_COMPRESSION = 9 def __init__( self, @@ -103,7 +53,8 @@ class ImagCDFFactory(TimeseriesFactory): interval: DataInterval = "minute", urlTemplate="file://etc/imagcdf/{obs}_{dt}_{t}.cdf", urlInterval: int = -1, - inputFile: Optional[str] = None, + inputFile: Optional[str] = None, + output: Optional[str] = "iaga2002", ): """ Initialize the ImagCDFFactory with default parameters. @@ -115,7 +66,7 @@ class ImagCDFFactory(TimeseriesFactory): - interval: Data interval (e.g., 'minute', 'second'). - urlTemplate: Template for generating file URLs or paths. 
- urlInterval: Interval size for splitting data into multiple files. - - inputFile: An ImagCDF file to read data from. If not provided urlTemplate is assumed path for reads. + - inputFile: An ImagCDF file to read data from. If not provided urlTemplate is assumed path for reads. """ super().__init__( observatory=observatory, @@ -126,6 +77,7 @@ class ImagCDFFactory(TimeseriesFactory): urlInterval=urlInterval, ) self.inputFile = inputFile + self.output = output def write_file(self, fh, timeseries: Stream, channels: List[str]): # Create a temporary file to write the CDF data @@ -135,10 +87,10 @@ class ImagCDFFactory(TimeseriesFactory): try: # Initialize the CDF writer cdf_spec = { - "Compressed": 9, # Enable compression (1-9) - "Majority": CDFWriter.ROW_MAJOR, + "Compressed": self.__MAX_COMPRESSION, # Max Gzip compression (1-9). Almost always the GZIP is the best choice for all data. (CDF User Guide p.23 1.4.3 ) + "Majority": CDFWriter.ROW_MAJOR, # The first dimension changes the slowest (CDF User Guide p.45 2.3.15 Majority) "Encoding": CDFWriter.NETWORK_ENCODING, # XDR Encoding - If a CDF must be portable between two or more different types of computers use network encoded. - "Checksum": True, # Disable checksum for faster writes (optional) + "Checksum": True, # True for Data Integrity. False for faster writes (optional) "rDim_sizes": [], # Applicable only if using rVariables - CDF protocol recommends only using zVariables. } @@ -162,11 +114,13 @@ class ImagCDFFactory(TimeseriesFactory): "Var_Type": "zVariable", "Dim_Sizes": [], "Sparse": "no_sparse", # no_sparse because there should not be time gaps. (Time stamps must represent a regular time series with no missing values in the series) - "Compress": 9, + "Compress": self.__MAX_COMPRESSION, "Pad": None, } # Define time variable attributes - var_attrs = self._create_time_var_attrs(ts_name) + var_attrs = ( + {} + ) # ImagCDF does not require or recommend any 'time' variable attributes. 
# Write time variable cdf_writer.write_var(var_spec, var_attrs, ts_data) @@ -179,23 +133,20 @@ class ImagCDFFactory(TimeseriesFactory): # if channel in REAL_TEMPERATURE: # temperature_index += 1 # MUST INCREMENT INDEX BEFORE USING # var_name = f"Temperature{temperature_index}" - data_type = self._get_cdf_data_type(trace) - num_elements = 1 - if data_type in [ - CDFWriter.CDF_CHAR, - CDFWriter.CDF_UCHAR, - ]: # Handle string types - num_elements = len(trace.data[0]) if len(trace.data) > 0 else 1 - + data_type = ( + CDFWriter.CDF_INT4 + if trace.data.dtype in [np.int32, np.int64] + else CDFWriter.CDF_DOUBLE + ) var_spec = { "Variable": var_name, "Data_Type": data_type, - "Num_Elements": num_elements, + "Num_Elements": 1, "Rec_Vary": True, "Var_Type": "zVariable", "Dim_Sizes": [], "Sparse": "no_sparse", - "Compress": 9, + "Compress": self.__MAX_COMPRESSION, "Pad": None, } var_attrs = self._create_var_attrs( @@ -250,7 +201,6 @@ class ImagCDFFactory(TimeseriesFactory): # Extract metadata from the first trace stats = timeseries[0].stats - delta = stats.delta # Sample rate observatory = stats.station starttime = starttime or stats.starttime endtime = endtime or stats.endtime @@ -291,18 +241,16 @@ class ImagCDFFactory(TimeseriesFactory): ) # Check if the file already exists to merge data if os.path.isfile(url_file): - os.remove(url_file) - print(f"Naming conflict. Deleting exisiting file '{url_file}'.") - # raise TimeseriesFactoryException( - # f"Error: File '{url_file}' already exists." - # ) + raise TimeseriesFactoryException( + f"Error: File '{url_file}' already exists." 
+ ) # Pad the data to ensure it fits the interval url_data.trim( starttime=interval_start, endtime=interval_end, nearest_sample=True, pad=True, - fill_value=99_999, # FILLVAL + fill_value=self.__FILL_VALUE, # FILLVAL ) # Write the data to the CDF file @@ -336,7 +284,7 @@ class ImagCDFFactory(TimeseriesFactory): ) for urlInterval in urlIntervals: - if self.inputFile is None: + if self.inputFile is None: url = self._get_url( observatory=observatory, date=urlInterval["start"], @@ -354,7 +302,7 @@ class ImagCDFFactory(TimeseriesFactory): if not os.path.isfile(url_file): # If file doesn't exist, skip continue - else: + else: url_file = self.inputFile try: # Read CDF data and merge @@ -364,13 +312,13 @@ class ImagCDFFactory(TimeseriesFactory): print(f"Error reading CDF file '{url_file}': {e}", file=sys.stderr) # After reading all intervals, merge and trim - timeseries.merge(fill_value=99_999) + timeseries.merge(fill_value=self.__FILL_VALUE) timeseries.trim( starttime=starttime, endtime=endtime, nearest_sample=True, pad=True, - fill_value=99_999, + fill_value=self.__FILL_VALUE, ) # If requested, add empty channels not present in data @@ -415,7 +363,7 @@ class ImagCDFFactory(TimeseriesFactory): institution = getattr(stats, "agency_name", None) or "" latitude = getattr(stats, "geodetic_latitude", None) or 0.0 longitude = getattr(stats, "geodetic_longitude", None) or 0.0 - elevation = getattr(stats, "elevation", None) or 99_999.0 + elevation = getattr(stats, "elevation", None) or self.__FILL_VALUE conditions_of_use = getattr(stats, "conditions_of_use", None) or "" vector_orientation = getattr(stats, "sensor_orientation", None) or "" data_interval_type = getattr(stats, "data_interval_type", None) or self.interval @@ -423,7 +371,7 @@ class ImagCDFFactory(TimeseriesFactory): sensor_sampling_rate = getattr(stats, "sensor_sampling_rate", None) or 0.0 comments = getattr(stats, "filter_comments", None) or [""] declination_base = getattr(stats, "declination_base", None) or 0.0 - 
publication_level = IMCDFPublicationLevel(data_type=self.type).to_string() + publication_level = IMCDFPublicationLevel(data_type=self.type).get_level() global_attrs = { "FormatDescription": {0: "INTERMAGNET CDF Format"}, "FormatVersion": {0: "1.2"}, @@ -433,7 +381,7 @@ class ImagCDFFactory(TimeseriesFactory): "PublicationLevel": {0: publication_level}, "PublicationDate": { 0: [ - cdflib.cdfepoch.timestamp_to_tt2000( + cdfepoch.timestamp_to_tt2000( datetime.timestamp(datetime.now(timezone.utc)) ), "cdf_time_tt2000", @@ -444,24 +392,23 @@ class ImagCDFFactory(TimeseriesFactory): "Longitude": {0: np.array([longitude], dtype=np.float64)}, "Elevation": {0: np.array([elevation], dtype=np.float64)}, "Institution": {0: institution}, - "VectorSensOrient": { - 0: vector_orientation - }, # remove F - because its a calculation, not an element? - "StandardLevel": {0: "None"}, # Set to 'None' - # Temporarily Omit 'StandardName', 'StandardVersion', 'PartialStandDesc' + "VectorSensOrient": {0: vector_orientation}, + "StandardLevel": {0: "None"}, + # Can Omit 'StandardName', 'StandardVersion', 'PartialStandDesc' when StandardLevel=None "Source": { 0: "institute" }, # "institute" - if the named institution provided the data, “INTERMAGNET†- if the data file has been created by INTERMAGNET from another data source, “WDC†- if the World Data Centre has created the file from another data source "TermsOfUse": {0: conditions_of_use}, + # [[Optional Attributes]] # 'UniqueIdentifier': {0: ''}, # 'ParentIdentifiers': {0: ''}, # 'ReferenceLinks': {0: ''}, #links to /ws, plots, USGS.gov - # Custom Attributes Below - "SensorSamplingRate": {0: sensor_sampling_rate}, - "DataType": {0: data_type}, - "Comments": {0: comments}, - "DeclinationBase": {0: declination_base}, - "Network": {0: network}, + # [[Custom Attributes]] + "SensorSamplingRate": {0: sensor_sampling_rate}, + "DataType": {0: data_type}, + "Comments": {0: comments}, + "DeclinationBase": {0: declination_base}, + "Network": {0: network}, 
} return global_attrs @@ -481,7 +428,7 @@ class ImagCDFFactory(TimeseriesFactory): for i in range(trace.stats.npts) ] # Convert timestamps to TT2000 format required by CDF - tt2000_times = cdflib.cdfepoch.timestamp_to_tt2000( + tt2000_times = cdfepoch.timestamp_to_tt2000( [time.timestamp() for time in times] ) @@ -547,49 +494,6 @@ class ImagCDFFactory(TimeseriesFactory): else time_vars ) - def _create_var_spec( - self, - var_name: str, - data_type: str, - num_elements: int, - var_type: str, - dim_sizes: List[int], - sparse: bool, - compress: int, - pad: Optional[Union[str, np.ndarray]], - ) -> dict: - """ - Create a variable specification dictionary for cdflib. - - This is used to define the properties of a variable when writing it to - the CDF file. - - Parameters: - - var_name: Name of the variable. - - data_type: CDF data type. - - num_elements: Number of elements per record. - - var_type: Variable type ('zVariable' or 'rVariable'). - - dim_sizes: Dimensions of the variable (empty list for 0D). - - sparse: Whether the variable uses sparse records. - - compress: Compression level. - - pad: Pad value for sparse records. 
- - Reference: - - CDF User's Guide: Variable Specification - """ - var_spec = { - "Variable": var_name, - "Data_Type": data_type, - "Num_Elements": num_elements, - "Rec_Vary": True, - "Var_Type": var_type, - "Dim_Sizes": dim_sizes, - "Sparse": "no_sparse" if not sparse else "pad_sparse", - "Compress": compress, - "Pad": pad, - } - return var_spec - def _create_var_attrs( self, trace: Trace, @@ -641,57 +545,17 @@ class ImagCDFFactory(TimeseriesFactory): var_attrs = { "FIELDNAM": fieldnam, "UNITS": units, - "FILLVAL": 99999.0, + "FILLVAL": self.__FILL_VALUE, "VALIDMIN": validmin, "VALIDMAX": validmax, "DEPEND_0": depend_0, "DISPLAY_TYPE": "time_series", "LABLAXIS": channel, - # custom ImagCDF variable attributes below + # [[Custom Attributes]] "DATA_INTERVAL_TYPE": trace.stats.data_interval_type, } return var_attrs - def _create_time_var_attrs(self, ts_name: str) -> dict: - """ - Create a dictionary of time variable attributes. - - These attributes provide metadata for time variables. - Note: None of these attributes are required for the time stamp variables. - Reference: - - ImagCDF Technical Documentation: ImagCDF Data - """ - # var_attrs = { - # 'UNITS': 'TT2000', - # 'DISPLAY_TYPE': 'time_series', - # 'LABLAXIS': 'Time', - # } - # return var_attrs - return {} - - def _get_cdf_data_type(self, trace: Trace) -> int: - """ - Map ObsPy trace data type to CDF data type. - - Determines the appropriate CDF data type based on the NumPy data type - of the trace data. - - Returns: - - CDF_DOUBLE (45) for floating-point data. - - CDF_INT4 (41) for integer data. - - Reference: - - See CDF for more data types - """ - - if trace.data.dtype in [np.float32, np.float64]: - return CDFWriter.CDF_DOUBLE - elif trace.data.dtype in [np.int32, np.int64]: - return CDFWriter.CDF_INT4 - else: - # Default to double precision float - return CDFWriter.CDF_DOUBLE - def _read_cdf(self, cdf: CDFReader, channels: Optional[List[str]]) -> Stream: """ Read CDF data into an ObsPy Stream. 
@@ -747,24 +611,32 @@ class ImagCDFFactory(TimeseriesFactory): institution = global_attrs.get("Institution", [""])[0] latitude = global_attrs.get("Latitude", [0.0])[0] longitude = global_attrs.get("Longitude", [0.0])[0] - elevation = global_attrs.get("Elevation", [99_999.0])[ + elevation = global_attrs.get("Elevation", [self.__FILL_VALUE])[ 0 ] # default to 99_999 per technical documents. - sensor_sampling_rate = global_attrs.get("SensorSamplingRate", [0.0])[0] sensor_orientation = global_attrs.get("VectorSensOrient", [""])[0] - data_type = global_attrs.get("DataType", ["variation"])[0] - publication_level = global_attrs.get("PublicationLevel", ["1"])[0] - comments = global_attrs.get("Comments", [""]) #keep comments as an array + publication_level = global_attrs.get("PublicationLevel", ["0"])[0] + publication = IMCDFPublicationLevel(publication_level) terms_of_use = global_attrs.get("TermsOfUse", [""])[0] - declination_base = global_attrs.get("DeclinationBase", [0.0])[0] - network = global_attrs.get("Network", [""])[0] + + # Not expected in ImagCDF + data_type = ( + publication.get_imf_data_type() + if "imf" in self.output + else publication.get_iaga2002_data_type() + ) + + sensor_sampling_rate = global_attrs.get("SensorSamplingRate", [None])[0] + comments = global_attrs.get("Comments", [None]) # keep comments as an array + declination_base = global_attrs.get("DeclinationBase", [None])[0] + network = global_attrs.get("Network", [None])[0] # Identify all time variables time_vars = {} for var in cdf.cdf_info().zVariables: if var.endswith("Times"): time_data = cdf.varget(var) - unix_times = cdflib.cdfepoch.unixtime(time_data) + unix_times = cdfepoch.unixtime(time_data) utc_times = [UTCDateTime(t) for t in unix_times] time_vars[var] = utc_times @@ -830,7 +702,7 @@ class ImagCDFFactory(TimeseriesFactory): delta = 60.0 time_attrs = cdf.varattsget(var) - data_interval = time_attrs.get("DATA_INTERVAL_TYPE", [""]) + data_interval = time_attrs.get("DATA_INTERVAL_TYPE", None) 
# Required variable attributes based on ImagCDF standards required_variable_attrs = [ @@ -856,29 +728,36 @@ class ImagCDFFactory(TimeseriesFactory): ) raise TimeseriesFactoryException(error_message) + header = { + "station": observatory, + "channel": channel, + "starttime": times[0], + "delta": delta, + "geodetic_latitude": latitude, + "geodetic_longitude": longitude, + "elevation": elevation, + "sensor_orientation": "".join(sensor_orientation), + "data_type": data_type, + "station_name": station_name, + "agency_name": institution, + "conditions_of_use": terms_of_use, + } + # data points not expected in ImagCDF specs + if sensor_sampling_rate is not None: + header.update({"sensor_sampling_rate": sensor_sampling_rate}) + if data_interval is not None: + header.update({"data_interval_type": data_interval}) + if declination_base is not None: + header.update({"declination_base": declination_base}) + if comments is not None: + header.update({"filter_comments": comments}) + if network is not None: + header.update({"network": network}) + # Create a trace trace = Trace( data=data, - header={ - "station": observatory, - "channel": channel, - "starttime": times[0], - "delta": delta, - "geodetic_latitude": latitude, - "geodetic_longitude": longitude, - "elevation": elevation, - "sensor_orientation": "".join(sensor_orientation), - "data_type": data_type, - "station_name": station_name, - "agency_name": institution, - "conditions_of_use": terms_of_use, - # data points not in a traditional ImagCDF - "sensor_sampling_rate": sensor_sampling_rate, - "data_interval_type": data_interval, - "declination_base": declination_base, - "filter_comments": comments, - "network": network - }, + header=header, ) stream += trace @@ -915,7 +794,7 @@ class ImagCDFFactory(TimeseriesFactory): - ImagCDF Technical Documentation: ImagCDF File Names """ # Get the publication level for the type - publication_level = IMCDFPublicationLevel(data_type=type).to_string() + publication_level = 
IMCDFPublicationLevel(data_type=type).get_level() # Format of Date/Time Portion of Filename based on interval see reference: https://tech-man.intermagnet.org/latest/appendices/dataformats.html#example-data-file:~:text=Format%20of%20Date,%EF%83%81 if interval == "year": -- GitLab From 3af41f5ce83a46b629d69c1f1d0fb1b0d25254d9 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Mon, 16 Dec 2024 15:21:29 -0800 Subject: [PATCH 17/30] final tweaks --- geomagio/Controller.py | 1 + geomagio/imagcdf/ImagCDFFactory.py | 32 +++++++++++------------------- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/geomagio/Controller.py b/geomagio/Controller.py index 826683f1..199befdd 100644 --- a/geomagio/Controller.py +++ b/geomagio/Controller.py @@ -573,6 +573,7 @@ def get_input_factory(args): elif input_type == "covjson": input_factory = covjson.CovJSONFactory(**input_factory_args) elif input_type == "imagcdf": + input_factory_args["output"] = args.output input_factory = ImagCDFFactory(**input_factory_args) # wrap stream if input_stream is not None: diff --git a/geomagio/imagcdf/ImagCDFFactory.py b/geomagio/imagcdf/ImagCDFFactory.py index de58ced1..f25d0c27 100644 --- a/geomagio/imagcdf/ImagCDFFactory.py +++ b/geomagio/imagcdf/ImagCDFFactory.py @@ -77,7 +77,9 @@ class ImagCDFFactory(TimeseriesFactory): urlInterval=urlInterval, ) self.inputFile = inputFile - self.output = output + self.output = ( + output if output in ["iaga2002", "imfjson"] else "iaga2002" + ) # determines mapping of publication level to data type def write_file(self, fh, timeseries: Stream, channels: List[str]): # Create a temporary file to write the CDF data @@ -87,9 +89,9 @@ class ImagCDFFactory(TimeseriesFactory): try: # Initialize the CDF writer cdf_spec = { - "Compressed": self.__MAX_COMPRESSION, # Max Gzip compression (1-9). Almost always the GZIP is the best choice for all data. 
(CDF User Guide p.23 1.4.3 ) + "Compressed": self.__MAX_COMPRESSION, # Max Gzip compression (1-9). Almost always the GZIP is the best choice for all data. (CDF User Guide p.23 1.4.3) "Majority": CDFWriter.ROW_MAJOR, # The first dimension changes the slowest (CDF User Guide p.45 2.3.15 Majority) - "Encoding": CDFWriter.NETWORK_ENCODING, # XDR Encoding - If a CDF must be portable between two or more different types of computers use network encoded. + "Encoding": CDFWriter.NETWORK_ENCODING, # XDR Encoding -Portable between two or more different types of computers. "Checksum": True, # True for Data Integrity. False for faster writes (optional) "rDim_sizes": [], # Applicable only if using rVariables - CDF protocol recommends only using zVariables. } @@ -239,7 +241,7 @@ class ImagCDFFactory(TimeseriesFactory): starttime=interval_start, endtime=interval_end, ) - # Check if the file already exists to merge data + # Check if the file already exists if os.path.isfile(url_file): raise TimeseriesFactoryException( f"Error: File '{url_file}' already exists." 
@@ -250,7 +252,7 @@ class ImagCDFFactory(TimeseriesFactory): endtime=interval_end, nearest_sample=True, pad=True, - fill_value=self.__FILL_VALUE, # FILLVAL + fill_value=self.__FILL_VALUE, ) # Write the data to the CDF file @@ -366,12 +368,11 @@ class ImagCDFFactory(TimeseriesFactory): elevation = getattr(stats, "elevation", None) or self.__FILL_VALUE conditions_of_use = getattr(stats, "conditions_of_use", None) or "" vector_orientation = getattr(stats, "sensor_orientation", None) or "" - data_interval_type = getattr(stats, "data_interval_type", None) or self.interval data_type = getattr(stats, "data_type", None) or "variation" sensor_sampling_rate = getattr(stats, "sensor_sampling_rate", None) or 0.0 comments = getattr(stats, "filter_comments", None) or [""] declination_base = getattr(stats, "declination_base", None) or 0.0 - publication_level = IMCDFPublicationLevel(data_type=self.type).get_level() + publication_level = IMCDFPublicationLevel(value=self.type).get_level() global_attrs = { "FormatDescription": {0: "INTERMAGNET CDF Format"}, "FormatVersion": {0: "1.2"}, @@ -627,7 +628,7 @@ class ImagCDFFactory(TimeseriesFactory): ) sensor_sampling_rate = global_attrs.get("SensorSamplingRate", [None])[0] - comments = global_attrs.get("Comments", [None]) # keep comments as an array + comments = global_attrs.get("Comments", []) # keep comments as an array declination_base = global_attrs.get("DeclinationBase", [None])[0] network = global_attrs.get("Network", [None])[0] @@ -666,9 +667,6 @@ class ImagCDFFactory(TimeseriesFactory): # Determine DEPEND_0 (the time variable name) and validate ts_name = attrs.get("DEPEND_0") - # if not ts_name: - # # If no DEPEND_0, skip this variable as we cannot map times - # continue # The ImagCDF can have DataTimes, GeomagneticVectorTimes, GeomagneticScalarTimes, TemperatureNTimes (for N > 0), etc. 
matched_time_key = None @@ -677,9 +675,6 @@ class ImagCDFFactory(TimeseriesFactory): matched_time_key = tkey break - # if matched_time_key not in time_vars: - # # If we cannot find the matching time variable, skip this variable - # continue times = [] if matched_time_key in time_vars: times = time_vars[matched_time_key] @@ -749,7 +744,7 @@ class ImagCDFFactory(TimeseriesFactory): header.update({"data_interval_type": data_interval}) if declination_base is not None: header.update({"declination_base": declination_base}) - if comments is not None: + if len(comments) > 0: header.update({"filter_comments": comments}) if network is not None: header.update({"network": network}) @@ -775,8 +770,7 @@ class ImagCDFFactory(TimeseriesFactory): Generate the file URL specific to ImagCDF conventions. This method constructs the filename based on the ImagCDF naming - conventions, which include the observatory code, date-time formatted - according to the data interval, and the publication level. + conventions. 
[iaga-code]_[date-time]_[publication-level].cdf @@ -794,9 +788,8 @@ class ImagCDFFactory(TimeseriesFactory): - ImagCDF Technical Documentation: ImagCDF File Names """ # Get the publication level for the type - publication_level = IMCDFPublicationLevel(data_type=type).get_level() + publication_level = IMCDFPublicationLevel(value=type).get_level() - # Format of Date/Time Portion of Filename based on interval see reference: https://tech-man.intermagnet.org/latest/appendices/dataformats.html#example-data-file:~:text=Format%20of%20Date,%EF%83%81 if interval == "year": date_format = date.strftime("%Y") elif interval == "month": @@ -814,7 +807,6 @@ class ImagCDFFactory(TimeseriesFactory): f"Unsupported interval: {interval}" ) # tenhertz currently not supported - # Filename following ImagCDF convention, see reference: https://tech-man.intermagnet.org/latest/appendices/dataformats.html#imagcdf-file-names filename = f"{observatory.lower()}_{date_format}_{publication_level}.cdf" # If the urlTemplate explicitly specifies 'stdout', return 'stdout' -- GitLab From 2a6ccab6c4f94a5180db90a39798880537b77a34 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Thu, 19 Dec 2024 09:21:11 -0800 Subject: [PATCH 18/30] ImagCDF Factory moved into Stream compatible block --- geomagio/Controller.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/geomagio/Controller.py b/geomagio/Controller.py index 199befdd..a5777a2f 100644 --- a/geomagio/Controller.py +++ b/geomagio/Controller.py @@ -636,8 +636,6 @@ def get_output_factory(args): ) elif output_type == "plot": output_factory = PlotTimeseriesFactory() - elif output_type == "imagcdf": - output_factory = ImagCDFFactory(**output_factory_args) else: # stream compatible factories if output_type == "binlog": @@ -668,6 +666,8 @@ def get_output_factory(args): ) elif output_type == "xml": output_factory = xml.XMLFactory(**output_factory_args) + elif output_type == "imagcdf": + output_factory = 
ImagCDFFactory(**output_factory_args) # wrap stream if output_stream is not None: output_factory = StreamTimeseriesFactory( -- GitLab From b4123eb149105862aeb6867866d7497a8164b402 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Thu, 19 Dec 2024 09:45:43 -0800 Subject: [PATCH 19/30] remove /etc/imagcdf path from urlTemplate, urlInterval removed from constructor --- geomagio/imagcdf/ImagCDFFactory.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/geomagio/imagcdf/ImagCDFFactory.py b/geomagio/imagcdf/ImagCDFFactory.py index f25d0c27..fda30542 100644 --- a/geomagio/imagcdf/ImagCDFFactory.py +++ b/geomagio/imagcdf/ImagCDFFactory.py @@ -51,8 +51,7 @@ class ImagCDFFactory(TimeseriesFactory): channels: List[str] = ("H", "D", "Z", "F"), type: DataType = "variation", interval: DataInterval = "minute", - urlTemplate="file://etc/imagcdf/{obs}_{dt}_{t}.cdf", - urlInterval: int = -1, + urlTemplate="file://{obs}_{dt}_{t}.cdf", inputFile: Optional[str] = None, output: Optional[str] = "iaga2002", ): @@ -74,7 +73,6 @@ class ImagCDFFactory(TimeseriesFactory): type=type, interval=interval, urlTemplate=urlTemplate, - urlInterval=urlInterval, ) self.inputFile = inputFile self.output = ( @@ -419,9 +417,10 @@ class ImagCDFFactory(TimeseriesFactory): nonstandard_times = None temperature_times = {} temperature_index = 1 - + channels = [] for trace in timeseries: channel = trace.stats.channel + channels.append(channel) times = [ (trace.stats.starttime + trace.stats.delta * i).datetime.replace( tzinfo=timezone.utc @@ -479,8 +478,8 @@ class ImagCDFFactory(TimeseriesFactory): last_times = [] self.isUniqueTimes = ( - len(time_vars) == 1 - ) # (true if is a single time stamp variable in the file) + len(channels) == 1 + ) # (If a single channel, then use specific naming i.e GeomagneticVectorTimes) for index, times in enumerate(time_vars.values()): if index > 0: self.isUniqueTimes = not np.array_equal(last_times, times) @@ -845,7 
+844,7 @@ class ImagCDFFactory(TimeseriesFactory): base_path = self.urlTemplate[7:] if not base_path or base_path == "{obs}_{dt}_{t}.cdf": base_path = os.getcwd() # Default to current working directory - return os.path.join(base_path, "etc", "imagcdf", filename) + return os.path.join(base_path,filename) return os.path.join(self.urlTemplate, filename) # Unsupported URL scheme -- GitLab From 8a9da540d0a47cf0f9413bbca854feeed48e3a39 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Thu, 19 Dec 2024 11:20:00 -0800 Subject: [PATCH 20/30] necessary tweaks: urlInterval brought back, remove need for cdf extension on input-file --- geomagio/Controller.py | 6 +++--- geomagio/imagcdf/ImagCDFFactory.py | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/geomagio/Controller.py b/geomagio/Controller.py index a5777a2f..c79cdd16 100644 --- a/geomagio/Controller.py +++ b/geomagio/Controller.py @@ -7,8 +7,6 @@ from typing import List, Optional, Tuple, Union from obspy.core import Stream, UTCDateTime -from geomagio.imagcdf.ImagCDFFactory import ImagCDFFactory - from .algorithm import Algorithm, algorithms, AlgorithmException, FilterAlgorithm from .DerivedTimeseriesFactory import DerivedTimeseriesFactory from .PlotTimeseriesFactory import PlotTimeseriesFactory @@ -504,8 +502,10 @@ def get_input_factory(args): input_factory_args["output"] = args.output # stream/url arguments if args.input_file is not None: - if input_type in ["netcdf", "miniseed", "imagcdf"]: + if input_type in ["netcdf", "miniseed"]: input_stream = open(args.input_file, "rb") + elif input_type in ["imagcdf"]: + input_factory_args["inputFile"] = args.input_file #imagcdf file is binary but lib used accepts a file path else: input_stream = open(args.input_file, "r") elif args.input_stdin: diff --git a/geomagio/imagcdf/ImagCDFFactory.py b/geomagio/imagcdf/ImagCDFFactory.py index fda30542..643238bc 100644 --- a/geomagio/imagcdf/ImagCDFFactory.py +++ 
b/geomagio/imagcdf/ImagCDFFactory.py @@ -54,6 +54,7 @@ class ImagCDFFactory(TimeseriesFactory): urlTemplate="file://{obs}_{dt}_{t}.cdf", inputFile: Optional[str] = None, output: Optional[str] = "iaga2002", + urlInterval: int= -1 ): """ Initialize the ImagCDFFactory with default parameters. @@ -73,6 +74,7 @@ class ImagCDFFactory(TimeseriesFactory): type=type, interval=interval, urlTemplate=urlTemplate, + urlInterval=urlInterval ) self.inputFile = inputFile self.output = ( -- GitLab From e2f206bca1267db883d13057f704984496c85473 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Thu, 19 Dec 2024 11:35:42 -0800 Subject: [PATCH 21/30] import aggregation --- geomagio/Controller.py | 6 +++--- geomagio/imagcdf/ImagCDFFactory.py | 6 +++--- geomagio/imagcdf/__init__.py | 10 ++++++++++ 3 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 geomagio/imagcdf/__init__.py diff --git a/geomagio/Controller.py b/geomagio/Controller.py index c79cdd16..d9b73750 100644 --- a/geomagio/Controller.py +++ b/geomagio/Controller.py @@ -26,7 +26,7 @@ from . import vbf from . import xml from . import covjson from . import netcdf -from geomagio.imagcdf.ImagCDFFactory import ImagCDFFactory +from . 
import imagcdf class Controller(object): @@ -574,7 +574,7 @@ def get_input_factory(args): input_factory = covjson.CovJSONFactory(**input_factory_args) elif input_type == "imagcdf": input_factory_args["output"] = args.output - input_factory = ImagCDFFactory(**input_factory_args) + input_factory = imagcdf.ImagCDFFactory(**input_factory_args) # wrap stream if input_stream is not None: input_factory = StreamTimeseriesFactory( @@ -667,7 +667,7 @@ def get_output_factory(args): elif output_type == "xml": output_factory = xml.XMLFactory(**output_factory_args) elif output_type == "imagcdf": - output_factory = ImagCDFFactory(**output_factory_args) + output_factory = imagcdf.ImagCDFFactory(**output_factory_args) # wrap stream if output_stream is not None: output_factory = StreamTimeseriesFactory( diff --git a/geomagio/imagcdf/ImagCDFFactory.py b/geomagio/imagcdf/ImagCDFFactory.py index 643238bc..72a0dc19 100644 --- a/geomagio/imagcdf/ImagCDFFactory.py +++ b/geomagio/imagcdf/ImagCDFFactory.py @@ -54,7 +54,7 @@ class ImagCDFFactory(TimeseriesFactory): urlTemplate="file://{obs}_{dt}_{t}.cdf", inputFile: Optional[str] = None, output: Optional[str] = "iaga2002", - urlInterval: int= -1 + urlInterval: int = -1, ): """ Initialize the ImagCDFFactory with default parameters. 
@@ -74,7 +74,7 @@ class ImagCDFFactory(TimeseriesFactory): type=type, interval=interval, urlTemplate=urlTemplate, - urlInterval=urlInterval + urlInterval=urlInterval, ) self.inputFile = inputFile self.output = ( @@ -846,7 +846,7 @@ class ImagCDFFactory(TimeseriesFactory): base_path = self.urlTemplate[7:] if not base_path or base_path == "{obs}_{dt}_{t}.cdf": base_path = os.getcwd() # Default to current working directory - return os.path.join(base_path,filename) + return os.path.join(base_path, filename) return os.path.join(self.urlTemplate, filename) # Unsupported URL scheme diff --git a/geomagio/imagcdf/__init__.py b/geomagio/imagcdf/__init__.py new file mode 100644 index 00000000..e7611ab4 --- /dev/null +++ b/geomagio/imagcdf/__init__.py @@ -0,0 +1,10 @@ +"""IO Module for ImagCDF Format +""" + +from __future__ import absolute_import + +from .ImagCDFFactory import ImagCDFFactory +from .IMCDFPublication import IMCDFPublicationLevel + + +__all__ = ["ImagCDFFactory", "IMCDFPublicationLevel"] -- GitLab From bdc4f7850995bf300cff9c76ef844c5978828368 Mon Sep 17 00:00:00 2001 From: "E. Joshua Rigler" <erigler@usgs.gov> Date: Mon, 13 Jan 2025 13:59:11 -0700 Subject: [PATCH 22/30] Testing modify MR while not owner --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9de81eed..d06d0724 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,8 @@ Geomag Algorithms is an open source library for processing Geomagnetic timeseries data. It includes algorithms and input/output factories used by the [USGS Geomagnetism Program](http://geomag.usgs.gov) to -translate between data formats, -generate derived data and indices in near-realtime, -and research and develop new algorithms. +translate between data formats, generate derived data and indices in +near-realtime, and research and develop new algorithms. - Accesses USGS data services. 
- Built using established open source python libraries -- GitLab From 381c535d74088c29d6230dd2de49eb9e84dd8d5c Mon Sep 17 00:00:00 2001 From: "Erin (Josh) Rigler" <erigler@usgs.gov> Date: Mon, 13 Jan 2025 21:09:42 +0000 Subject: [PATCH 23/30] Testing modify MR while not owner - from Gitlab --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d06d0724..aef477d8 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,8 @@ [](https://code.usgs.gov/ghsc/geomag/geomag-algorithms/-/commits/master) Geomag Algorithms is an open source library for processing -Geomagnetic timeseries data. It includes algorithms and input/output factories -used by the [USGS Geomagnetism Program](http://geomag.usgs.gov) to +Geomagnetic timeseries data. It includes algorithms and input/output +factories used by the [USGS Geomagnetism Program](http://geomag.usgs.gov) to translate between data formats, generate derived data and indices in near-realtime, and research and develop new algorithms. 
-- GitLab From 39f0e813366232c2c7e3cff84d804a06b7c45ea0 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Wed, 15 Jan 2025 10:09:01 -0800 Subject: [PATCH 24/30] skip writing a timeseries if channel not in channels requested --- geomagio/imagcdf/ImagCDFFactory.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/geomagio/imagcdf/ImagCDFFactory.py b/geomagio/imagcdf/ImagCDFFactory.py index 72a0dc19..414aaf99 100644 --- a/geomagio/imagcdf/ImagCDFFactory.py +++ b/geomagio/imagcdf/ImagCDFFactory.py @@ -131,6 +131,8 @@ class ImagCDFFactory(TimeseriesFactory): temperature_index = 0 for trace in timeseries: channel = trace.stats.channel + if channels and channel not in channels: + continue var_name = f"GeomagneticField{channel}" # if channel in REAL_TEMPERATURE: # temperature_index += 1 # MUST INCREMENT INDEX BEFORE USING -- GitLab From 40f43349fb4de94b82b0989bb10558244f06f5ee Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Wed, 15 Jan 2025 13:08:31 -0800 Subject: [PATCH 25/30] poetry lock --- poetry.lock | 52 +++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/poetry.lock b/poetry.lock index a4aac078..3a6030c1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -38,13 +38,13 @@ docs = ["sphinx (==7.2.6)", "sphinx-mdinclude (==0.5.3)"] [[package]] name = "alembic" -version = "1.13.3" +version = "1.14.0" description = "A database migration tool for SQLAlchemy." 
optional = false python-versions = ">=3.8" files = [ - {file = "alembic-1.13.3-py3-none-any.whl", hash = "sha256:908e905976d15235fae59c9ac42c4c5b75cfcefe3d27c0fbf7ae15a37715d80e"}, - {file = "alembic-1.13.3.tar.gz", hash = "sha256:203503117415561e203aa14541740643a611f641517f0209fcae63e9fa09f1a2"}, + {file = "alembic-1.14.0-py3-none-any.whl", hash = "sha256:99bd884ca390466db5e27ffccff1d179ec5c05c965cfefc0607e69f9e411cb25"}, + {file = "alembic-1.14.0.tar.gz", hash = "sha256:b00892b53b3642d0b8dbedba234dbf1924b69be83a9a769d5a624b01094e304b"}, ] [package.dependencies] @@ -711,13 +711,13 @@ files = [ [[package]] name = "dparse" -version = "0.6.3" +version = "0.6.4" description = "A parser for Python dependency files" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "dparse-0.6.3-py3-none-any.whl", hash = "sha256:0d8fe18714056ca632d98b24fbfc4e9791d4e47065285ab486182288813a5318"}, - {file = "dparse-0.6.3.tar.gz", hash = "sha256:27bb8b4bcaefec3997697ba3f6e06b2447200ba273c0b085c3d012a04571b528"}, + {file = "dparse-0.6.4-py3-none-any.whl", hash = "sha256:fbab4d50d54d0e739fbb4dedfc3d92771003a5b9aa8545ca7a7045e3b174af57"}, + {file = "dparse-0.6.4.tar.gz", hash = "sha256:90b29c39e3edc36c6284c82c4132648eaf28a01863eb3c231c2512196132201a"}, ] [package.dependencies] @@ -725,18 +725,20 @@ packaging = "*" tomli = {version = "*", markers = "python_version < \"3.11\""} [package.extras] +all = ["pipenv", "poetry", "pyyaml"] conda = ["pyyaml"] -pipenv = ["pipenv (<=2022.12.19)"] +pipenv = ["pipenv"] +poetry = ["poetry"] [[package]] name = "et-xmlfile" -version = "1.1.0" +version = "2.0.0" description = "An implementation of lxml.xmlfile for the standard library" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, - {file = "et_xmlfile-1.1.0.tar.gz", hash = 
"sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, + {file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"}, + {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, ] [[package]] @@ -1793,13 +1795,13 @@ typing-extensions = ">=3.7.4" [[package]] name = "packaging" -version = "24.1" +version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, - {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, + {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, + {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] [[package]] @@ -2497,23 +2499,23 @@ test = ["asv", "gmpy2", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeo [[package]] name = "setuptools" -version = "75.2.0" +version = "75.3.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-75.2.0-py3-none-any.whl", hash = "sha256:a7fcb66f68b4d9e8e66b42f9876150a3371558f98fa32222ffaa5bced76406f8"}, - {file = "setuptools-75.2.0.tar.gz", hash = "sha256:753bb6ebf1f465a1912e19ed1d41f403a79173a9acf66a42e7e6aec45c3c16ec"}, + {file = "setuptools-75.3.0-py3-none-any.whl", hash = "sha256:f2504966861356aa38616760c0f66568e535562374995367b4e69c7143cf6bcd"}, + {file = "setuptools-75.3.0.tar.gz", hash = "sha256:fba5dd4d766e97be1b1681d98712680ae8f2f26d7881245f2ce9e40714f1a686"}, ] [package.extras] check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.5.2)"] -core = 
["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.collections", "jaraco.functools", "jaraco.text (>=3.7)", "more-itertools", "more-itertools (>=8.8)", "packaging", "packaging (>=24)", "platformdirs (>=2.6.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.collections", "jaraco.functools", "jaraco.text (>=3.7)", "more-itertools", "more-itertools (>=8.8)", "packaging", "packaging (>=24)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] enabler = ["pytest-enabler (>=2.2)"] -test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] -type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.11.*)", "pytest-mypy"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test (>=5.5)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy 
(==1.12.*)", "pytest-mypy"] [[package]] name = "six" @@ -2643,13 +2645,13 @@ SQLAlchemy = ">=0.9.0" [[package]] name = "starlette" -version = "0.41.0" +version = "0.41.3" description = "The little ASGI library that shines." optional = false python-versions = ">=3.8" files = [ - {file = "starlette-0.41.0-py3-none-any.whl", hash = "sha256:a0193a3c413ebc9c78bff1c3546a45bb8c8bcb4a84cae8747d650a65bd37210a"}, - {file = "starlette-0.41.0.tar.gz", hash = "sha256:39cbd8768b107d68bfe1ff1672b38a2c38b49777de46d2a592841d58e3bf7c2a"}, + {file = "starlette-0.41.3-py3-none-any.whl", hash = "sha256:44cedb2b7c77a9de33a8b74b2b90e9f50d11fcf25d8270ea525ad71a25374ff7"}, + {file = "starlette-0.41.3.tar.gz", hash = "sha256:0e4ab3d16522a255be6b28260b938eae2482f98ce5cc934cb08dce8dc3ba5835"}, ] [package.dependencies] @@ -3064,4 +3066,4 @@ pycurl = ["pycurl"] [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.12" -content-hash = "de2ec42ee0160626455b58e0b8f67e2669ad3ef5e13d58bc4500d0b8a1643cb8" +content-hash = "326ddbf7328f54b06ea3cd323053134836fb707d18599929a23a3bd2abf06ade" -- GitLab From ad5f8ed38e4dfd4b62ecc0702313e677fcf22fa5 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Thu, 16 Jan 2025 13:54:35 -0800 Subject: [PATCH 26/30] needs createParentDir set to true --- geomagio/imagcdf/ImagCDFFactory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/geomagio/imagcdf/ImagCDFFactory.py b/geomagio/imagcdf/ImagCDFFactory.py index 414aaf99..914bba78 100644 --- a/geomagio/imagcdf/ImagCDFFactory.py +++ b/geomagio/imagcdf/ImagCDFFactory.py @@ -238,7 +238,7 @@ class ImagCDFFactory(TimeseriesFactory): # Handle 'file://' output elif url.startswith("file://"): # Get the file path from the URL - url_file = Util.get_file_from_url(url, createParentDirectory=False) + url_file = Util.get_file_from_url(url, createParentDirectory=True) url_data = timeseries.slice( starttime=interval_start, endtime=interval_end, -- GitLab From 
351333dd24dd1e6a3afa5ed7c8b054c23998c206 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Thu, 16 Jan 2025 14:31:48 -0800 Subject: [PATCH 27/30] expand allowed elements - to any. units set to known, validmin/max set to default. --- geomagio/imagcdf/ImagCDFFactory.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/geomagio/imagcdf/ImagCDFFactory.py b/geomagio/imagcdf/ImagCDFFactory.py index 914bba78..b68f03d5 100644 --- a/geomagio/imagcdf/ImagCDFFactory.py +++ b/geomagio/imagcdf/ImagCDFFactory.py @@ -506,7 +506,8 @@ class ImagCDFFactory(TimeseriesFactory): ) -> dict: channel = trace.stats.channel.upper() fieldnam = f"Geomagnetic Field Element {channel}" # “Geomagnetic Field Element †+ the element code or “Temperature †+ the name of the location where the temperature was recorded. - units = "" # Must be one of “nTâ€, “Degrees of arc†or “Celsius†+ units = "" # Should be one of “nTâ€, “Degrees of arc†or “Celsius†to adhere to standard. 
+ depend_0 = "" if channel == "D": units = "Degrees of arc" validmin = -360.0 @@ -535,16 +536,22 @@ class ImagCDFFactory(TimeseriesFactory): units = "nT" validmin = -79_999.0 validmax = 79_999.0 + else: + units = "Unknown" # unknown or possibly assume to be nT + validmin = -79_999 # largest valid min for all other elements + validmax = 79_999 # largest valid max for all other elements + depend_0 = "DataTimes" # can be used for nonstandard element - # Determine DEPEND_0 based on channel type - if not isUniqueTimes: - depend_0 = "DataTimes" - elif channel in self._get_vector_elements(): - depend_0 = "GeomagneticVectorTimes" - elif channel in self._get_scalar_elements(): - depend_0 = "GeomagneticScalarTimes" - # elif channel in REAL_TEMPERATURES: - # depend_0 = f"Temperature{temperature_index}Times" + # Determine DEPEND_0 based on channel type if necessary + if not depend_0: + if not isUniqueTimes: + depend_0 = "DataTimes" + elif channel in self._get_vector_elements(): + depend_0 = "GeomagneticVectorTimes" + elif channel in self._get_scalar_elements(): + depend_0 = "GeomagneticScalarTimes" + # elif channel in REAL_TEMPERATURES: + # depend_0 = f"Temperature{temperature_index}Times" var_attrs = { "FIELDNAM": fieldnam, -- GitLab From 61f1cac360d3aea55f4720f8d180f0385ac912b4 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Fri, 17 Jan 2025 11:26:03 -0800 Subject: [PATCH 28/30] lint + rebasing --- geomagio/Controller.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/geomagio/Controller.py b/geomagio/Controller.py index d9b73750..97d46793 100644 --- a/geomagio/Controller.py +++ b/geomagio/Controller.py @@ -505,7 +505,9 @@ def get_input_factory(args): if input_type in ["netcdf", "miniseed"]: input_stream = open(args.input_file, "rb") elif input_type in ["imagcdf"]: - input_factory_args["inputFile"] = args.input_file #imagcdf file is binary but lib used accepts a file path + input_factory_args["inputFile"] = ( + 
args.input_file + ) # imagcdf file is binary but lib used accepts a file path else: input_stream = open(args.input_file, "r") elif args.input_stdin: @@ -841,7 +843,7 @@ def parse_args(args): "xml", "covjson", "netcdf", - "imagcdf" + "imagcdf", ), default="edge", help='Input format (Default "edge")', -- GitLab From f086538c963096ea320edb8dae7cc168197de4fa Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Fri, 17 Jan 2025 11:28:37 -0800 Subject: [PATCH 29/30] poetry lock --- poetry.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 3a6030c1..05a9e1d1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3066,4 +3066,4 @@ pycurl = ["pycurl"] [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.12" -content-hash = "326ddbf7328f54b06ea3cd323053134836fb707d18599929a23a3bd2abf06ade" +content-hash = "517a488d608ab52b45699d4605d4f2c4f9dd57533102a42e026049be71b9a3ce" -- GitLab From 0a91ddd97d51e92b9eac6c8ab1cc8b7069830ff5 Mon Sep 17 00:00:00 2001 From: Nicholas Shavers <nshavers@contractor.usgs.gov> Date: Fri, 17 Jan 2025 13:25:31 -0800 Subject: [PATCH 30/30] small bug fixes: dupe code line + import of DerivedTimeseriesFactory --- geomagio/Controller.py | 1 - geomagio/api/ws/data.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/geomagio/Controller.py b/geomagio/Controller.py index 97d46793..23661817 100644 --- a/geomagio/Controller.py +++ b/geomagio/Controller.py @@ -499,7 +499,6 @@ def get_input_factory(args): input_factory_args["observatory"] = args.observatory input_factory_args["type"] = args.type input_type = args.input - input_factory_args["output"] = args.output # stream/url arguments if args.input_file is not None: if input_type in ["netcdf", "miniseed"]: diff --git a/geomagio/api/ws/data.py b/geomagio/api/ws/data.py index 1eb735fd..c2bc9227 100644 --- a/geomagio/api/ws/data.py +++ b/geomagio/api/ws/data.py @@ -5,7 +5,8 @@ from fastapi import APIRouter, 
Depends, Query, Request from obspy import UTCDateTime, Stream from starlette.responses import Response -from ... import DerivedTimeseriesFactory, TimeseriesFactory, TimeseriesUtility +from ... import TimeseriesFactory, TimeseriesUtility +from ...DerivedTimeseriesFactory import DerivedTimeseriesFactory from ...edge import EdgeFactory, FDSNFactory, MiniSeedFactory from ...iaga2002 import IAGA2002Writer from ...imfjson import IMFJSONWriter -- GitLab