From b662348c3822d7eb19dde6e1bb2d4111190d6da8 Mon Sep 17 00:00:00 2001
From: "E. Joshua Rigler" <erigler@usgs.gov>
Date: Mon, 8 May 2023 10:20:47 -0600
Subject: [PATCH] Modify TimeseriesUtility.py's split_trace()

The split_trace() function always dropped the last sample. This was
seemingly part of flawed logic designed to prevent traces that spanned
midnight from being written to Edge by the MiniSeedInputClient.
This worked, but only if the original trace actually spanned midnight.
For the majority of data that do not span midnight, this dropped a
sample every time. The updated logic does what I believe was the
original intent (that is, it breaks traces into more manageable
chunks) without duplicating or losing any data. This is consistent
with obspy's concept of data slices, where a starttime and endtime
are always inclusive, but it deviates somewhat from MiniSeed logic,
which assumes a starttime, plus a delta, plus a number of samples.

It was necessary to update a couple of unit tests for the
MiniSeedFactory to work with this new logic, but I am certain that the
original test logic was not based on anything but what the
TimeseriesUtility.py function(s) returned when they were originally
written.
---
 geomagio/TimeseriesUtility.py          | 24 ++++++++++++++++++------
 test/edge_test/MiniSeedFactory_test.py | 16 ++++++++++++----
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/geomagio/TimeseriesUtility.py b/geomagio/TimeseriesUtility.py
index c54d74983..f22ba28d9 100644
--- a/geomagio/TimeseriesUtility.py
+++ b/geomagio/TimeseriesUtility.py
@@ -621,13 +621,25 @@ def split_trace(trace: Trace, size: int = 86400) -> Stream:
         interval_start = interval["start"]
         interval_end = interval["end"]
         delta = out_trace.stats.delta
-        # accounts for trace containing one sample
         if interval_end - delta < interval_start:
+            # trace contains one sample
             stream += out_trace
             continue
-        stream += out_trace.slice(
-            starttime=interval_start,
-            endtime=interval_end - delta,
-            nearest_sample=False,
-        )
+        if interval_end.timestamp % size:
+            # trace does NOT contain first sample in next interval
+            stream += out_trace.slice(
+                starttime=interval_start, endtime=interval_end, nearest_sample=False
+            )
+        else:
+            # trace DOES contain first sample in next interval
+            stream += out_trace.slice(
+                starttime=interval_start,
+                endtime=interval_end - delta,
+                nearest_sample=False,
+            )
+            if interval_end == out_trace.stats.endtime:
+                # ONLY if it is the last interval
+                stream += out_trace.slice(
+                    starttime=interval_end, endtime=interval_end, nearest_sample=False
+                )
     return stream
diff --git a/test/edge_test/MiniSeedFactory_test.py b/test/edge_test/MiniSeedFactory_test.py
index 4858bc475..e5d5e1734 100644
--- a/test/edge_test/MiniSeedFactory_test.py
+++ b/test/edge_test/MiniSeedFactory_test.py
@@ -151,13 +151,19 @@ def test__pre_process():
     """edge_test.MiniSeedFactory_test.test__pre_process()"""
     trace = __create_trace(numpy.arange((86400 * 2) + 1), channel="H")
     processed = MiniSeedInputClient(host=None)._pre_process(stream=Stream(trace))
-    assert len(processed) == 2
-    for trace in processed:
+    assert len(processed) == 3
+    for trace in processed[0:2]:
         assert trace.data.dtype == "float32"
         stats = trace.stats
         assert stats.npts == 86400
         assert stats.starttime.timestamp % 86400 == 0
         assert stats.endtime.timestamp % 86400 != 0
+    for trace in processed[-1:]:
+        assert trace.data.dtype == "float32"
+        stats = trace.stats
+        assert stats.npts == 1
+        assert stats.starttime.timestamp % 86400 == 0
+        assert stats.starttime == stats.endtime
 
 
 def test__format_miniseed():
@@ -168,8 +174,10 @@ def test__format_miniseed():
     block_size = 512
     data = buf.getvalue()
     n_blocks = int(len(data) / block_size)
-    assert n_blocks == 1516
-    # 759th block is start of second day(758 blocks per day for 1Hz data)
+    assert n_blocks == 1517
+    # 759th block is start of second day
+    # (758 blocks per day for 1Hz data, which implies 56-byte,
+    #  NOT 64-byte, MiniSeed headers...something to investigate)
     block_start = 758 * block_size
     block = data[block_start : block_start + block_size]
     out_stream = read(io.BytesIO(block))
-- 
GitLab