From dd71bde9b0a7eb7dce534cba7180899cbf50b02e Mon Sep 17 00:00:00 2001
From: "E. Joshua Rigler" <erigler@usgs.gov>
Date: Tue, 28 May 2024 14:10:01 -0600
Subject: [PATCH] Pull inputs in run(), not run_as_update()

For years, when invoking geomag-algorithms' update mechanism, inputs
were pulled and checked from the run_as_update() method inside the
Controller class. I never understood this. It seemed to break the
logic of the update mechanism if no inputs were available for the
current interval/gap. This might even be the source of noted issues
where running back-filling scripts didn't behave as expected, and
those scripts needed to be run multiple times.

What's more, the run_as_update() method logically seems like the
most appropriate place to read *outputs* and check for gaps. Whereas
the run() method seems like the most appropriate place to read
inputs, apply algorithms, and write out outputs.

In any case, this change should not break any existing code. It
should only allow the update mechanism to complete, every time, and
as originally intended, rather than be short-circuited when input
data are missing for the current interval (but might be available
for previous intervals).
---
 geomagio/Controller.py | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/geomagio/Controller.py b/geomagio/Controller.py
index 9d50ceb2..d9feb299 100644
--- a/geomagio/Controller.py
+++ b/geomagio/Controller.py
@@ -285,8 +285,12 @@ class Controller(object):
             channels=input_channels,
             interval=input_interval,
         )
-        if timeseries.count() == 0:
-            # no data to process
+        if not algorithm.can_produce_data(
+            starttime=timeseries[0].stats.starttime,
+            endtime=timeseries[0].stats.endtime,
+            stream=timeseries,
+        ):
+            # don't process if nothing will be produced
             return
         # pre-process
         if next_starttime and realtime:
@@ -416,18 +420,6 @@ class Controller(object):
                 ]
             ]
         for output_gap in output_gaps:
-            input_timeseries = self._get_input_timeseries(
-                algorithm=algorithm,
-                observatory=observatory,
-                starttime=output_gap[0],
-                endtime=output_gap[1],
-                channels=input_channels,
-                interval=input_interval,
-            )
-            if not algorithm.can_produce_data(
-                starttime=output_gap[0], endtime=output_gap[1], stream=input_timeseries
-            ):
-                continue
             # check for fillable gap at start
             if output_gap[0] == starttime:
                 # found fillable gap at start, recurse to previous interval
@@ -469,7 +461,6 @@ class Controller(object):
                 starttime=gap_starttime,
                 endtime=gap_endtime,
                 input_channels=input_channels,
-                input_timeseries=input_timeseries,
                 output_channels=output_channels,
                 input_interval=input_interval,
                 output_interval=output_interval,
-- 
GitLab