From e1b43dbfd18e869020e072bc8108e0c57f30f482 Mon Sep 17 00:00:00 2001 From: Laura DeCicco <ldecicco@usgs.gov> Date: Thu, 14 Mar 2013 09:27:28 -0500 Subject: [PATCH] Added more official footnotes. --- inst/doc/dataRetrieval.Rnw | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/inst/doc/dataRetrieval.Rnw b/inst/doc/dataRetrieval.Rnw index 0b2e3c64..5c4ccc9f 100644 --- a/inst/doc/dataRetrieval.Rnw +++ b/inst/doc/dataRetrieval.Rnw @@ -12,6 +12,7 @@ \usepackage[numbers, round]{natbib} \usepackage[american]{babel} \usepackage{authblk} +\usepackage{footnote} \renewcommand\Affilfont{\itshape\small} \usepackage{Sweave} \renewcommand{\topfraction}{0.85} @@ -73,7 +74,7 @@ In this section, we will run through 5 examples, documenting how to get raw data %------------------------------------------------------------ \subsection{USGS Web Retrieval Introduction} %------------------------------------------------------------ -The United States Geological Survey organizes their hydrological data in fairly standard structure. Streamgages are located throughout the United States, and each streamgage has a unique ID. Often (but not always), these ID's are 8 digits. The first step to finding data is discoving this 8-digit ID. One potential tool for discovering data is Environmental Data Discovery and Transformation (EnDDaT): \url{http://cida.usgs.gov/enddat/}. Follow the example on the EnDDaT web page to learn how to discover USGS stations and available data from any location in the United States. +The United States Geological Survey organizes their hydrological data in a standard structure. Streamgages are located throughout the United States, and each streamgage has a unique ID. Often (but not always), these IDs are 8 digits. The first step to finding data is discovering this 8-digit ID. One potential tool for discovering data is Environmental Data Discovery and Transformation (EnDDaT): \url{http://cida.usgs.gov/enddat/}. 
Follow the example on the EnDDaT web page to learn how to discover USGS stations and available data from any location in the United States. Once the site-ID is known, the next required input for USGS data retrievals is the 'parameter code'. This is a 5-digit code that specifies what measured paramater is being requested. A complete list of possible USGS parameter codes can be found at: @@ -102,7 +103,7 @@ print(data.table, caption.placement="top",include.rownames=FALSE) @ -For real-time data, the parameter code and site ID will suffice. For most variables that are measured on a continuous basis, the USGS stores the historical data as daily values. These daily values may be in the form statistics such as the daily mean values, but they can also include daily maximums, minimums or medians. These different statistics are specified by a 5-digit \texttt{"}stat code\texttt{"}. A complete list of stat codes can be found here: +For real-time data, the parameter code and site ID will suffice. For most variables that are measured on a continuous basis, the USGS stores the historical data as daily values. These daily values may be in the form of statistics such as the daily mean values, but they can also include daily maximums, minimums or medians. These different statistics are specified by a 5-digit \texttt{"}stat code\texttt{"}. A complete list of stat codes can be found here: \url{http://nwis.waterdata.usgs.gov/nwis/help/?read_file=stat&format=table} @@ -161,7 +162,7 @@ Parameter information is obtained from \url{http://nwis.waterdata.usgs.gov/nwis/ \subsection{USGS Daily Value Retrievals} \label{sec:usgsDaily} %------------------------------------------------------------ -To obtain historic daily records of USGS data, use the retrieveNWISData function. The arguments for this function are siteNumber, parameterCd, startDate, endDate, statCd, and a logical (true/false) interactive. 
There are 2 default argument: statCd defaults to \texttt{"}00003\texttt{"} and interactive defaults to TRUE. If you want to use the default values, you do not need to list them in the function call. Setting the 'interactive' option to true will walk you through the function. It might make more sense to run large batch collections with the interactive option set to FALSE. +To obtain historic daily records of USGS data, use the retrieveNWISData function. The arguments for this function are siteNumber, parameterCd, startDate, endDate, statCd, and a logical (true/false) interactive. There are 2 default arguments: statCd (defaults to \texttt{"}00003\texttt{"}), and interactive (defaults to TRUE). If you want to use the default values, you do not need to list them in the function call. Setting the 'interactive' option to true will walk you through the function. It might make more sense to run large batch collections with the interactive option set to FALSE. The dates (start and end) need to be in the format \texttt{"}YYYY-MM-DD\texttt{"}. Setting the start date to \texttt{"}\texttt{"} will indicate to the program to ask for the earliest date, setting the end date to \texttt{"}\texttt{"} will ask for the latest available date. @@ -327,7 +328,7 @@ title(ChoptankInfo$station.nm) \subsection{Other Water Quality Retrievals} \label{sec:usgsSTORET} %------------------------------------------------------------ -There are additional data sets available on the Water Quality Portal (\url{http://www.waterqualitydata.us/}). These data sets can be housed in either the STORET or NWIS database. Since STORET does not use USGS parameter codes, a 'characteristic name' must be supplied. The following example retrieves specific conductance from a DNR site in Wisconsin. +There are additional data sets available on the Water Quality Portal (\url{http://www.waterqualitydata.us/}). These data sets can be housed in either the STORET or NWIS database. 
Since STORET does not use USGS parameter codes, a \texttt{"}characteristic name\texttt{"} must be supplied. The following example retrieves specific conductance from a DNR site in Wisconsin. <<label=getQWData, echo=TRUE>>= specificCond <- getWQPData('WIDNR_WQX-10032762', @@ -346,7 +347,7 @@ In this section, we use 3 dataRetrieval functions to get sufficient data to perf %------------------------------------------------------------ \subsection{INFO Data} %------------------------------------------------------------ -The function to obtain \texttt{"}metadata\texttt{"}, data about the streamgage and measured parameters is getMetaData. This function essentially combines getSiteFileData and getParameterInfo, producing one dataframe called INFO. +The function to obtain metadata, or data about the streamgage and measured parameters, is getMetaData. This function combines getSiteFileData and getParameterInfo, producing one dataframe called INFO. <<ThirdExample>>= INFO <-getMetaData(siteNumber,parameterCd, interactive=FALSE) @@ -357,14 +358,15 @@ Column names in the INFO dataframe are listed in Appendix 2 (\ref{sec:appendix2I %------------------------------------------------------------ \subsection{Daily Data} %------------------------------------------------------------ -The function to obtain the daily values (discharge in this case) is getDVData. It requires the inputs siteNumber, ParameterCd, StartDate, EndDate, interactive, and convert. Most of these arguments are described in the previous section, however 'convert' is a new argument, the default is TRUE, and it tells the program to convert the values from cubic feet per second (cfs) to cubic meters per second (cms). For EGRET applications do not use this argument (the default is TRUE), EGRET assumes that discharge is always in cubic meters per second. If you don't want this conversion and are not using EGRET, set convert=FALSE in the function call. 
+The function to obtain the daily values (discharge in this case) is getDVData. It requires the inputs siteNumber, ParameterCd, StartDate, EndDate, interactive, and convert. Most of these arguments are described in the previous section; however, \texttt{"}convert\texttt{"} is a new argument (defaults to TRUE), and it tells the program to convert the values from cubic feet per second (cfs) to cubic meters per second (cms). For EGRET applications with NWIS web retrieval, do not use this argument (the default is TRUE); EGRET assumes that discharge is always in cubic meters per second. If you don't want this conversion and are not using EGRET, set convert=FALSE in the function call. <<firstExample>>= siteNumber <- "01491000" parameterCd <- "00631" # Nitrate startDate <- "1964-01-01" endDate <- "2013-01-01" - +# This call will get NWIS data that is in cfs, and convert it +# to cms since we didn't override the default in the convert argument: Daily <- getDVData(siteNumber, "00060", startDate, endDate,interactive=FALSE) @ @@ -384,7 +386,7 @@ print(data.table, caption.placement="top",floating="FALSE",latex.environments=NU @ \\* -The code will shift the discharge values to 0.001 times the mean if there are zero values detected in order to perform the logarithm. Columns Q7 and Q30 are 7 and 30 day running averages. +If there are discharge values of zero, the code will add a small constant to all of the daily discharges. This constant is 0.001 times the mean discharge. The code will also report on the number of zero values and the size of the constant. EGRET should only be used if the number of zero values is a very small fraction of the total days in the record (say less than 0.1\% of the days). Columns Q7 and Q30 are the 7 and 30 day running averages for the 7 or 30 days ending on this specific date.
%------------------------------------------------------------ \subsection{Sample Data} @@ -399,22 +401,22 @@ Sample <-getSampleData(siteNumber,parameterCd, Details of the Sample dataframe are listed below: <<label=colNamesQW, echo=FALSE,results=tex>>= -ColumnName <- c("Date", "ConcLow", "ConcHigh", "Uncen", "ConcAve", "Julian","Month","Day","DecYear","MonthSeq","SinDY","CosDY","Q","LogQ") +ColumnName <- c("Date", "ConcLow", "ConcHigh", "Uncen", "ConcAve", "Julian","Month","Day","DecYear","MonthSeq","SinDY","CosDY","Q footnote","LogQ footnote") Type <- c("Date", "number","number","integer","number", "number","integer","integer","number","integer","number","number","number","number") -Description <- c("Date", "Lower limit of concentration", "Upper limit of concentration", "Uncensored data (1=true, 0=false)", "Average of ConcLow and ConcHigh","Number of days since January 1, 1850", "Month of the year [1-12]", "Day of the year [1-366]", "Decimal year", "Number of months since January 1, 1850", "Sine of DecYear", "Cosine of DecYear", "Discharge **", "Natural logarithm of flow **") +Description <- c("Date", "Lower limit of concentration", "Upper limit of concentration", "Uncensored data (1=true, 0=false)", "Average of ConcLow and ConcHigh","Number of days since January 1, 1850", "Month of the year [1-12]", "Day of the year [1-366]", "Decimal year", "Number of months since January 1, 1850", "Sine of DecYear", "Cosine of DecYear", "Discharge", "Natural logarithm of flow") Units <- c("date","mg/L","mg/L","integer","mg/L","days","months","days","years","months","numeric","numeric","cms", "numeric") DF <- data.frame(ColumnName,Type,Description,Units) data.table <- xtable(DF, caption="Sample dataframe") -print(data.table, caption.placement="top",floating="FALSE",latex.environments=NULL,include.rownames=FALSE) +print(data.table, caption.placement="top",include.rownames=FALSE,table.placement="!ht", + 
sanitize.text.function=function(str)gsub("footnote","\\footnotemark[1]",str,fixed=TRUE)) @ -\\ -** Flow columns are populated from data in the Daily dataframe after calling the mergeReport function. +\footnotetext[1]{Flow columns are populated from data in the Daily dataframe after calling the mergeReport function.} +In the typical case where none of the data are censored (that is, no values are reported as \texttt{"}less-than\texttt{"} values), ConcLow = ConcHigh = ConcAve, all of which are equal to the reported value, and Uncen = 1. In the typical form of censoring, where a value is reported as less than the reporting limit, ConcLow = NA, ConcHigh = reporting limit, ConcAve = 0.5 * reporting limit, and Uncen = 0. The next section describes a more complex situation where concentrations are computed as the sum of one or more measured parameters. -In a more complex situation, the Sample data frame will combine all of the measured parameters. An example is provided to explain how the values are combined: %------------------------------------------------------------ \subsection{Complex Sample Data Example} -- GitLab