importRDB1.r


#' Function to return data from the NWIS RDB 1.0 format
#'
#' This function accepts a url parameter that already contains the desired
#' NWIS site, parameter code, statistic, startdate and enddate. It is not
#' recommended to use the RDB format for importing multi-site data. 
#'
#' @param obs_url character containing the url for the retrieval or a file path to the data file.
#' @param asDateTime logical, if \code{TRUE} returns date and time as POSIXct, if \code{FALSE}, Date
#' @param tz character to set timezone attribute of datetime. Default is an empty quote, which converts the 
#' datetimes to UTC (properly accounting for daylight savings times based on the data's provided tz_cd column).
#' Possible values to provide are "America/New_York","America/Chicago", "America/Denver","America/Los_Angeles",
#' "America/Anchorage","America/Honolulu","America/Jamaica","America/Managua","America/Phoenix", and "America/Metlakatla"
#' @param convertType logical, defaults to \code{TRUE}. If \code{TRUE}, the function will convert the data to dates, datetimes,
#' numerics based on a standard algorithm. If false, everything is returned as a character
#' @return A data frame with the following columns:
#' \tabular{lll}{
#' Name \tab Type \tab Description \cr
#' agency_cd \tab character \tab The NWIS code for the agency reporting the data\cr
#' site_no \tab character \tab The USGS site number \cr
#' datetime \tab POSIXct \tab The date and time of the value converted to UTC (if asDateTime = \code{TRUE}), \cr 
#' \tab character \tab or raw character string (if asDateTime = FALSE) \cr
#' tz_cd \tab character \tab The time zone code for datetime \cr
#' code \tab character \tab Any codes that qualify the corresponding value\cr
#' value \tab numeric \tab The numeric value for the parameter \cr
#' tz_cd_reported \tab The originally reported time zone \cr
#' }
#' Note that code and value are repeated for the parameters requested. The names are of the form 
#' XD_P_S, where X is literal, 
#' D is an option description of the parameter, 
#' P is the parameter code, 
#' and S is the statistic code (if applicable).
#' If a date/time (dt) column contained incomplete date and times, a new column of dates and time was inserted. This could happen
#' when older data was reported as dates, and newer data was reported as a date/time.
#' 
#' There are also several useful attributes attached to the data frame:
#' \tabular{lll}{
#' Name \tab Type \tab Description \cr
#' url \tab character \tab The url used to generate the data \cr
#' queryTime \tab POSIXct \tab The time the data was returned \cr
#' comment \tab character \tab Header comments from the RDB file \cr
#' }
#' @export
#' @import RCurl
#' @import utils
#' @import stats
#' @importFrom dplyr left_join
#' @importFrom readr read_lines
#' @importFrom readr read_delim
#' @importFrom readr problems
#' @importFrom readr parse_number
#' @examples
#' siteNumber <- "02177000"
#' startDate <- "2012-09-01"
#' endDate <- "2012-10-01"
#' offering <- "00003"
#' property <- "00060"
#' 
#' obs_url <- constructNWISURL(siteNumber,property,
#'          startDate,endDate,"dv",format="tsv")
#' \dontrun{
#' data <- importRDB1(obs_url)
#' 
#' urlMultiPcodes <- constructNWISURL("04085427",c("00060","00010"),
#'          startDate,endDate,"dv",statCd=c("00003","00001"),"tsv")
#' multiData <- importRDB1(urlMultiPcodes)
#' unitDataURL <- constructNWISURL(siteNumber,property,
#'          "2013-11-03","2013-11-03","uv",format="tsv") #includes timezone switch
#' unitData <- importRDB1(unitDataURL, asDateTime=TRUE)
#' qwURL <- constructNWISURL(c('04024430','04024000'),
#'           c('34247','30234','32104','34220'),
#'          "2010-11-03","","qw",format="rdb") 
#' qwData <- importRDB1(qwURL, asDateTime=TRUE, tz="America/Chicago")
#' iceSite <- '04024000'
#' start <- "2015-11-09"
#' end <- "2015-11-24"
#' urlIce <- constructNWISURL(iceSite,"00060",start, end,"uv",format="tsv")
#' ice <- importRDB1(urlIce, asDateTime=TRUE)
#' iceNoConvert <- importRDB1(urlIce, convertType=FALSE)
#' }
#' # User file:
#' filePath <- system.file("extdata", package="dataRetrieval")
#' fileName <- "RDB1Example.txt"
#' fullPath <- file.path(filePath, fileName)
#' importUserRDB <- importRDB1(fullPath)
#' 
importRDB1 <- function(obs_url, asDateTime=TRUE, convertType = TRUE, tz=""){
  
  if(tz != ""){
    tz <- match.arg(tz, c("America/New_York","America/Chicago",
                          "America/Denver","America/Los_Angeles",
                          "America/Anchorage","America/Honolulu",
                          "America/Jamaica","America/Managua",
                          "America/Phoenix","America/Metlakatla","UTC"))
  }
  

  if(file.exists(obs_url)){
    doc <- obs_url
  } else {
    doc <- getWebServiceData(obs_url, encoding='gzip')
    if("warn" %in% names(attr(doc,"header"))){
      data <- data.frame()
      attr(data, "header") <- attr(doc,"header")
      attr(data, "url") <- obs_url
      attr(data, "queryTime") <- Sys.time()
      
      return(data)
    }
  }
  
  readr.total <- read_lines(doc)
  total.rows <- length(readr.total)
  readr.meta <- readr.total[grep("^#", readr.total)]
  meta.rows <- length(readr.meta)
  header.names <- strsplit(readr.total[meta.rows+1],"\t")[[1]]

  if(convertType){
    readr.data <- suppressWarnings(read_delim(doc, skip = (meta.rows+2),delim="\t",col_names = FALSE))
  } else {
    readr.data <- read_delim(doc, skip = (meta.rows+2),delim="\t",col_names = FALSE, col_types = cols(.default = "c"))
  }
  
  names(readr.data) <- header.names
  
  if("site_no" %in% names(readr.data)){
    if(is.integer(readr.data$site_no)){
      readr.data.char <- read_delim(doc, skip = (meta.rows+2),delim="\t",col_names = FALSE, 
                                    col_types = cols(.default = "c"))
      names(readr.data.char) <- header.names
      readr.data$site_no <- readr.data.char$site_no
    }
  }
  
  badCols <- problems(readr.data)$col
  if(length(badCols) > 0){
    unique.bad.cols <- unique(badCols)
    
    index.col <- as.integer(gsub("X","",unique.bad.cols))
    
    if(!(all(header.names[index.col] %in% "site_no"))){
      unique.bad.cols <- unique.bad.cols[!(header.names[index.col] %in% "site_no")]
      index.col <- as.integer(gsub("X","",unique.bad.cols))
      unique.bad.cols.names <- header.names[index.col]
      if(!exists("readr.data.char")){
        readr.data.char <- read_delim(doc, skip = (meta.rows+2),delim="\t",col_names = FALSE, 
                                      col_types = cols(.default = "c"))
      }
      readr.data[,unique.bad.cols.names] <- lapply(readr.data.char[,unique.bad.cols], parse_number)
    }
  }
  
  comment(readr.data) <- readr.meta
  readr.data <- as.data.frame(readr.data)
  
  if (asDateTime & convertType){

    header.suffix <- sapply(strsplit(header.names,"_"), function(x)x[length(x)])
    header.base <- substr(header.names,1,nchar(header.names)-3)
    
    for(i in unique(header.base[header.suffix %in% c("dt","tm")])){
      
      if(all(c(paste0(i,"_dt"),paste0(i,"_tm")) %in% header.names)){
        varname <- paste0(i,"_dateTime")
        
        varval <- as.POSIXct(paste(readr.data[,paste0(i,"_dt")],readr.data[,paste0(i,"_tm")]), "%Y-%m-%d %H:%M", tz = "UTC")
        readr.data[,varname] <- varval
        
        tz.name <- paste0(i,"_time_datum_cd")
        
        if(tz.name %in% header.names){
          readr.data <- convertTZ(readr.data,tz.name,varname,tz)
        }
        
        tz.name <- paste0(i,"_tz_cd")
        
        if(tz.name %in% header.names){
          readr.data <- convertTZ(readr.data,tz.name,varname,tz)
        }
      }
    }
    
    if("tz_cd" %in% header.names){
      date.time.cols <- which(sapply(readr.data, function(x) inherits(x, "POSIXct")))
      readr.data <- convertTZ(readr.data,"tz_cd",date.time.cols,tz, flip.cols=FALSE)
    }
    
    if("sample_start_time_datum_cd" %in% header.names){
      readr.data <- convertTZ(readr.data,"sample_start_time_datum_cd","sample_dateTime",tz)
      
      if(!("sample_end_time_datum_cd" %in% header.names) & "sample_end_dateTime" %in% names(readr.data)){
        readr.data <- convertTZ(readr.data,"sample_start_time_datum_cd_reported","sample_end_dateTime",tz)
        readr.data$sample_start_time_datum_cd_reported<- readr.data$sample_start_time_datum_cd_reported_reported 
        readr.data$sample_start_time_datum_cd_reported_reported <- NULL 
      }
    }
    
  }
  
  names(readr.data)[names(readr.data) == "sample_dateTime"] <- "startDateTime"
  names(readr.data)[names(readr.data) == "sample_end_dateTime"] <- "endDateTime"
  
  if("site_no" %in% header.names){
    if(class(readr.data$site_no) != "character"){
      readr.data$site_no <- as.character(readr.data$site_no)
    }
  }
  
  row.names(readr.data) <- NULL

  names(readr.data) <- make.names(names(readr.data))
  
  attr(readr.data, "url") <- obs_url
  attr(readr.data, "queryTime") <- Sys.time()
  if(!file.exists(obs_url)){
    attr(readr.data, "header") <- attr(doc, "header")
  }
  
  return(readr.data)
  
}

convertTZ <- function(df, tz.name, date.time.cols, tz, flip.cols=TRUE){
  
  offsetLibrary <- data.frame(offset=c(5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 10, 0, 0),
                              code=c("EST","EDT","CST","CDT","MST","MDT","PST","PDT","AKST","AKDT","HAST","HST","", NA),
                              stringsAsFactors = FALSE)
  
  offset <- left_join(df[,tz.name,drop=FALSE],offsetLibrary, by=setNames("code",tz.name))
  offset <- offset$offset
  df[,paste0(tz.name,"_reported")] <- df[,tz.name,drop=FALSE]
  
  df[,date.time.cols] <- df[,date.time.cols] + offset*60*60
  df[,date.time.cols] <- as.POSIXct(df[,date.time.cols])
  
  if(tz != ""){
    attr(df[,date.time.cols], "tzone") <- tz
    df[,tz.name] <- tz
  } else {
    attr(df[,date.time.cols], "tzone") <- "UTC"
    df[!is.na(df[,date.time.cols]),tz.name] <- "UTC"
  }
  
  if(flip.cols){
    reported.col <- which(names(df) %in% paste0(tz.name,"_reported"))
    orig.col <- which(names(df) %in% tz.name)
    
    new.order <- 1:ncol(df)
    new.order[orig.col] <- reported.col
    new.order[reported.col] <- orig.col
    
    df <- df[,new.order]
  }
  return(df)
}