Skip to content
Snippets Groups Projects
2_process.R 14 KiB
Newer Older
  • Learn to ignore specific revisions
  • Azadpour, Elmera's avatar
    Azadpour, Elmera committed
    source('2_process/src/data_utils.R')
    
    Azadpour, Elmera's avatar
    Azadpour, Elmera committed
      # Confirming raw data matches `p1_unc_stats` from SB
      tar_target(p2_unc_agg_summary,
                 # Determinant-level totals: within each dimension/determinant
                 # pair, sum every column whose name contains 'related',
                 # 'unknown', 'significant' or 'direction'. across() names each
                 # result `<column>_total`.
                 p1_unc_stats |>
                   group_by(dimension, determinant) |>
                   summarize(across(c(contains('related'),
                                      contains('unknown'),
                                      contains('significant'),
                                      contains('direction')),
                                    list(total = ~sum(.x, na.rm = TRUE)))) |>
                   # evidence_val is the sum of the four relationship totals;
                   # downstream targets bin it into Small/Medium/Large.
                   mutate(evidence_val = pos_related_total + neg_related_total +
                            unrelated_total + unk_direction_total)
      ),
    
      tar_target(p2_unc_agg_ind_summary,
                 # Indicator-level totals: same aggregation as the determinant
                 # summary, but with `indicator` as an extra grouping key.
                 p1_unc_stats |>
                   group_by(dimension, determinant, indicator) |>
                   summarise(
                     across(
                       .cols = c(contains("related"), contains("unknown"),
                                 contains("significant"), contains("direction")),
                       # list name `total` becomes the `_total` column suffix
                       .fns = list(total = \(x) sum(x, na.rm = TRUE))
                     )
                   ) |>
                   mutate(
                     # sum of the four relationship totals
                     evidence_val = pos_related_total + neg_related_total +
                       unrelated_total + unk_direction_total
                   )
      ),
    
    Azadpour, Elmera's avatar
    Azadpour, Elmera committed
      # Based on metadata:
      # Amt of evidence: Small = total_studies < 5; Medium = total_studies 5-9; Large = total_studies > 9
      # Amt of agreement: Low = < 50% of models; Medium = >50% & <74% of models; High = >74% of models; NA if the level of agreement could not be calculated as indicator was measured only once.
    
      # Dimension and determinant level 
    
      tar_target(p2_top_trend_stats,
                 # For every dimension/determinant row, record the largest of
                 # the four relationship totals as `sig_value`. It is a count;
                 # the percentage of agreement is derived from it downstream.
                 p2_unc_agg_summary |>
                   dplyr::select(dimension, determinant,
                                 pos_related_total, neg_related_total,
                                 unrelated_total, unk_direction_total) |>
                   group_by(determinant) |>
                   # pmax() works element-wise, so this is a row-wise maximum
                   # across the four total columns.
                   mutate(sig_value = pmax(pos_related_total, neg_related_total,
                                           unrelated_total, unk_direction_total))
      ),
        
    
      tar_target(p2_top_trend_ind_stats,
                 # Per dimension/determinant/indicator, keep the relationship
                 # category (sig_name) with the largest total (sig_value).
                 p2_unc_agg_ind_summary |>
                   dplyr::select(dimension, determinant, indicator,
                                 pos_related_total, neg_related_total,
                                 unrelated_total, unk_direction_total) |>
                   # Stack the four totals into name/value pairs, naming the
                   # output columns directly instead of renaming afterwards.
                   pivot_longer(cols = !c(dimension, determinant, indicator),
                                names_to = "sig_name",
                                values_to = "sig_value") |>
                   group_by(dimension, determinant, indicator) |>
                   # Ties are kept (slice_max default), so a group can return
                   # more than one row.
                   slice_max(order_by = sig_value, n = 1)
      ),
    # Join `p2_unc_agg_summary` to top trends to get percentages of agreement and evidence for determinant and nested dimension
    
      tar_target(p2_unc_agg_summary_csv,
                 # Join the determinant totals to their top-trend value, derive
                 # the agreement percentage and the evidence/agreement bins, and
                 # write the table to public/determinant_uncertainty.csv.
                 p2_unc_agg_summary |>
                   # No `by` given: natural join on every column the two
                   # tables share.
                   left_join(p2_top_trend_stats) |>
                   # level of agreement is the max percent of studies in agreement
                   dplyr::mutate(level_agreement = 100*(sig_value/evidence_val),
                                 evidence_bin = case_when(
                                   evidence_val < 5 ~ "Small",
                                   between(evidence_val, 5, 9) ~ "Medium",
                                   evidence_val >= 10 ~ "Large"),
                                 # Medium spans 50-74 inclusive so that no
                                 # percentage between the Low and High cut-offs
                                 # (e.g. 50 or 50.5) falls through to NA.
                                 # A missing/NaN level_agreement still yields NA.
                                 agreement_bin = case_when(
                                   level_agreement < 50 ~ "Low",
                                   level_agreement <= 74 ~ "Medium",
                                   level_agreement > 74 ~ "High")) |>
                   # distinct(determinant, .keep_all = TRUE) |> 
                   readr::write_csv('public/determinant_uncertainty.csv')
                 ),
    
    # commented out for now so we don't overwrite spanish names
    #tar_target(p2_unc_determinant_json,
    #           read_csv(p2_unc_agg_summary_csv) |>
    #             toJSON(pretty = TRUE) |>
    #             write("public/determinant_uncertainty.json")
    #           ),
    
    tar_target(p2_unc_agg_summary_ind_csv,
               # Indicator-level counterpart of the determinant CSV: join the
               # totals to the top trend, derive agreement and evidence bins,
               # and write public/indicator_uncertainty.csv.
               p2_unc_agg_ind_summary |>
                 # No `by` given: natural join on every shared column.
                 left_join(p2_top_trend_ind_stats) |>
                 # level of agreement is the max percent of studies in agreement
                 dplyr::mutate(level_agreement = 100*(sig_value/evidence_val),
                               evidence_bin = case_when(
                                 evidence_val < 5 ~ "Small",
                                 between(evidence_val, 5, 9) ~ "Medium",
                                 evidence_val >= 10 ~ "Large"),
                               # Medium spans 50-74 inclusive so values such as
                               # 50 or 50.5 are binned instead of becoming NA.
                               agreement_bin = case_when(
                                 level_agreement < 50 ~ "Low",
                                 level_agreement <= 74 ~ "Medium",
                                 level_agreement > 74 ~ "High")) |>
                 # Tied top trends can produce several joined rows per
                 # indicator; keep one.
                 distinct(indicator, .keep_all = TRUE) |>
                 # NOTE(review): agreement_bin is computed above but not
                 # selected here, so it is absent from the CSV - confirm intent.
                 dplyr::select(dimension, determinant, indicator, evidence_val, evidence_bin, level_agreement) |> 
                 readr::write_csv('public/indicator_uncertainty.csv')
               ),
    
    Cee Nell's avatar
    Cee Nell committed
      tar_target(p2_indicators,
                 # Unique dimension/determinant/indicator combinations present
                 # in the raw uncertainty statistics.
                 p1_unc_stats |>
                   distinct(dimension, determinant, indicator)
                 ),
      # Process census data for variables of interest

      # B01003_001 = Total Population
      # B19013_001 = Median Household Income in the Past 12 Months (in 2022 Inflation-Adjusted Dollars)
      # B02001_003 = Estimate!!Total:!!Black or African American alone
      # B03001_003 = Estimate!!Total:!!Hispanic or Latino:
      # B01001_002 = Estimate!!Total:!!Male:
      # B01001_026 = Estimate!!Total:!!Female:

      # ACS 5-year variable codes requested below (target name taken from the
      # `variable =` argument of the data target that consumes it).
      tar_target(p2_census_acs5_layers,
                 list("B01003_001", "B19013_001", "B02001_003",
                      "B03001_003", "B01001_002", "B01001_026")
                 ),

      # County-level ACS data for the layers above (target name taken from the
      # downstream `p2_perc_census_acs5_layers_sf` target that maps over it).
      tar_target(p2_census_acs5_data,
                 get_census_data(geography = 'county', 
                                 variable = p2_census_acs5_layers,
                                 states = p1_census_states, 
                                 year = 2022, 
                                 proj = p1_proj, 
                                 survey_var = "acs5",  
                                 percent_rename = FALSE),
                 # NOTE(review): the other *_data targets in this file pair
                 # iteration = "list" with pattern = map(<layers target>);
                 # no pattern is visible here - confirm whether
                 # pattern = map(p2_census_acs5_layers) is intended.
                 iteration = "list"
                 ),
    
    Azadpour, Elmera's avatar
    Azadpour, Elmera committed
      tar_target(p2_tot_pop,
    
                   st_drop_geometry() |>
    
    Azadpour, Elmera's avatar
    Azadpour, Elmera committed
                   rename(tot_pop = estimate)),
    
      # Add % of total population col to each census layer
    
      tar_target(p2_perc_census_acs5_layers_sf,
                 # One branch per element of p2_census_acs5_data; each branch
                 # is passed to process_perc() together with the total
                 # population table.
                 process_perc(tot_pop = p2_tot_pop,
                              tot_var = p2_census_acs5_data),
                 iteration = "list",
                 pattern = map(p2_census_acs5_data)),
    # Disaggregated census data
    #  The subject tables include the following geographies: nation, all states (including DC and Puerto Rico), all metropolitan areas, all congressional districts, all counties, all places and all tracts. Subject tables provide an overview of the estimates available in a particular topic. The data are presented as both counts and percentages. There are over 66,000 variables in this dataset.
    # More info here: https://api.census.gov/data/2019/acs/acs5.html 
    # load_variables(2022, "acs5/subject", cache = TRUE)
    # Age related variables 
    # S0101_C02_022 = Estimate!!Percent!!Total population!!SELECTED AGE CATEGORIES!!Under 18 years
    # S0101_C02_023 = Estimate!!Percent!!Total population!!SELECTED AGE CATEGORIES!!18 to 24 years
    # S0101_C02_024 = Estimate!!Percent!!Total population!!SELECTED AGE CATEGORIES!!15 to 44 years
    # S0101_C02_028 = Estimate!!Percent!!Total population!!SELECTED AGE CATEGORIES!!60 years and over
    
    # Age-related subject-table variable codes.
    tar_target(p2_census_acs5sub_age_layers,
               c("S0101_C02_022", "S0101_C02_023",
                 "S0101_C02_024", "S0101_C02_028")),
    # County-level data for each age variable, one dynamic branch per code.
    tar_target(p2_census_acs5sub_age_data,
               get_census_data(variable = p2_census_acs5sub_age_layers,
                               geography = "county",
                               states = p1_census_states,
                               survey_var = "acs5",
                               year = 2022,
                               proj = p1_proj,
                               percent_rename = TRUE),
               iteration = "list",
               pattern = map(p2_census_acs5sub_age_layers)),
    # income related variables 
    # S1901_C01_014 = Estimate!!Households!!PERCENT ALLOCATED!!Household income in the past 12 months
    tar_target(p2_census_acs5sub_income_layers,
               # Single income variable code; a bare string is the same
               # length-one character vector as c("...").
               "S1901_C01_014"),
    
               get_census_data(geography = 'county', 
                               variable = p2_census_acs5sub_income_layers,
                               states = p1_census_states, 
                               year = 2022, 
                               proj = p1_proj, 
                               survey_var = "acs5", 
                               percent_rename = TRUE),
    
               pattern = map(p2_census_acs5sub_income_layers),
               iteration = "list"),
    # education related variables 
    # S1501_C01_003 = Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 18 to 24 years!!High school graduate (includes equivalency)
    # S1501_C01_009 = Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over!!High school graduate (includes equivalency)
    # Education-related subject-table variable codes.
    tar_target(p2_census_acs5sub_education_layers,
               c("S1501_C01_003",
                 "S1501_C01_009")),
    # County-level data for the education variables.
    # NOTE(review): unlike the sibling *_data targets, no
    # pattern = map(p2_census_acs5sub_education_layers) is given, so both codes
    # are passed to get_census_data() in one call - confirm this is intended.
    tar_target(p2_census_acs5sub_education_data,
               get_census_data(variable = p2_census_acs5sub_education_layers,
                               geography = "county",
                               states = p1_census_states,
                               survey_var = "acs5",
                               year = 2022,
                               proj = p1_proj,
                               percent_rename = FALSE),
               iteration = "list"),
    
    
    # household and rent related variables
    # B25010_001 = Estimate!!Average household size --!!Total:Average Household Size of Occupied Housing Units by Tenure
    
    # B25064_001 = Estimate!!Median gross rent
    # Household size and median gross rent variable codes.
    tar_target(p2_census_acs5_household_layers,
               c("B25010_001",
                 "B25064_001")),

    # County-level data for each household variable, one branch per code.
    tar_target(p2_census_acs5sub_household_data,
               get_census_data(variable = p2_census_acs5_household_layers,
                               geography = "county",
                               states = p1_census_states,
                               survey_var = "acs5",
                               year = 2022,
                               proj = p1_proj,
                               percent_rename = FALSE),
               iteration = "list",
               pattern = map(p2_census_acs5_household_layers)),
    
    # percent households variable
    # DP04_0002P = Percent!!HOUSING OCCUPANCY!!Total housing units!!Occupied housing units
    # this does not have geometry, so we will join using tigris::counties() 
    # Data-profile variable code for percent occupied housing units.
    tar_target(p2_census_acs5profile_household_layers,
               "DP04_0002P"),
    # Tabular county-level ACS values, restricted to the states of interest.
    tar_target(p2_census_acs5profile_household_data,
               get_acs(variables = p2_census_acs5profile_household_layers,
                       geography = "county",
                       survey = "acs5",
                       year = 2022) |>
                 # Strip everything up to the last ", " in NAME, leaving the
                 # trailing state portion.
                 mutate(state_name = sub(".*, ", "", NAME)) |>
                 # presumably p1_census_states holds full state names in the
                 # same form as NAME - verify against its definition
                 filter(state_name %in% p1_census_states)),
    # County cartographic-boundary polygons, reprojected and simplified, used
    # to attach geometry to the tabular household data.
    tar_target(p2_counties_sf,
               # Pin the boundary vintage to the 2022 ACS year used throughout
               # this file; tigris' default year varies by package version and
               # county GEOIDs can differ between vintages, which would
               # silently drop rows in the GEOID join downstream.
               tigris::counties(cb = TRUE, year = 2022) |> 
                 st_transform(crs = p1_proj) |> 
                 # keep = 0.2 is the proportion of vertices to retain
                 rmapshaper::ms_simplify(keep = 0.2)),
    # Join counties spatial to households dataframe
    tar_target(p2_census_acs5profile_household_sf,
               # Keep only counties present in both tables; the county
               # polygons are the left-hand side of the join.
               inner_join(p2_counties_sf,
                          p2_census_acs5profile_household_data,
                          by = "GEOID")),
    
    # Median Household Income in the Past 12 Months (in 2022 Inflation-Adjusted Dollars) for white only, Black or African American Alone, American Indian and Alaska Native Alone, Asian Alone, Native Hawaiian and Other Pacific Islander Alone, Hispanic or Latino
    # Median household income variable codes, one per race/ethnicity group.
    tar_target(p2_census_acs5_income_by_race_layers,
               c("B19013A_001", "B19013B_001", "B19013C_001",
                 "B19013D_001", "B19013E_001", "B19013I_001")),
    # County-level data for each income-by-race variable, one branch per code.
    tar_target(p2_census_acs5sub_income_by_race_data,
               get_census_data(geography = 'county', 
                               variable = p2_census_acs5_income_by_race_layers,
                               states = p1_census_states, 
                               year = 2022, 
                               proj = p1_proj, 
                               survey_var = "acs5", 
                               percent_rename = FALSE),
               pattern = map(p2_census_acs5_income_by_race_layers),
               iteration = "list"),
    # Disability status
    # S1810_C03_001: Estimate!!Percent with a disability!!Total civilian noninstitutionalized population
    # S1810_C02_001: Estimate!!With a disability!!Total civilian noninstitutionalized population
    # Disability status variable codes (percent and count).
    tar_target(p2_census_acs5_disability_layers,
               c("S1810_C03_001", "S1810_C02_001")),
    # County-level data for each disability variable, one branch per code.
    tar_target(p2_census_acs5sub_disability_data,
               get_census_data(geography = 'county', 
                               variable = p2_census_acs5_disability_layers,
                               states = p1_census_states, 
                               year = 2022, 
                               proj = p1_proj, 
                               survey_var = "acs5", 
                               percent_rename = FALSE),
               pattern = map(p2_census_acs5_disability_layers),
               iteration = "list"),
    # process population density raster data
    # CONUS polygons as returned by the project helper.
    tar_target(
      p2_conus_sf,
      fetch_conus_sf()
    ),
    # Same polygons reprojected to the pipeline projection.
    tar_target(
      p2_conus_sf_proj,
      st_transform(p2_conus_sf, p1_proj)
    ),
    # Inner (shared) boundary lines of the projected polygons.
    tar_target(
      p2_conus_inner,
      rmapshaper::ms_innerlines(p2_conus_sf_proj)
    ),
    tar_target(
      p2_pop_density_processed,
      # Both the unprojected and projected CONUS polygons are passed through;
      # a `proj = p1_proj` argument was previously commented out of this call.
      process_pop_dens_raster(
        in_raster = p1_pop_density_raster_tif,
        conus = p2_conus_sf,
        conus_proj = p2_conus_sf_proj,
        outfile_path = "2_process/out/pop_density.tif"
      ),
      # presumably the helper returns outfile_path, which a file-format
      # target needs - verify in data_utils.R
      format = "file"
    ),
    
    # process impervious surfaces raster data
    tar_target(
      p2_imp_surf_processed,
      process_imp_surf(
        in_raster = p1_imp_surf_tif,
        conus_proj = p2_conus_sf_proj,
        outfile_path = "2_process/out/imp_surfaces.tif"
      ),
      # presumably the helper returns outfile_path, which a file-format
      # target needs - verify in data_utils.R
      format = "file"
    )