# Load packages
library(tidyverse)
library(here)
library(lmerTest)
library(ggplot2)

Part A:

  1. Filter for relevant columns
  2. Filter for only NG firms
  3. Aggregate 2014-2017 data

Note: This is the final consolidated version after data probing has been completed; original exploratory code can also be provided.

# lists needed for filtering
months <- c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")
columns_needed <- c("Plant Id","Plant State", "YEAR", "Reported\nFuel Type Code", paste0("Netgen\n", months), paste0("Elec_MMBtu\n", months))

# lists needed for looping
years <- 2014:2017
temp_loop <- list()

# data aggregation loop
for (i in years) {
  file <- here("data", "powerplant", paste0(i, ".csv"))
  temp <- read.csv(file)
  
# use row 5 as column names, then drop top 5 rows
  colnames(temp) <- as.character(temp[5, ])
  temp <- temp[-(1:5), ]

# for each iteration of the loop,
  temp <- temp %>%
    select(all_of(columns_needed)) %>% # select only columns needed
    rename(Year = YEAR) %>% # rename YEAR to Year for... cleanliness?
    mutate(Year = i) %>% # raw data has some empty Year fields; ensures all Year = file name
    filter(`Reported\nFuel Type Code` == "NG") # filter for only NG firms

# adds current iteration of temp to the list position of the length of the list temp_loop +1
  temp_loop[[length(temp_loop) + 1]] <- temp 
}

# binds all rows in temp_loop into one aggregated data file
powerplant_data <- bind_rows(temp_loop)

Part B:

  1. Handle “.”
  2. Calculate heat rate
  3. Filter for the designated 7 states
# "." means missing, not 0; economically, plants could report . if data wasn't collected
# we cannot reasonably gauge these missing values, so let's mark them as NA

# replace "." with NA for any fields within the powerplant_data dataframe
powerplant_data[powerplant_data == "."] <- NA

# however, we don't want to filter them out completely since some months may be missing while other months have valuable data
# we proceed by using na.rm = TRUE to bypass the NA issue

# original data had data stored as character values with commas
# parse_number from the readr package turns chr values into dbl while ignoring all non-numerical values
# we only want to parse character values in electricity and net generation columns
powerplant_data <- powerplant_data %>%
  mutate(
# across() will look for columns starting with "Netgen" or "Elec" and parse
    across((starts_with("Netgen") | starts_with("Elec")), 
           parse_number)
  )

# originally, I went ahead and calculated heat rate by dividing the Quantity Consumed for Electricity (MMBtu) column by the Electricity Net Generation (MWh) column by month at this point

# however, later in the assignment I see that some duplicated rows are meaningful and should be summed as this assignment focuses on the firm level - therefore, I will sum them here first before calculating the heat rate.

powerplant_data <- powerplant_data %>%
  group_by(`Plant Id`, `Plant State`, Year) %>%
  summarise(
    across(starts_with("Netgen"), sum, na.rm = TRUE),
    across(starts_with("Elec"), sum, na.rm = TRUE),
    .groups = "drop"
  )

# in general, heatrate is not meaningful if netgeneration is negative; while a negative net generation can signal inefficiencies, it likely reflects a plant that is not operational. we filter out negative net generation before calculating heatrate.
# ~ = function, replace(. = placeholder for the values we are parsing, if that value is less than 0, NA)

powerplant_data <- powerplant_data %>%
  mutate(
    across(starts_with("Netgen"), ~ replace(., . < 0, NA))
    )

# the data contains 0s which will result in INF, -INF, NaN, or NA
# need to use safe divide to mitigate this

# AI Disclaimer: AI was used to type and format the code here following my prompt:
# Please expand and format my code across all 12 months (use full name) within mutate() in R: "HeatRate_month = ifelse(`Netgen\nmonth` == 0, NA, `Elec_MMBtu\nmonth` / `Netgen\nmonth`)"

heatrate <- powerplant_data %>%
  mutate(
    HeatRate_January   = ifelse(`Netgen\nJanuary`   == 0, NA, `Elec_MMBtu\nJanuary`   / `Netgen\nJanuary`),
    HeatRate_February  = ifelse(`Netgen\nFebruary`  == 0, NA, `Elec_MMBtu\nFebruary`  / `Netgen\nFebruary`),
    HeatRate_March     = ifelse(`Netgen\nMarch`     == 0, NA, `Elec_MMBtu\nMarch`     / `Netgen\nMarch`),
    HeatRate_April     = ifelse(`Netgen\nApril`     == 0, NA, `Elec_MMBtu\nApril`     / `Netgen\nApril`),
    HeatRate_May       = ifelse(`Netgen\nMay`       == 0, NA, `Elec_MMBtu\nMay`       / `Netgen\nMay`),
    HeatRate_June      = ifelse(`Netgen\nJune`      == 0, NA, `Elec_MMBtu\nJune`      / `Netgen\nJune`),
    HeatRate_July      = ifelse(`Netgen\nJuly`      == 0, NA, `Elec_MMBtu\nJuly`      / `Netgen\nJuly`),
    HeatRate_August    = ifelse(`Netgen\nAugust`    == 0, NA, `Elec_MMBtu\nAugust`    / `Netgen\nAugust`),
    HeatRate_September = ifelse(`Netgen\nSeptember` == 0, NA, `Elec_MMBtu\nSeptember` / `Netgen\nSeptember`),
    HeatRate_October   = ifelse(`Netgen\nOctober`   == 0, NA, `Elec_MMBtu\nOctober`   / `Netgen\nOctober`),
    HeatRate_November  = ifelse(`Netgen\nNovember`  == 0, NA, `Elec_MMBtu\nNovember`  / `Netgen\nNovember`),
    HeatRate_December  = ifelse(`Netgen\nDecember`  == 0, NA, `Elec_MMBtu\nDecember`  / `Netgen\nDecember`)
  )

# create a new simplified dataframe by selecting:
# Plant Id, Plant State, Year, Net Generation, and Heat Rate columns.

simplified_df <- heatrate %>%
  select(
    "Plant Id","Plant State", "Year", paste0("Netgen\n", months), paste0("HeatRate_", months) 
  )

# filter for TX, WA, MA, CA, AZ, KS (only 6 states here) added NY
output_B <- simplified_df %>%
  filter(`Plant State` %in% c("TX", "WA", "MA", "CA", "AZ", "KS", "NY"))
# this is a check to see if any negative heatrates exist in the data

heatrate_check <- output_B %>%
  summarise(
    total = sum(!is.na(c_across(starts_with("HeatRate")))), # sum all non-empty rows with columns starting with HeatRate
    negatives = sum(c_across(starts_with("HeatRate")) < 0, na.rm = TRUE),
    positives = sum(c_across(starts_with("HeatRate")) >= 0, na.rm = TRUE)
  ) %>%
  mutate(percent_negative = 100 * negatives / total)

heatrate_check
## # A tibble: 1 × 4
##   total negatives positives percent_negative
##   <int>     <int>     <int>            <dbl>
## 1 33775         0     33775                0

Part C:

  1. antijoin excluded.csv
  2. filter out very small (NetGen < 0.5 million MWH) and very large firms (>5 million MWH)
  3. filter out heat rate < 6 for large modern plant
  4. filter out heat rate > 16 for older plant
  5. group data by Plant Id, Plant_State , Year using the max function within summarise() for this part, I opted to sum instead - see reasoning in the code below.
  6. group the data by state, and then average the calculated heat rate across months, years and firms for one entry per state.
# start with importing excludeDF.csv and renaming its columns to match final_B
excludeDF <- read.csv(here("data", "excludeDF.csv")) %>%
  rename(`Plant Id` = plant_id) %>%
  rename(`Plant State` = plant_state) %>%
  rename(Year = year)

# antijoin excludeDF from final_B
excluded_B <- anti_join(
  # the two `Plant Id` types didn't match, so we convert both to chr
  output_B %>% mutate(`Plant Id` = as.character(`Plant Id`)), 
  excludeDF %>% mutate(`Plant Id` = as.character(`Plant Id`)), 
  by = c("Plant Id", "Plant State", "Year")
)

# reasoning for why I summed the data earlier on in the workflow is listed below:

# first, we need to deal with multiple entries per firm before filtering netgen sums
# looking further into the data, some firms have multiple rows due to differing prime movers
# most firms with a second row containing very small data have the same prime mover
# for accuracy sake, I opt to summarize using sum rather than max

# filter out very small (NetGen < 0.5 million MWH) and very large firms (>5 million MWH)
kept_netgen <- excluded_B %>%
  rowwise() %>%
  mutate(Netgen_Sum = sum(c_across(starts_with("Netgen")), na.rm = TRUE)) %>%
  ungroup() %>%
  filter(Netgen_Sum >= 500000 & Netgen_Sum <= 5000000)

# filter out heat rate violating < 6 > 16 as a whole; no need to identify which plants
kept_heatrate <- kept_netgen %>%
  rowwise() %>% # calculate average heat rate per row
  mutate(Mean_HeatRate = mean(c_across(starts_with("HeatRate")), na.rm = TRUE)) %>%
  ungroup() %>%
  filter(Mean_HeatRate >= 6 & Mean_HeatRate <= 16)

# group the data by state, and then average the calculated heat rate across months, years and firms. 
# we already calculated the monthly average per year per firm
# step 1: calculate the heat rate across years per firm
firm_mean <- kept_heatrate %>%
  group_by(`Plant Id`, `Plant State`) %>%
  summarise(
    Firm_Mean_HeatRate = mean(Mean_HeatRate, na.rm = TRUE),
    .groups = "drop"
  )

# step 2: average across all firms per state
state_mean <- firm_mean %>%
  group_by(`Plant State`) %>%
  summarise(
    State_Mean_HeatRate = mean(Firm_Mean_HeatRate, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  rename(State = `Plant State`) %>%
  rename(HeatRate = `State_Mean_HeatRate`)

state_mean
## # A tibble: 7 × 2
##   State HeatRate
##   <chr>    <dbl>
## 1 AZ        8.03
## 2 CA        8.34
## 3 KS        8.84
## 4 MA        7.97
## 5 NY        8.78
## 6 TX        9.24
## 7 WA        7.73

Part D:

  1. download EIA files containing daily wholesale electricity & natural gas prices for 2014-2017
  2. import and create two dfs
  3. filter for the states included in the analysis
  4. join the two dfs using a dictionary
  5. clean the dfs’ duplicate entries
  6. left_join with state_mean for final analysis
# electricity data
# load in data and bind into one large df
electricity_2014 <- read.csv(here("data", "electricity", "2014_electricity.csv"))
electricity_2015 <- read.csv(here("data", "electricity", "2015_electricity.csv"))
electricity_2016 <- read.csv(here("data", "electricity", "2016_electricity.csv"))
electricity_2017 <- read.csv(here("data", "electricity", "2017_electricity.csv"))

electricity_price <- rbind(electricity_2014, electricity_2015, electricity_2016, electricity_2017)

# rename columns, select only needed columns
electricity_price <- electricity_price %>%
  rename(Weighted_Avg_Price = Wtd.avg.price...MWh) %>%
  rename(Date = Trade.date) %>%
  rename(Hub = Price.hub) %>%
  select(
    Hub, Date, Weighted_Avg_Price, Delivery.start.date #using delivery.start.date to check date format errors
  )

# date format correction
electricity_price <- electricity_price %>%
  mutate(Date = case_when(
    Date == "01-12-17" ~ "2017-01-12", 
    Date == "12-01-14" ~ "2014-12-01",
    TRUE ~ Date
  )) %>%
  select(-Delivery.start.date)

# natural gas data
# load in data and bind into one large df
ng_2014 <- read.csv(here("data", "ng", "2014_ng.csv"))
ng_2015 <- read.csv(here("data", "ng", "2015_ng.csv"))
ng_2016 <- read.csv(here("data", "ng", "2016_ng.csv"))
ng_2017 <- read.csv(here("data", "ng", "2017_ng.csv"))

ng_price <- rbind(ng_2014, ng_2015, ng_2016, ng_2017)

# rename columns, select only needed columns
ng_price <- ng_price %>%
  rename(Weighted_Avg_Price = Wtd.avg.price...MMBtu) %>%
  rename(Date = Trade.date) %>%
  rename(Hub = Price.hub) %>%
  select(
    Hub, Date, Weighted_Avg_Price
  )

# drop two NA rows (note that 2014 data starts in March)
ng_price <- ng_price[complete.cases(ng_price), ]

# create a dictionary to look up state vs hub
state_hub <- c(
  "ERCOT North 345KV Peak" = "TX",
  "Henry" = "TX",
  "Indiana Hub RT Peak" = "KS",
  "Chicago Citygates" = "KS",
  "Mid C Peak" = "WA",
  "Malin" = "WA",
  "Nepool MH DA LMP Peak" = "MA",
  "Algonquin Citygates" = "MA",
  "NP15 EZ Gen DA LMP Peak" = "CA",
  "PG&E - Citygate" = "CA",
  "Palo Verde Peak" = "AZ",
  "Socal-Ehrenberg" = "AZ",
  "PJM WH Real Time Peak" = "NY",
  "TETCO-M3" = "NY"
)

# electricity: add a column for hubs that matches the state
electricity_price$State <- state_hub[electricity_price$Hub]
electricity_price_filtered <- electricity_price %>% 
  filter(!is.na(State))
# check if all states are present
unique(electricity_price_filtered$State)
## [1] "TX" "KS" "WA" "MA" "CA" "AZ" "NY"
# ng: add a column for hubs that matches the state
ng_price$State <- state_hub[ng_price$Hub]
ng_price_filtered <- ng_price %>% filter(!is.na(State))
# check if all states are present
unique(ng_price_filtered$State)
## [1] "MA" "KS" "TX" "WA" "CA" "AZ" "NY"
# found CA formatted as "PG&E - Citygate", corrected

# since we always take the mean if duplicates exist, just summarise
unique_elec <- electricity_price_filtered %>%
  group_by(State, Hub, Date) %>%
  summarise(Weighted_Avg_Price = mean(Weighted_Avg_Price, na.rm = TRUE)) %>%
  ungroup()

unique_ng <- ng_price_filtered %>%
  group_by(State, Hub, Date) %>%
  summarise(Weighted_Avg_Price = mean(Weighted_Avg_Price, na.rm = TRUE)) %>%
  ungroup()

# group the data by state, and then average the calculated heat rate across months then years (since firms are unique). 
library(lubridate)
# electricity
# step 1 average across months
elec_month_mean <- unique_elec %>%
  mutate(
    Year  = year(as.Date(Date)),
    Month = month(as.Date(Date)) 
  ) %>%
  group_by(State, Hub, Year, Month) %>%
  summarise(
    Month_Mean_Price = mean(Weighted_Avg_Price, na.rm = TRUE),
    .groups = "drop"
  )

# step 2 average across year
elec_year_mean <- elec_month_mean %>%
  group_by(State, Hub, Year) %>%
  summarise(
    Year_Mean_Price = mean(Month_Mean_Price, na.rm = TRUE),
    .groups = "drop"
  )

# step 3 average price overall
elec_mean <- elec_year_mean %>%
  group_by(State, Hub) %>%
  summarise(
    Elec_Price = mean(Year_Mean_Price, na.rm = TRUE),
    .groups = "drop"
  )

# ng
# step 1 average across months
ng_month_mean <- unique_ng %>%
  mutate(
    Year  = year(as.Date(Date)),
    Month = month(as.Date(Date)) 
  ) %>%
  group_by(State, Hub, Year, Month) %>%
  summarise(
    Month_Mean_Price = mean(Weighted_Avg_Price, na.rm = TRUE),
    .groups = "drop"
  )

# step 2 average across year
ng_year_mean <- ng_month_mean %>%
  group_by(State, Hub, Year) %>%
  summarise(
    Year_Mean_Price = mean(Month_Mean_Price, na.rm = TRUE),
    .groups = "drop"
  )

# step 3 average price overall
ng_mean <- ng_year_mean %>%
  group_by(State, Hub) %>%
  summarise(
    NG_Price = mean(Year_Mean_Price, na.rm = TRUE),
    .groups = "drop"
  )

# joining altogether
analysis <- state_mean %>%
  left_join(ng_mean,   by = "State") %>%
  left_join(elec_mean, by = "State") %>%
  select(-Hub.y, -Hub.x)

# final sparkspread calculation
analysis <- analysis %>%
  mutate(
    SparkSpread = Elec_Price - HeatRate * NG_Price
  )

# estimating the mean and standard deviation of the spark spread
stats <- analysis %>%
  summarise(
    Mean_SparkSpread = mean(SparkSpread, na.rm = TRUE),
    SD_SparkSpread   = sd(SparkSpread, na.rm = TRUE)
  )

analysis
## # A tibble: 7 × 5
##   State HeatRate NG_Price Elec_Price SparkSpread
##   <chr>    <dbl>    <dbl>      <dbl>       <dbl>
## 1 AZ        8.03     3.08       31.4        6.65
## 2 CA        8.34     3.40       41.4       13.0 
## 3 KS        8.84     3.10       36.5        9.10
## 4 MA        7.97     4.39       42.7        7.73
## 5 NY        8.78     2.51       40.0       18.0 
## 6 TX        9.24     3.05       31.6        3.37
## 7 WA        7.73     2.94       27.7        4.95
stats
## # A tibble: 1 × 2
##   Mean_SparkSpread SD_SparkSpread
##              <dbl>          <dbl>
## 1             8.96           5.02

Visualization

# plot the spread for the seven states on the same graph
# then conduct a visual assessment of profitability based on location
plot <- analysis %>%
  ggplot(aes(x = State, y = SparkSpread, color = State)) +
  geom_point(size = 3, alpha = 0.7) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Spark Spread by State",
       y = "Spark Spread ($/MWh)", x = "US States") +
  theme_minimal() + 
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    plot.margin = margin(t = 10, r = 5, b = 5, l = 10),
    axis.title.x = element_text(margin = margin(t = 10)),  # push x title down
    axis.title.y = element_text(margin = margin(r = 5))   # push y title left
  )
plot

# visually identify if the spark spread has a seasonal component
# this one will need some type of relationship with... calculating spark spread per season
# or just a line plot by month

# seasonal electricity
# monthly elec prices = elec_month_mean
# monthly ng prices = ng_month_mean
# we'll use a consistent heatrate for simplicity sake
# calculate monthly spark spread by state

temp_elec <- elec_month_mean %>%
  select(-Year, -Hub) %>%
  rename(elec_month_price = Month_Mean_Price) %>%
  group_by(State, Month) %>%
  summarize(elec_month_price=mean(elec_month_price))

temp_ng <- ng_month_mean %>%
  select(-Year, -Hub) %>%
  rename(ng_month_price = Month_Mean_Price) %>%
  group_by(State, Month) %>%
  summarize(ng_month_price=mean(ng_month_price))

temp_ss <- state_mean %>%
  left_join(temp_ng,   by = c("State")) %>%
  left_join(temp_elec, by = c("State", "Month"))

month_ss <- temp_ss %>%
  rowwise() %>%
  mutate(
    monthly_SparkSpread = elec_month_price - HeatRate * ng_month_price
  ) %>%
  mutate(Month = factor(Month, levels = 1:12, labels = month.abb))

plot2 <- month_ss %>%
  ggplot(aes(x = Month, y = monthly_SparkSpread, color = State, group = State)) +
  geom_line(linewidth = 1) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    title = "Monthly Spark Spread by State",
    x = "Month",
    y = "Spark Spread ($/MWh)"
  ) +
  theme_minimal()
plot2

seasonal_ss <- month_ss %>%
  mutate(
    Season = case_when(
      Month %in% c("Dec","Jan","Feb") ~ "Winter",
      Month %in% c("Mar","Apr","May") ~ "Spring",
      Month %in% c("Jun","Jul","Aug") ~ "Summer",
      Month %in% c("Sep","Oct","Nov") ~ "Fall"
    ),
    Season = factor(Season, levels = c("Winter","Spring","Summer","Fall"))
  ) %>%
  group_by(State, Season) %>%
  summarise(
    Mean_SS = mean(monthly_SparkSpread, na.rm = TRUE),
    .groups = "drop"
  )

plot3 <- seasonal_ss %>%
  ggplot(aes(x = Season, y = Mean_SS, color = State, group = State)) +
  geom_line(linewidth = 1) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    title = "Seasonal Spark Spread by State",
    x = "Season",
    y = "Spark Spread ($/MWh)"
  ) +
  theme_minimal()

plot3