# Load packages
library(tidyverse)
library(here)
library(lmerTest)
library(ggplot2)
Note: This is the final consolidated version after data probing has been completed; original exploratory code can also be provided.
# lists needed for filtering
months <- c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")
columns_needed <- c("Plant Id","Plant State", "YEAR", "Reported\nFuel Type Code", paste0("Netgen\n", months), paste0("Elec_MMBtu\n", months))
# lists needed for looping
years <- 2014:2017
temp_loop <- list()
# data aggregation loop
for (i in years) {
file <- here("data", "powerplant", paste0(i, ".csv"))
temp <- read.csv(file)
# use row 5 as column names, then drop top 5 rows
colnames(temp) <- as.character(temp[5, ])
temp <- temp[-(1:5), ]
# for each iteration of the loop,
temp <- temp %>%
select(all_of(columns_needed)) %>% # select only columns needed
rename(Year = YEAR) %>% # rename YEAR to Year for... cleanliness?
mutate(Year = i) %>% # raw data has some empty Year fields; ensures all Year = file name
filter(`Reported\nFuel Type Code` == "NG") # filter for only NG firms
# adds current iteration of temp to the list position of the length of the list temp_loop +1
temp_loop[[length(temp_loop) + 1]] <- temp
}
# binds all rows in temp_loop into one aggregated data file
powerplant_data <- bind_rows(temp_loop)
# "." means missing, not 0; economically, plants could report . if data wasn't collected
# we cannot reasonably gauge these missing values, so let's mark them as NA
# replace "." with NA for any fields within the powerplant_data dataframe
powerplant_data[powerplant_data == "."] <- NA
# however, we don't want to filter them out completely since some months may be missing while other months have valuable data
# we proceed by using na.rm = TRUE to bypass the NA issue
# original data had data stored as character values with commas
# parse_number from the readr package turns chr values into dbl while ignoring all non-numerical values
# we only want to parse character values in electricity and net generation columns
powerplant_data <- powerplant_data %>%
mutate(
# across() will look for columns starting with "Netgen" or "Elec" and parse
across((starts_with("Netgen") | starts_with("Elec")),
parse_number)
)
# originally, I went ahead and calculated heat rate by dividing the Quantity Consumed for Electricity (MMBtu) column by the Electricity Net Generation (MWh) column by month at this point
# however, later in the assignment I see that some duplicated rows are meaningful and should be summed as this assignment focuses on the firm level - therefore, I will sum them here first before calculating the heat rate.
powerplant_data <- powerplant_data %>%
group_by(`Plant Id`, `Plant State`, Year) %>%
summarise(
across(starts_with("Netgen"), sum, na.rm = TRUE),
across(starts_with("Elec"), sum, na.rm = TRUE),
.groups = "drop"
)
# in general, heatrate is not meaningful if netgeneration is negative; while a negative net generation can signal inefficiencies, it likely reflects a plant that is not operational. we filter out negative net generation before calculating heatrate.
# ~ = function, replace(. = placeholder for the values we are parsing, if that value is less than 0, NA)
powerplant_data <- powerplant_data %>%
mutate(
across(starts_with("Netgen"), ~ replace(., . < 0, NA))
)
# the data contains 0s which will result in INF, -INF, NaN, or NA
# need to use safe divide to mitigate this
# AI Disclaimer: AI was used to type and format the code here following my prompt:
# Please expand and format my code across all 12 months (use full name) within mutate() in R: "HeatRate_month = ifelse(`Netgen\nmonth` == 0, NA, `Elec_MMBtu\nmonth` / `Netgen\nmonth`)"
heatrate <- powerplant_data %>%
mutate(
HeatRate_January = ifelse(`Netgen\nJanuary` == 0, NA, `Elec_MMBtu\nJanuary` / `Netgen\nJanuary`),
HeatRate_February = ifelse(`Netgen\nFebruary` == 0, NA, `Elec_MMBtu\nFebruary` / `Netgen\nFebruary`),
HeatRate_March = ifelse(`Netgen\nMarch` == 0, NA, `Elec_MMBtu\nMarch` / `Netgen\nMarch`),
HeatRate_April = ifelse(`Netgen\nApril` == 0, NA, `Elec_MMBtu\nApril` / `Netgen\nApril`),
HeatRate_May = ifelse(`Netgen\nMay` == 0, NA, `Elec_MMBtu\nMay` / `Netgen\nMay`),
HeatRate_June = ifelse(`Netgen\nJune` == 0, NA, `Elec_MMBtu\nJune` / `Netgen\nJune`),
HeatRate_July = ifelse(`Netgen\nJuly` == 0, NA, `Elec_MMBtu\nJuly` / `Netgen\nJuly`),
HeatRate_August = ifelse(`Netgen\nAugust` == 0, NA, `Elec_MMBtu\nAugust` / `Netgen\nAugust`),
HeatRate_September = ifelse(`Netgen\nSeptember` == 0, NA, `Elec_MMBtu\nSeptember` / `Netgen\nSeptember`),
HeatRate_October = ifelse(`Netgen\nOctober` == 0, NA, `Elec_MMBtu\nOctober` / `Netgen\nOctober`),
HeatRate_November = ifelse(`Netgen\nNovember` == 0, NA, `Elec_MMBtu\nNovember` / `Netgen\nNovember`),
HeatRate_December = ifelse(`Netgen\nDecember` == 0, NA, `Elec_MMBtu\nDecember` / `Netgen\nDecember`)
)
# create a new simplified dataframe by selecting:
# Plant Id, Plant State, Year, Net Generation, and Heat Rate columns.
simplified_df <- heatrate %>%
select(
"Plant Id","Plant State", "Year", paste0("Netgen\n", months), paste0("HeatRate_", months)
)
# filter for TX, WA, MA, CA, AZ, KS (only 6 states here) added NY
output_B <- simplified_df %>%
filter(`Plant State` %in% c("TX", "WA", "MA", "CA", "AZ", "KS", "NY"))
# this is a check to see if any negative heatrates exist in the data
heatrate_check <- output_B %>%
summarise(
total = sum(!is.na(c_across(starts_with("HeatRate")))), # sum all non-empty rows with columns starting with HeatRate
negatives = sum(c_across(starts_with("HeatRate")) < 0, na.rm = TRUE),
positives = sum(c_across(starts_with("HeatRate")) >= 0, na.rm = TRUE)
) %>%
mutate(percent_negative = 100 * negatives / total)
heatrate_check
## # A tibble: 1 × 4
## total negatives positives percent_negative
## <int> <int> <int> <dbl>
## 1 33775 0 33775 0
# start with importing excludeDF.csv and renaming its columns to match final_B
excludeDF <- read.csv(here("data", "excludeDF.csv")) %>%
rename(`Plant Id` = plant_id) %>%
rename(`Plant State` = plant_state) %>%
rename(Year = year)
# antijoin excludeDF from final_B
excluded_B <- anti_join(
# the two `Plant Id` types didn't match, so we convert both to chr
output_B %>% mutate(`Plant Id` = as.character(`Plant Id`)),
excludeDF %>% mutate(`Plant Id` = as.character(`Plant Id`)),
by = c("Plant Id", "Plant State", "Year")
)
# reasoning for why I summed the data earlier on in the workflow is listed below:
# first, we need to deal with multiple entries per firm before filtering netgen sums
# looking further into the data, some firms have multiple rows due to differing prime movers
# most firms with a second row containing very small data have the same prime mover
# for accuracy sake, I opt to summarize using sum rather than max
# filter out very small (NetGen < 0.5 million MWH) and very large firms (>5 million MWH)
kept_netgen <- excluded_B %>%
rowwise() %>%
mutate(Netgen_Sum = sum(c_across(starts_with("Netgen")), na.rm = TRUE)) %>%
ungroup() %>%
filter(Netgen_Sum >= 500000 & Netgen_Sum <= 5000000)
# filter out heat rate violating < 6 > 16 as a whole; no need to identify which plants
kept_heatrate <- kept_netgen %>%
rowwise() %>% # calculate average heat rate per row
mutate(Mean_HeatRate = mean(c_across(starts_with("HeatRate")), na.rm = TRUE)) %>%
ungroup() %>%
filter(Mean_HeatRate >= 6 & Mean_HeatRate <= 16)
# group the data by state, and then average the calculated heat rate across months, years and firms.
# we already calculated the monthly average per year per firm
# step 1: calculate the heat rate across years per firm
firm_mean <- kept_heatrate %>%
group_by(`Plant Id`, `Plant State`) %>%
summarise(
Firm_Mean_HeatRate = mean(Mean_HeatRate, na.rm = TRUE),
.groups = "drop"
)
# step 2: average across all firms per state
state_mean <- firm_mean %>%
group_by(`Plant State`) %>%
summarise(
State_Mean_HeatRate = mean(Firm_Mean_HeatRate, na.rm = TRUE),
.groups = "drop"
) %>%
rename(State = `Plant State`) %>%
rename(HeatRate = `State_Mean_HeatRate`)
state_mean
## # A tibble: 7 × 2
## State HeatRate
## <chr> <dbl>
## 1 AZ 8.03
## 2 CA 8.34
## 3 KS 8.84
## 4 MA 7.97
## 5 NY 8.78
## 6 TX 9.24
## 7 WA 7.73
# electricity data
# load in data and bind into one large df
electricity_2014 <- read.csv(here("data", "electricity", "2014_electricity.csv"))
electricity_2015 <- read.csv(here("data", "electricity", "2015_electricity.csv"))
electricity_2016 <- read.csv(here("data", "electricity", "2016_electricity.csv"))
electricity_2017 <- read.csv(here("data", "electricity", "2017_electricity.csv"))
electricity_price <- rbind(electricity_2014, electricity_2015, electricity_2016, electricity_2017)
# rename columns, select only needed columns
electricity_price <- electricity_price %>%
rename(Weighted_Avg_Price = Wtd.avg.price...MWh) %>%
rename(Date = Trade.date) %>%
rename(Hub = Price.hub) %>%
select(
Hub, Date, Weighted_Avg_Price, Delivery.start.date #using delivery.start.date to check date format errors
)
# date format correction
electricity_price <- electricity_price %>%
mutate(Date = case_when(
Date == "01-12-17" ~ "2017-01-12",
Date == "12-01-14" ~ "2014-12-01",
TRUE ~ Date
)) %>%
select(-Delivery.start.date)
# natural gas data
# load in data and bind into one large df
ng_2014 <- read.csv(here("data", "ng", "2014_ng.csv"))
ng_2015 <- read.csv(here("data", "ng", "2015_ng.csv"))
ng_2016 <- read.csv(here("data", "ng", "2016_ng.csv"))
ng_2017 <- read.csv(here("data", "ng", "2017_ng.csv"))
ng_price <- rbind(ng_2014, ng_2015, ng_2016, ng_2017)
# rename columns, select only needed columns
ng_price <- ng_price %>%
rename(Weighted_Avg_Price = Wtd.avg.price...MMBtu) %>%
rename(Date = Trade.date) %>%
rename(Hub = Price.hub) %>%
select(
Hub, Date, Weighted_Avg_Price
)
# drop two NA rows (note that 2014 data starts in March)
ng_price <- ng_price[complete.cases(ng_price), ]
# create a dictionary to look up state vs hub
state_hub <- c(
"ERCOT North 345KV Peak" = "TX",
"Henry" = "TX",
"Indiana Hub RT Peak" = "KS",
"Chicago Citygates" = "KS",
"Mid C Peak" = "WA",
"Malin" = "WA",
"Nepool MH DA LMP Peak" = "MA",
"Algonquin Citygates" = "MA",
"NP15 EZ Gen DA LMP Peak" = "CA",
"PG&E - Citygate" = "CA",
"Palo Verde Peak" = "AZ",
"Socal-Ehrenberg" = "AZ",
"PJM WH Real Time Peak" = "NY",
"TETCO-M3" = "NY"
)
# electricity: add a column for hubs that matches the state
electricity_price$State <- state_hub[electricity_price$Hub]
electricity_price_filtered <- electricity_price %>%
filter(!is.na(State))
# check if all states are present
unique(electricity_price_filtered$State)
## [1] "TX" "KS" "WA" "MA" "CA" "AZ" "NY"
# ng: add a column for hubs that matches the state
ng_price$State <- state_hub[ng_price$Hub]
ng_price_filtered <- ng_price %>% filter(!is.na(State))
# check if all states are present
unique(ng_price_filtered$State)
## [1] "MA" "KS" "TX" "WA" "CA" "AZ" "NY"
# found CA formatted as "PG&E - Citygate", corrected
# since we always take the mean if duplicates exist, just summarise
unique_elec <- electricity_price_filtered %>%
group_by(State, Hub, Date) %>%
summarise(Weighted_Avg_Price = mean(Weighted_Avg_Price, na.rm = TRUE)) %>%
ungroup()
unique_ng <- ng_price_filtered %>%
group_by(State, Hub, Date) %>%
summarise(Weighted_Avg_Price = mean(Weighted_Avg_Price, na.rm = TRUE)) %>%
ungroup()
# group the data by state, and then average the calculated heat rate across months then years (since firms are unique).
library(lubridate)
# electricity
# step 1 average across months
elec_month_mean <- unique_elec %>%
mutate(
Year = year(as.Date(Date)),
Month = month(as.Date(Date))
) %>%
group_by(State, Hub, Year, Month) %>%
summarise(
Month_Mean_Price = mean(Weighted_Avg_Price, na.rm = TRUE),
.groups = "drop"
)
# step 2 average across year
elec_year_mean <- elec_month_mean %>%
group_by(State, Hub, Year) %>%
summarise(
Year_Mean_Price = mean(Month_Mean_Price, na.rm = TRUE),
.groups = "drop"
)
# step 3 average price overall
elec_mean <- elec_year_mean %>%
group_by(State, Hub) %>%
summarise(
Elec_Price = mean(Year_Mean_Price, na.rm = TRUE),
.groups = "drop"
)
# ng
# step 1 average across months
ng_month_mean <- unique_ng %>%
mutate(
Year = year(as.Date(Date)),
Month = month(as.Date(Date))
) %>%
group_by(State, Hub, Year, Month) %>%
summarise(
Month_Mean_Price = mean(Weighted_Avg_Price, na.rm = TRUE),
.groups = "drop"
)
# step 2 average across year
ng_year_mean <- ng_month_mean %>%
group_by(State, Hub, Year) %>%
summarise(
Year_Mean_Price = mean(Month_Mean_Price, na.rm = TRUE),
.groups = "drop"
)
# step 3 average price overall
ng_mean <- ng_year_mean %>%
group_by(State, Hub) %>%
summarise(
NG_Price = mean(Year_Mean_Price, na.rm = TRUE),
.groups = "drop"
)
# joining altogether
analysis <- state_mean %>%
left_join(ng_mean, by = "State") %>%
left_join(elec_mean, by = "State") %>%
select(-Hub.y, -Hub.x)
# final sparkspread calculation
analysis <- analysis %>%
mutate(
SparkSpread = Elec_Price - HeatRate * NG_Price
)
# estimating the mean and standard deviation of the spark spread
stats <- analysis %>%
summarise(
Mean_SparkSpread = mean(SparkSpread, na.rm = TRUE),
SD_SparkSpread = sd(SparkSpread, na.rm = TRUE)
)
analysis
## # A tibble: 7 × 5
## State HeatRate NG_Price Elec_Price SparkSpread
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 AZ 8.03 3.08 31.4 6.65
## 2 CA 8.34 3.40 41.4 13.0
## 3 KS 8.84 3.10 36.5 9.10
## 4 MA 7.97 4.39 42.7 7.73
## 5 NY 8.78 2.51 40.0 18.0
## 6 TX 9.24 3.05 31.6 3.37
## 7 WA 7.73 2.94 27.7 4.95
stats
## # A tibble: 1 × 2
## Mean_SparkSpread SD_SparkSpread
## <dbl> <dbl>
## 1 8.96 5.02
# plot the spread for the seven states on the same graph
# then conduct a visual assessment of profitability based on location
plot <- analysis %>%
ggplot(aes(x = State, y = SparkSpread, color = State)) +
geom_point(size = 3, alpha = 0.7) +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(title = "Spark Spread by State",
y = "Spark Spread ($/MWh)", x = "US States") +
theme_minimal() +
theme(
legend.position = "none",
plot.title = element_text(hjust = 0.5),
plot.margin = margin(t = 10, r = 5, b = 5, l = 10),
axis.title.x = element_text(margin = margin(t = 10)), # push x title down
axis.title.y = element_text(margin = margin(r = 5)) # push y title left
)
plot
# visually identify if the spark spread has a seasonal component
# this one will need some type of relationship with... calculating spark spread per season
# or just a line plot by month
# seasonal electricity
# monthly elec prices = elec_month_mean
# monthly ng prices = ng_month_mean
# we'll use a consistent heatrate for simplicity sake
# calculate monthly spark spread by state
temp_elec <- elec_month_mean %>%
select(-Year, -Hub) %>%
rename(elec_month_price = Month_Mean_Price) %>%
group_by(State, Month) %>%
summarize(elec_month_price=mean(elec_month_price))
temp_ng <- ng_month_mean %>%
select(-Year, -Hub) %>%
rename(ng_month_price = Month_Mean_Price) %>%
group_by(State, Month) %>%
summarize(ng_month_price=mean(ng_month_price))
temp_ss <- state_mean %>%
left_join(temp_ng, by = c("State")) %>%
left_join(temp_elec, by = c("State", "Month"))
month_ss <- temp_ss %>%
rowwise() %>%
mutate(
monthly_SparkSpread = elec_month_price - HeatRate * ng_month_price
) %>%
mutate(Month = factor(Month, levels = 1:12, labels = month.abb))
plot2 <- month_ss %>%
ggplot(aes(x = Month, y = monthly_SparkSpread, color = State, group = State)) +
geom_line(linewidth = 1) +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(
title = "Monthly Spark Spread by State",
x = "Month",
y = "Spark Spread ($/MWh)"
) +
theme_minimal()
plot2
seasonal_ss <- month_ss %>%
mutate(
Season = case_when(
Month %in% c("Dec","Jan","Feb") ~ "Winter",
Month %in% c("Mar","Apr","May") ~ "Spring",
Month %in% c("Jun","Jul","Aug") ~ "Summer",
Month %in% c("Sep","Oct","Nov") ~ "Fall"
),
Season = factor(Season, levels = c("Winter","Spring","Summer","Fall"))
) %>%
group_by(State, Season) %>%
summarise(
Mean_SS = mean(monthly_SparkSpread, na.rm = TRUE),
.groups = "drop"
)
plot3 <- seasonal_ss %>%
ggplot(aes(x = Season, y = Mean_SS, color = State, group = State)) +
geom_line(linewidth = 1) +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(
title = "Seasonal Spark Spread by State",
x = "Season",
y = "Spark Spread ($/MWh)"
) +
theme_minimal()
plot3