Our project intends to identify the factors that are associated with drug overdose and drug overdose death. We first performed a visual analysis using the national drug overdose death data.
An overview of the Top 20 states with the highest drug overdose death from 2015-2021:
# Codes for the animation above. Set eval=FALSE.
# Run previously and save as a gif a head of time to save loading time
data_2 =
sum_df %>%
group_by(year, month) %>%
arrange(year, month, -number_of_drug_overdose_deaths) %>%
mutate(rank = 1:n(),
month = match(month, month.name),
date = str_c(year,"-",month),
date = as.Date(paste(date, "-01", sep =''))) %>%
filter(rank<=20)
my_plot =
data_2 %>%
ggplot()+
aes(xmin = 0,
xmax = number_of_drug_overdose_deaths) +
aes(ymin = rank - 0.45,
ymax = rank + 0.45,
y = rank) +
facet_wrap(~ date) +
geom_rect(alpaha = 0.7) +
aes(fill = state_name)+
scale_fill_viridis_d(option = "magma",
direction = -1) +
scale_x_continuous(
limits = c(-1000, 12000),
breaks = c(-1000, 0, 3000, 6000,9000,12000)
)+
geom_text(col = "darkblue",
hjust = "right",
aes(label = state_name),
x = -100)+
geom_text(col = "darkblue",
hjust = "right",
aes(label = paste(number_of_drug_overdose_deaths),x=12000))+
scale_y_reverse()+
labs(fill = NULL)+
labs(x = "Death cases")+
labs(y = "Top 20 States")+
theme(legend.position = "none")
p = my_plot +
facet_null()+
geom_text(x = 8000, y = -10,
family = "Times",
aes(label = format(as.Date(date), "%Y-%m")),
size = 10, col = "black")+
aes(group = state_name)+
transition_time(date)
gif = animate(p,nframes = 700, fps=15,width=1000,height=700,
renderer = gifski_renderer())
gif
anim_save("animation.gif", animation = gif)
It’s an amuse bouche to start our exploration of the drug overdose topic.
We’ve found that instead of having the usual 50 states, Washington DC, and New York City, the data set also contains data for the whole US. We choose to focus on the 52 jurisdictions (including the 50 states, DC, and NYC) at first.
drug_overdose = read_csv("./data/VSRR_Provisional_Drug_Overdose_Death_Counts.csv") %>%
janitor::clean_names()
unique(pull(drug_overdose, state))
## [1] "AK" "AL" "AR" "AZ" "CA" "CO" "CT" "DC" "DE" "FL" "GA" "HI" "IA" "ID" "IL"
## [16] "IN" "KS" "KY" "LA" "MA" "MD" "ME" "MI" "MN" "MO" "MS" "MT" "NC" "ND" "NE"
## [31] "NH" "NJ" "NM" "NV" "NY" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN" "TX" "US"
## [46] "UT" "VA" "VT" "WA" "WI" "WV" "WY" "YC"
# contains 50 states, Washington DC, whole US, and the City of New York.
Some preliminary data processing
state_level = c(state.name[1:8], "District of Columbia", state.name[9:32],"New York City", state.name[33:50])
drug_overdose_52 =
drug_overdose %>%
filter(!(state_name %in% c("United States"))) %>%
relocate(state_name) %>%
mutate(month = factor(month, levels = month.name), # change month and year to factor
year = factor(year),
state_name = factor(state_name, levels = state_level)) %>%
arrange(state_name) %>%
group_by(state_name, year) %>%
mutate(month = sort(month)) # sort by month order
drug_overdose_death =
drug_overdose_52 %>%
select(-c( footnote_symbol, percent_complete, period, percent_pending_investigation, predicted_value)) %>%
filter(indicator %in% c("Number of Deaths", "Percent with drugs specified", "Number of Drug Overdose Deaths"))
drug_overdose_52 =
drug_overdose_52 %>%
mutate(low_data_quality = ifelse(str_detect(footnote, "low data quality"), 1, 0),
suppressed = ifelse(str_detect(footnote, "suppressed"), 1, 0),
underreported = ifelse(str_detect(footnote, "Underreported"), 1, 0)) %>%
relocate(footnote, .after = last_col())
In order to analyze the number of deaths in each state and the trend with time according to the type of drug, we only keep the data with the drug label(T4…) in the indicator column in the table. And we found that there are 9 states missing the specific each drug type deaths counts data: Alabama, Arkansas, Florida, Idaho, Louisiana, Minnesota, Nebraska, North Dakota, Pennsylvania.
drug_categories =
drug_overdose_52 %>%
ungroup() %>%
select(-c(state, footnote_symbol, percent_complete, period, percent_pending_investigation, footnote, predicted_value)) %>%
filter(str_detect(indicator, "T4"))
knitr::kable(drug_categories[1:3,])
state_name | year | month | indicator | data_value | low_data_quality | suppressed | underreported |
---|---|---|---|---|---|---|---|
Alaska | 2015 | January | Natural & semi-synthetic opioids (T40.2) | NA | 1 | 0 | 0 |
Alaska | 2015 | January | Natural & semi-synthetic opioids, incl. methadone (T40.2, T40.3) | NA | 1 | 0 | 0 |
Alaska | 2015 | January | Natural, semi-synthetic, & synthetic opioids, incl. methadone (T40.2-T40.4) | NA | 1 | 0 | 0 |
# missing states' data:
miss_states =
drug_overdose_52 %>%
ungroup() %>%
select(state_name) %>%
unique() %>%
filter(!(state_name %in% drug_categories$state_name))
knitr::kable(miss_states)
state_name |
---|
Alabama |
Arkansas |
Florida |
Idaho |
Louisiana |
Minnesota |
Nebraska |
North Dakota |
Pennsylvania |
drug_type_plot =
drug_overdose %>%
filter(state %in% c("US")) %>%
filter(!(indicator %in% c("Number of Deaths", "Number of Drug Overdose Deaths", "Percent with drugs specified")))%>%
relocate(state) %>%
mutate(month = factor(month, levels = month.name), # change month and year to factor
year = factor(year)) %>%
arrange(state) %>%
group_by(state, year) %>%
mutate(month = sort(month)) %>%
ggplot(aes(x = month, y = data_value,color = indicator)) +
geom_point()+
scale_x_discrete(labels = month.abb)+
facet_grid(~year)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
drug_gplt = plotly_build(drug_type_plot) %>%
layout(legend = list(orientation = "h"))
name_change = c("T40.5", "T40.1", "T40.3", "T40.2","T40.2, T40.3",
"T40.2-T40.4", "T40.0-T40.4,T40.6","T43.6", "T40.4")
name_index = c(1,8,15,22,29,36,43,50,57)
for (i in name_index){
drug_gplt$x$data[[i]]$name = name_change[which(name_index == i)]
}
drug_gplt
2015 vs. 2021
The plot shows the trend of drug overdoes deaths rate in 2015 and 2021
- This picture mainly compares the difference between the death rates of various states in 2015 and 2021. The purple line represents 2015 and the yellow line represents 2021. From the figure, we can see that the death rate of each state in 2021 is greater than 2015, and the District of Columbia changes are particularly obvious.
sum_df =
drug_overdose_death %>%
ungroup() %>%
filter(indicator %in% c("Number of Deaths", "Number of Drug Overdose Deaths")) %>%
select(state_name, year, month, indicator, deaths = data_value) %>%
pivot_wider(
names_from = indicator,
values_from = deaths
) %>%
janitor::clean_names()
plot_df = sum_df %>%
group_by(state_name, year, month) %>%
ungroup() %>%
group_by(state_name, year) %>%
summarize(number_of_total_deaths = sum(number_of_deaths),
number_of_total_drug_deaths = sum(number_of_drug_overdose_deaths)) %>%
mutate(percent_overdose_death = number_of_total_drug_deaths / number_of_total_deaths) %>%
filter(year == c(2015, 2020)) %>%
ungroup()
p = ggplot(data = plot_df, aes(x = percent_overdose_death, y=reorder(state_name, percent_overdose_death)))+
geom_line(aes(group = state_name), alpha = 0.5)+
geom_point(aes(color = year), alpha = 0.6, size = 4)
ggplotly(p)
Because our dataset is provisional, the data for 2021 is not yet complete. From the figure, we can see that the data for some states are missing, so we will not analyze it here.
overview_year =
drug_overdose %>%
filter(indicator == c("Number of Deaths", "Number of Drug Overdose Deaths")) %>%
select("state", "year", "month", "indicator", "data_value") %>%
filter(!(state == "US")) %>%
filter(str_detect(indicator, "Drug Overdose Deaths")) %>%
group_by(state, year, indicator) %>%
summarize(data_value = sum(data_value))
overview_year_plot =
overview_year %>%
ggplot(aes(x = year, y = data_value, color = state)) +
geom_point() +
geom_line()+
theme_set(theme_minimal() + theme(legend.position = "bottom"))
ggplotly(overview_year_plot)
In order to further and more accurately study the trend of drug overdose in the United States, we obtained the three states with the highest number of deaths each year by mapping the total number of deaths of drug overdose in each state from 2015 to 2021. At the same time, according to the geographic location of each state, the regions of the United States represented by California, Florida, NYC, and Ohio were selected for a more detailed analysis.