Re-running frailty models currently set to off
Main Analyses
Last run October 25, 2024 at 2:54PM-3:00PM (check modified date saved results)
# functions used for each cancer type
get_data<-function(cancer_type) {
mort<- haven::read_dta("data/mort.dta") |>
dplyr::filter(cancer_site1==cancer_type) |>
haven::as_factor() |>
dplyr::mutate(stage = relevel(as.factor(stage), 3)) |> # set base stage II
dplyr::mutate(agedec = agedec-mean(agedec)) |>
dplyr::mutate(pvgroup = relevel(as.factor(pvgroup), 5)) |> # set base none
dplyr::mutate(grade = relevel(as.factor(grade),2)) |>
dplyr::mutate(race = relevel(race,4)) |>
dplyr::mutate(rurality = relevel(rurality,1)) |>
dplyr::mutate(poverty = relevel(poverty,3))
return(mort)
}
get_fixed<-function(data,covar) {
outcome<-"survival::Surv(survtime, event)"
f<-as.formula(paste(outcome,paste(covar,collapse="+"),sep="~"))
m_fixed<-survival::coxph(f, data, ties = "breslow")
return(m_fixed)
}
get_random<-function(data,can_type,covar) {
library(coxme)
outcome<-"survival::Surv(survtime, event)"
f<-as.formula(paste(outcome,paste(paste(covar[-length(covar)],collapse="+"),
"cluster(pvgroup)", sep="+"),sep="~"))
m_emfrail<-frailtyEM::emfrail(f, data)
save_file<-paste0("data/","m_emfrail",can_type,".Rdata")
save(m_emfrail,file=save_file)
return(m_emfrail)
}
process_fixed<-function(data=m_fixed) {
library(tidyverse)
# Obtain center estimates and calculate standard error
pvgroup <- grep("pvgroup", names(m_fixed$coefficients))
nc <- length(pvgroup)
var_pvgroup <- m_fixed$var[pvgroup, pvgroup]
varmean <- sum(var_pvgroup)/ nc^2
newse <- sqrt(c(varmean, diag(var_pvgroup) + varmean - 2/nc * apply(var_pvgroup, 1, sum)))
# change contrast for hr's to average
fe<-data.frame(beta = c(pvgroup5 = 0, m_fixed$coefficients[pvgroup]), sd = newse) |>
rownames_to_column("pvgroup") |>
mutate(pvgroup = substr(pvgroup,8,25)) |>
mutate( pvgroup = base::replace(pvgroup, pvgroup=="5", "Negative")) |>
mutate(sbeta = sum(beta) / nc) |>
mutate(beta = beta - sbeta) |>
arrange(beta) |>
mutate(ord = 1:n()) |>
mutate(pvgroup = as.character(pvgroup)) |>
mutate(ymin = beta - 1.96 * sd, ymax = beta + 1.96 * sd) |>
rename(id = pvgroup, estimate = beta) |>
select(-sd, -sbeta, -ord) |>
mutate(type = "fixed effects") |>
mutate(ord = 1:n())
return(fe)
}
process_random <- function(data=m_emfrail) {
library(data.table)
frailties<-as.data.table(frailtyEM:::summary.emfrail(data)[[12]])
logz <- frailties[order(z),.(id,z,lower_q, upper_q)] |>
dplyr::mutate_if(is.numeric, log) |>
dplyr::rename(estimate = z, ymin = lower_q, ymax = upper_q) |>
dplyr::mutate(type = "frailty") |>
dplyr::mutate(ord = 1:dplyr::n())
return(logz)
}
combine_effects<-function(fe,logz) {
ord_re <- data.frame(id = logz$id, ord = logz$ord)
datt <- fe |>
right_join(ord_re,join_by(id==id)) |>
select(-ord.y) |> rename(ord=ord.x) |>
bind_rows(logz)
return(datt)
}
table_fig <- function(tblname,size=1, tbl_title,type=can_type) {
cancer_type<-c("breast cancer triple-negative","breast cancer her2+","breast cancer her2-,ER/PR+","colorectal","pancreas")
gt::gt(dplyr::as_tibble(tblname,rownames="type")) |>
gt::tab_header(title = tbl_title,subtitle=cancer_type[type]) |>
gt::gtsave(paste0("stats_",deparse(substitute(tblname)),
"_table",type,".png"), path = "img/")
cowplot::ggdraw() + cowplot::draw_image(paste0("img/",
"stats_",deparse(substitute(tblname)),"_table",can_type,".png"), scale = size)
}
graph_effects_vert<-function(data=datt) {
library(grid)
library(data.table)
datt_DT<-as.data.table(datt)
datt_DT[,type :=(ifelse(type=="fixed effects","fe","re"))]
dt<-dcast(datt_DT,
# id + ord ~ type,
id ~ type,
value.var = c("estimate","ymin","ymax"))
dt$` ` <- paste(rep(" ", 35), collapse = " ")
ests<-c("estimate_fe","estimate_re","ymin_fe","ymin_re","ymax_fe","ymax_re")
dt[,(ests):=lapply(.SD, function(x) exp(x)),.SDcols = ests]
# form for estimates 1.2 (0.2, 2.9)\n1.6 (0.4, 3.5)
dt$CI <- paste(sprintf("%.1f (%.1f, %.1f)", dt$estimate_fe, dt$ymin_fe, dt$ymax_fe),
sprintf("%.1f (%.1f, %.1f)", dt$estimate_re, dt$ymin_re, dt$ymax_re),
sep = "\n")
setnames(dt, old = "id", new = "Gene")
tm <- forestploter::forest_theme(base_size = 10,
refline_gp = gpar(lty="solid"),
ci_pch = c(15, 15),
ci_col = c("#377eb8", "#4daf4a"),
footnote_gp = gpar(col = "blue"),
legend_name = "Estimates",
legend_value = c("Fixed effects", "Random effects"),
legend_position = "none",
vertline_lty = c("dashed"),
vertline_col = c("#d6604d"),
# Table cell padding, width 4 and heights 3
core = list(padding = unit(c(4, 3), "mm")),
gp = gpar(fontface = "italic")
)
p<-forestploter::forest(dt[,c(1,8, 9)],
est = list(dt$estimate_fe,dt$estimate_re),
lower = list(dt$ymin_fe,dt$ymin_re),
upper = list(dt$ymax_fe,dt$ymax_re),
ci_column = 2,
ref_line = 1,
x_trans = "log",
ticks_at = c(0.25,0.5, 2,4),
arrow_lab = c("Lower risk", "Higher risk"),
nudge_y = 0.4,
xlim=c(.2,5),
theme=tm)
return(p)
}
pvalr <- function(pvals, sig.limit = .001, digits = 3, html = FALSE) {
# https://stackoverflow.com/questions/23018256/printing-p-values-with-0-001
roundr <- function(x, digits = 1) {
res <- sprintf(paste0('%.', digits, 'f'), x)
zzz <- paste0('0.', paste(rep('0', digits), collapse = ''))
res[res == paste0('-', zzz)] <- zzz
res
}
sapply(pvals, function(x, sig.limit) {
if (x < sig.limit)
if (html)
return(sprintf('< %s', format(sig.limit))) else
return(sprintf('< %s', format(sig.limit)))
if (x > .1)
return(paste("=",roundr(x, digits = 2))) else
return(paste("=",roundr(x, digits = digits)))
}, sig.limit = sig.limit)
}(The algorithms for these functions used in the analysis and some of the manipulation of the results were based on examples from (Theodor A. Balan and Putter 2020). Any errors identified in adapting them or in their application to our example are ours alone.)
Stata code used to create the analytic data set of all 5 cancers from file of SEER variables distributed by project data steward
code
use "data/mort imputed 1013.dta"
mi extract 0
* reorder to match Paul's outputs
recode cancer_sitegroup (3=1 "breast, triple negative") ///
(2=2 "breast, HER2+") (1=3 "breast, ER/PR+ HER2-") (4=4 "colorectal") ///
(5=5 pancreas),gen(cancer_site1)
* surgcat has 5 categories should have y/n by Paul's outputs
gen surgcat=surg>0
recode sexrecode (1 9 =1 male) (2=0 female),gen(sex) label(male_sex)
label values sex male_sex
label var sex "Sex"
tostring yeardx,gen(year_str)
encode year_str,gen(yeardx_cat)
label define yn 1 "Yes" 0 "No"
label values married surgcat chemo radiation her2therapy yn
recode colon_rectum (1=0) (2=.) (3=1),gen(colon_rectal)
label define colon_rectal 1 rectal 0 colon
label values colon_rectal colon_rectal
label var colon_rectal "Colorectal type"
compress
qui dtable i.(pvgroup race age_group poverty rurality married yeardx_cat stage ///
surgcat chemo radiation state grade her2therapy colon_rectum) ///
,by(cancer_site1,nototals) export(table1.html,replace)
* switch to study window end 12/31/2021
stset, clear
stset time22, failure(death22)
gen survtime=_t
gen event=_d
save "data/mort",replace
de pvgroupOverview of modeling strategy
We used directed acyclic graphs to justify the selection of variables for the analysis.(Pearl 1995; Lipsky and Greenland 2022)
The primary predictor of interest is the gene group describing the presence or absence of specific pathogenic variants. The outcome is mortality. In addition the following variable groups were included:
Covariate list
| Clinical | Treatment | Shared ancestry | Access and quality | Selection into cohort |
|---|---|---|---|---|
| Age at diagnosis | Chemotherapy | Race | Poverty ( county level) | Genetic testing |
| Stage | Surgery | Rurality (county level) | ||
| Grade (cancer-specific) | Radiation | State | ||
| Rectal vs colon (cancer-specific) | Receipt of HER2-therapy (cancer-specific) |
As only people from the SEER registry who had genetic testing within 6 months of diagnosis were included, all of the results are conditioned on receipt of genetic testing even though this variable is not formally included in our analysis.
The goal of the analysis is to describe whether people who have one of several pathogenic variants (PVs) or gene groups would be predicted to have a different mortality hazard in comparison to the average hazard of people with the cancer. More specifically, for a person who has been diagnosed and is planning or undergoing treatment with a cancer of known stage and grade and then finds out that they have a PV, the pertinent question becomes whether this finding will have any implications for their mortality risk beyond what they already know about their diagnosis.
We could expect that the presence of a PV can potentially modify the age at which one gets cancer, the stage or grade at which it occurs and the treatment that is received. These clinical and treatment variables are then mediators between the presence of a PV and mortality as represented in the DAG below. However, given that a patient already knows that information, to answer the question described requires trying to estimate any direct effect of PV on mortality independent of any paths through the described mediators and any possible confounders of the relationship.
code
library(dagitty)
library(ggdag)
risk<-dagify(
Y ~ PV + C + M + C2, PV ~ C, M ~ PV + C2,
coords =list(
x=c(Y=7,C=3, PV=0, M =3 ,C2 = 7 ),
y=c(Y=0,C=-3,PV=0, M = 3, C2 = 3 )
),
exposure="PV",
outcome="Y",
labels=c(PV="PV", C= "PV-outcome \n Confounder" , Y=" Mortality", M = "Clinical/treatment Mediator",
C2="Mediator-outcome \n confounder")
)
myplot(risk)We wish to block all the clinical/treatment mediators as well as any confounders of the PV - mortality relationship in order to test for whether there is any direct effect of PV on mortality. Unfortunately, once we adjust for the mediators we need to also account for any mediator - outcome and any PV - mediator confounders as well. If we do not then the correlations in that pathway will be included in our estimate of the PV - mortality relationship.(Elwert and Winship 2014)
In the more detailed DAG below we show the mediators whose paths we wish to block (age at diagnosis, stage/grade and the treatment received which includes chemotherapy, surgery and radiation). We also include the receipt of HER2-therapy as a disease specific treatment variable for breast cancer and Rectal vs Colon location for colorectal cancer.
code
risk<-dagify(
Y ~ PV + M1 + M2 + M3 + C2 + R + YD, PV ~ R,
M1 ~PV + C2, M2 ~ PV + C2, M3 ~ PV + C2,
M3 ~ M1 + M2 + YD, R ~ C,
GT ~ YD + M1 + M2 + C2 + R,
coords =list(
x=c(Y=7,C=1.5, R = 1.5, PV=0, M1 = 1.5 , M2 = 2.5 ,M3 = 3,C2 = 7,YD= 5 ,GT = 7),
y=c(Y=0,C=-3.5,R = -1.5, PV=0, M1 = 1, M2=3 , M3= 6 , C2 = 3, YD=6, GT = 5 )
),
exposure="PV",
outcome="Y",
labels=c(PV="PV", C= "Shared Ancestry" , Y=" Mortality", M1 = "Age at diagnosis",
M2 = "Stage/grade", M3="Treatment", C2="Mediator-outcome \n confounders",
R = "race/ethnicity", YD = "year of diagnosis", GT = "gene testing")
)
set.seed(7)
set.seed(9670)
myplot(risk)There are not many confounders of the direct PV - mortality or PV - mediator relationships as in order to be a confounder it has to cause (requiring it to come before) the PV as well as the outcome. However, the most commonly raised potential confounder of gene outcome relationships is shared ancestry. The primary measure of shared ancestry that we have is the SEER race/ethnicity variable. Of the gene groups we have BRCA1/2 is the only one with an obvious shared ancestry group of Ashkenazi Jews who share a founder mutation. However, apart from this one group it seems that BRCA1/2 prevalence is not uncommon and quite similar across race and ethnic groups(Hall et al. 2009) and the latter seems true for the other gene groups. But we decided to still include this race variable as as a proxy for shared ancestry.
Again, the biggest problem with estimating the direct effect of PV on mortality is that once we have included the mediators we have to now block all the potential mediator-outcome confounders labeled collectively in the green box in the DAG. For example, given that we hypothesize that PVs may cause treatment changes, then any confounder of the treatment -> mortality relationship, such as access or quality of care poverty which limits access to treatment, opens up a collider path with respect to the PV mortality relationship. Inclusion of treatment as a collider, while it blocks the mediating path from PV -> Treatment -> Mortality, it will open a path from PV -> treatment <- Poverty -> Mortality. Thus we include the county level variables poverty, rurality as well as state as a proxy for quality and access related potential mediator-outcome confounders.
A second potential mediator-outcome confounder is the year of diagnosis which can cause treatment changes if cancer therapy changes and also might have an effect on outcome due to other changes in supportive care occur over time. As such it also has to be blocked to avoid a backdoor path from the treatment mediator to the outcome which would bias the PV - mortality relationship. An interesting feature of year of diagnosis is that selection into our cohort changed quite a bit from 2013-2019 as genetic testing rates increased significantly.(Kurian et al. 2023) Thus year of diagnosis is causally related to genetic testing. Given the additional causal pathways from the clinical characteristics of the cancer and race to genetic testing, a pathway opens up from year of diagnosis to mortality as the implied genetic testing covariate is a collider on this pathway.(Elwert and Winship 2014) This explains an odd finding seen in the regression results which is a large decrease in the hazard ratio for later years of diagnosis as genetic testing spreads from the highest risk groups to lower risk groups and the cohorts include an increasing number of people with lower clinical risk.
People do not get into the cohort unless they have a genetic test. This automatically includes genetic testing as a covariate whether we want it or not. Genetic testing is more common in higher risk cancers (early age of diagnosis or high stage and grade) and also potentially causally related to race/ethnicity as well as some of the access and quality variables and the relationships are changing over time. The concern of the selection mechanism would be if it introduced any backdoor paths between PV and mortality. However the DAG is getting a bit complicated to for us to easily detect this at this point.
Fortunately, once we have encoded the causal relationships we propose exist we can use functions that apply an axiomatic system for replacing probability formulas containing the causal relationships with ordinary conditional probabilities. That allows us to identify whether the adjustment set is able to evaluate a specific relationship without bias. This axiomatic system has the name ‘do-calculus’ invented by Judea Pearl.(Pearl 2010) Below we can show that if we condition on the variables in the DAG above the relationship between PV and mortality are d-separated which would allow us to estimate the presence or absence of a direct effect.
code
risk1<-dagify(
Y ~ M1 + M2 + M3 + C2 + R + YD, PV ~ R,
M1 ~PV + C2, M2 ~ PV + C2, M3 ~ PV + C2,
M3 ~ M1 + M2 + YD, R ~ C,
GT ~ YD + M1 + M2 + C2 + R,
coords =list(
x=c(Y=7,C=1.5, R = 1.5, PV=0, M1 = 1.5 , M2 = 2.5 ,M3 = 3,C2 = 7,YD= 5 ,GT = 7),
y=c(Y=0,C=-3.5,R = -1.5, PV=0, M1 = 1, M2=3 , M3= 6 , C2 = 3, YD=6, GT = 5 )
),
exposure="PV",
outcome="Y",
labels=c(PV="PV", C= "Shared Ancestry" , Y=" Mortality", M1 = "Age at diagnosis",
M2 = "Stage/grade", M3="Treatment", C2="Mediator-outcome \n confounders",
R = "race/ethnicity", YD = "year of diagnosis", GT = "gene testing")
)
set.seed(3920)
ggdag::ggdag_dseparated(
risk1,
from = "PV",
to = "Y",
controlling_for = c("M1", "M2", "M3","R","C2","GT","YD"),
text = FALSE,
use_labels = "label") +
theme_dag() +
labs(title = "If we add the confounders of all the mediators then PV and Y \n are independent as they should be if there is no direct path")Based on this we can argue that we may be able to identify the direct effect of PV on mortality. However, in case it is not obvious the two biggest assumptions in the selection of covariates with respect to identifying the PV - mortality relationship are as follows:
- There are no unmeasured confounders beyond race that capture important shared ancestry features that would cause both the PV and higher mortality.
- The covariate set of county level variables poverty, rurality as well as state are an adequate proxy for the quality and access related potential mediator-outcome confounders.
Although not always clear, both of these assumptions are required for all gene outcome studies if they condition on any treatment or clinical severity variables, not just ours. However, we believe proposing a specific DAG provides a principled way of justifying the selection of covariates and allows us to start to try to identify the key variable sets for making these estimates. Perhaps most importantly it offers a transparent way for others to critique our choices and to propose and defend through additional analyses the need for other covariates.
Fixed effects model
The fixed effects modeled are estimated using the coxph procedure from the Survival package.(Therneau et al. 2024)
Use of multilevel models to estimate gene group random effects
The implementation of multilevel models in survival data uses the term frailty models for a survival regression model that includes random effects. The term is often specifically used for those models that only include random effects for the intercept.(Austin 2017) When the random effects are shared by a cluster the models are known as shared frailty models. Given the mathematical form of the survival model, the random intercept in the linear predictor has a multiplicative effect on the hazard function.
For our purposes we could propose a data-generating hypothesis for the use of multilevel models that involves viewing the pathogenic variants that have been found in our population as a random sample of all the possible genetic variations that can have an effect on the mortality of people who develop cancer. However, in this case we are primarily using a multilevel model as a strategy to avoid over-fitting. Given that we are fitting 8 gene groups over 5 cancers there are a total of 40 estimates of interest. There is heterogeneity in the size of the gene groups with some quite small. Multilevel modeling can be thought of “as adaptive regularization where the model itself tries to learn how skeptical it should be.”(McElreath 2020) In this sense it has some similarities to other strategies to avoid over-fitting such as ridge regression. The gamma distribution acts as a regularizing prior for the gene effects although in this case the parameters for that prior are learned from the data.
For estimation we used the profile EM (expectation-maximization) algorithm as implemented in the FrailtyEM package.(Theodor Adrian Balan and Putter 2019)
Breast Cancer Models
Triple negative
code
can_type=1
covar<-eval(as.name(paste0("covar",can_type,sep="")))
mort <-get_data(cancer_type=can_type)
my_labels<-list(agedec ~ "Age (in decades from mean age)", race ~ "Race",
poverty ~ "Poverty", rurality ~ "Rurality", married ~ "Married",
yeardx_cat ~ "Year of diagnosis", stage ~ "Stage",
surgcat ~ "Surgery (Yes=1)", chemo ~ "Chemotherapy (Yes=1)",
radiation ~ "Radiation (Yes=1)", state~ "State",
her2therapy ~ "HER2 treatment (Yes=1)", grade ~ "Grade",
pvgroup ~ "Gene group")
gtsummary::tbl_summary(mort,label=my_labels, missing_text="missing",include=covar) |>
gtsummary::modify_caption("Table 1. Covariates - Triple negative") |>
gtsummary::modify_footnote(label = "Omitted category listed first for categorical variables" )| Characteristic1 | N = 11,8422 |
|---|---|
| Age (in decades from mean age) | -0.05 (-0.85, 0.75) |
| Race | |
| NH White | 6,061 (51%) |
| Asian | 1,105 (9.3%) |
| Black | 2,078 (18%) |
| Hispanic | 2,417 (20%) |
| Other | 181 (1.5%) |
| Poverty | |
| <10% poverty | 8,100 (68%) |
| 10%-19% poverty | 2,540 (21%) |
| 20%+ poverty | 1,202 (10%) |
| Rurality | |
| rural | 788 (6.7%) |
| urban | 11,054 (93%) |
| Married | 6,974 (59%) |
| Year of diagnosis | |
| 2013 | 1,348 (11%) |
| 2014 | 1,425 (12%) |
| 2015 | 1,606 (14%) |
| 2016 | 1,529 (13%) |
| 2017 | 1,668 (14%) |
| 2018 | 1,974 (17%) |
| 2019 | 2,292 (19%) |
| Stage | |
| 2 | 4,022 (37%) |
| 0 | 1,671 (16%) |
| 1 | 2,986 (28%) |
| 3 | 1,660 (15%) |
| 4 | 414 (3.9%) |
| missing | 1,089 |
| Surgery (Yes=1) | 7,751 (65%) |
| Chemotherapy (Yes=1) | 8,295 (70%) |
| Radiation (Yes=1) | 1,488 (13%) |
| State | |
| CA | 8,640 (73%) |
| GA | 3,202 (27%) |
| HER2 treatment (Yes=1) | 318 (2.7%) |
| Grade | |
| 2 | 1,890 (18%) |
| 1 | 345 (3.2%) |
| 3 | 8,287 (78%) |
| 4 | 105 (1.0%) |
| missing | 1,215 |
| Gene group | |
| Negative | 7,748 (65%) |
| ATM | 36 (0.3%) |
| BRCA 1/2 | 1,322 (11%) |
| CHEK2 | 69 (0.6%) |
| Lynch Syndrome | 48 (0.4%) |
| Other PV | 379 (3.2%) |
| PALB2 | 128 (1.1%) |
| VUS only | 2,112 (18%) |
| 1 Omitted category listed first for categorical variables | |
| 2 Median (Q1, Q3); n (%) | |
Model output
Random effects for gene group (as shown in figure 1)
code
# random effects
if(params$frailty=="on") {
m_emfrail<-get_random(mort,can_type,covar)
}
# fixed effects
m_fixed<-get_fixed(mort,covar)
# graph
## process fixed effects
fe<-process_fixed(m_fixed)
## process random effects
file_name=paste("data/","m_emfrail",can_type,".Rdata",sep="")
load(file_name)
logz<-process_random(m_emfrail)
## combine and graph
datt<-combine_effects(fe,logz)
p1<-graph_effects_vert(datt)
p1 <- forestploter::edit_plot(p1,
col = 1,
row = c(1:3,7),
gp = gpar(fontface = "italic"))
p1 <- forestploter::insert_text(p1, text = "Triple-Negative Breast Cancer",
part = "header",
col = 1:3,
gp = gpar(fontface = "bold"))
my_pval<-1/2*pchisq(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],1, lower.tail=FALSE)
my_chi<-paste0("Likelihood ratio test for heterogeneity \U{03A7}\U{00B2} = ",
round(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],2),", p",pvalr(my_pval))
p1 <- forestploter::insert_text(p1, text = my_chi,
part = "header", just = "right",
col = 3, gp = gpar(fontsize=10) )
ggsave(p1,filename=paste("img/","m_emfrail",can_type,".","svg",sep=""))
ggsave(p1,filename=paste("img/","m_emfrail",can_type,".","pdf",sep=""),device=cairo_pdf)code
gtsummary::tbl_regression(m_fixed,exponentiate=TRUE,label=my_labels)| Characteristic | HR1 | 95% CI1 | p-value |
|---|---|---|---|
| Age (in decades from mean age) | 1.00 | 0.95, 1.05 | >0.9 |
| Race | |||
| NH White | — | — | |
| Asian | 0.71 | 0.55, 0.92 | 0.010 |
| Black | 1.11 | 0.95, 1.29 | 0.2 |
| Hispanic | 1.09 | 0.93, 1.27 | 0.3 |
| Other | 0.38 | 0.14, 1.03 | 0.057 |
| Poverty | |||
| <10% poverty | — | — | |
| 10%-19% poverty | 0.91 | 0.77, 1.07 | 0.2 |
| 20%+ poverty | 1.09 | 0.90, 1.33 | 0.4 |
| Rurality | |||
| rural | — | — | |
| urban | 0.87 | 0.71, 1.07 | 0.2 |
| Married | |||
| No | — | — | |
| Yes | 0.97 | 0.86, 1.09 | 0.6 |
| Year of diagnosis | |||
| 2013 | — | — | |
| 2014 | 1.16 | 0.95, 1.41 | 0.2 |
| 2015 | 1.14 | 0.94, 1.38 | 0.2 |
| 2016 | 0.89 | 0.72, 1.09 | 0.3 |
| 2017 | 0.87 | 0.70, 1.07 | 0.2 |
| 2018 | 0.57 | 0.46, 0.72 | <0.001 |
| 2019 | 0.44 | 0.34, 0.57 | <0.001 |
| Stage | |||
| 2 | — | — | |
| 0 | 0.07 | 0.04, 0.13 | <0.001 |
| 1 | 0.37 | 0.30, 0.45 | <0.001 |
| 3 | 3.67 | 3.20, 4.20 | <0.001 |
| 4 | 11.3 | 9.49, 13.4 | <0.001 |
| Surgery (Yes=1) | |||
| No | — | — | |
| Yes | 1.60 | 1.37, 1.86 | <0.001 |
| Chemotherapy (Yes=1) | |||
| No | — | — | |
| Yes | 0.81 | 0.67, 0.99 | 0.035 |
| Radiation (Yes=1) | |||
| No | — | — | |
| Yes | 1.60 | 1.33, 1.94 | <0.001 |
| State | |||
| CA | — | — | |
| GA | 1.25 | 1.07, 1.45 | 0.004 |
| HER2 treatment (Yes=1) | |||
| No | — | — | |
| Yes | 1.14 | 0.86, 1.51 | 0.4 |
| Grade | |||
| 2 | — | — | |
| 1 | 0.79 | 0.47, 1.32 | 0.4 |
| 3 | 0.95 | 0.81, 1.11 | 0.5 |
| 4 | 0.81 | 0.38, 1.73 | 0.6 |
| Gene group | |||
| Negative | — | — | |
| ATM | 3.66 | 1.63, 8.21 | 0.002 |
| BRCA 1/2 | 0.62 | 0.51, 0.75 | <0.001 |
| CHEK2 | 0.34 | 0.08, 1.38 | 0.13 |
| Lynch Syndrome | 1.97 | 0.81, 4.75 | 0.13 |
| Other PV | 0.89 | 0.62, 1.27 | 0.5 |
| PALB2 | 0.73 | 0.40, 1.33 | 0.3 |
| VUS only | 0.99 | 0.84, 1.15 | 0.9 |
| 1 HR = Hazard Ratio, CI = Confidence Interval | |||
Note the decreasing hazard ratio with year of diagnosis that is a product of the spread of genetic testing to lower clinical risk groups as discussed as part of the variable selection rationale in (model-strat?) above.
Tests of heterogeneity
code
fvar<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$fr_var),scientific=FALSE)
ca<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$ca_test),scientific=FALSE)
lrt<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$loglik),scientific=FALSE)
p11<-table_fig(fvar,tbl_title="Variance",.9)
p12<-table_fig(ca,tbl_title="Commenges-Andersen test",1)
p13<-table_fig(lrt,tbl_title="Likelihood ratio test",.8)
cowplot::plot_grid(p11,p12,p13,nrow=1)Breast HER2+
code
covar2<-covar1
can_type=2
covar<-eval(as.name(paste0("covar",can_type,sep="")))
mort <-get_data(cancer_type=can_type)
# mylabels as defined above for triple negative dz
gtsummary::tbl_summary(mort,label=my_labels, missing_text="missing",include=covar) |>
gtsummary::modify_caption("Table 1. Covariates - HER2+") |>
gtsummary::modify_footnote(label = "Omitted category listed first for categorical variables" )| Characteristic1 | N = 9,9572 |
|---|---|
| Age (in decades from mean age) | -0.24 (-0.84, 0.76) |
| Race | |
| NH White | 5,453 (55%) |
| Asian | 1,346 (14%) |
| Black | 1,149 (12%) |
| Hispanic | 1,928 (19%) |
| Other | 81 (0.8%) |
| Poverty | |
| <10% poverty | 7,011 (70%) |
| 10%-19% poverty | 2,054 (21%) |
| 20%+ poverty | 892 (9.0%) |
| Rurality | |
| rural | 612 (6.1%) |
| urban | 9,345 (94%) |
| Married | 6,244 (63%) |
| Year of diagnosis | |
| 2013 | 1,066 (11%) |
| 2014 | 1,182 (12%) |
| 2015 | 1,435 (14%) |
| 2016 | 1,442 (14%) |
| 2017 | 1,477 (15%) |
| 2018 | 1,556 (16%) |
| 2019 | 1,799 (18%) |
| Stage | |
| 2 | 3,508 (37%) |
| 0 | 276 (2.9%) |
| 1 | 3,992 (42%) |
| 3 | 1,173 (12%) |
| 4 | 547 (5.8%) |
| missing | 461 |
| Surgery (Yes=1) | 6,812 (68%) |
| Chemotherapy (Yes=1) | 8,317 (84%) |
| Radiation (Yes=1) | 1,018 (10%) |
| State | |
| CA | 7,400 (74%) |
| GA | 2,557 (26%) |
| HER2 treatment (Yes=1) | 7,087 (71%) |
| Grade | |
| 2 | 3,639 (38%) |
| 1 | 455 (4.8%) |
| 3 | 5,349 (56%) |
| 4 | 36 (0.4%) |
| missing | 478 |
| Gene group | |
| Negative | 6,829 (69%) |
| ATM | 108 (1.1%) |
| BRCA 1/2 | 351 (3.5%) |
| CHEK2 | 209 (2.1%) |
| Lynch Syndrome | 25 (0.3%) |
| Other PV | 300 (3.0%) |
| PALB2 | 45 (0.5%) |
| VUS only | 2,090 (21%) |
| 1 Omitted category listed first for categorical variables | |
| 2 Median (Q1, Q3); n (%) | |
Model output
Random effects for gene group (as shown in figure 1)
code
# random effects
if(params$frailty=="on") {
m_emfrail<-get_random(mort,can_type,covar) #random w. Lynch, fixed w.out
}
# fixed effects
levels(mort$pvgroup)<-c("Negative","ATM","BRCA 1/2","CHEK2","Other PV","Other PV","PALB2","VUS only") # Lynch(4) combined w other(6)
m_fixed<-get_fixed(mort,covar)
# graph
## process fixed effects
fe<-process_fixed(m_fixed)
## process random effects
file_name=paste("data/","m_emfrail",can_type,".Rdata",sep="")
load(file_name)
logz<-process_random(m_emfrail)
## combine and graph
datt<-combine_effects(fe,logz) |>
mutate(type= replace_na(type,"fixed effects"))
p2<-graph_effects_vert(datt)
p2 <- forestploter::edit_plot(p2,
col = 1,
row = c(1:3,7),
gp = gpar(fontface = "italic"))
p2 <- forestploter::insert_text(p2, text = expression(bold(paste(bolditalic("HER2"),"-Positive Breast Cancer"))),
part = "header",
col = 1:3)
my_pval<-1/2*pchisq(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],1, lower.tail=FALSE)
my_chi<-paste0("Likelihood ratio test for heterogeneity \U{03A7}\U{00B2} = ",round(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],2),", p",pvalr(my_pval))
p2 <- forestploter::insert_text(p2, text = my_chi,
part = "header", just = "right",
col = 3, gp = gpar(fontsize=10) )
ggsave(p2,filename=paste("img/","m_emfrail",can_type,".","svg",sep=""))
ggsave(p2,filename=paste("img/","m_emfrail",can_type,".","pdf",sep=""),device=cairo_pdf)The fixed effects cox model does not converge to stable estimates for the pvgroup when the Lynch genes are included. There are relatively few observations (25) in those with HER2+ breast cancer. Thus Lynch was collapsed into the “Other PV” category for the fixed effects analysis. That is why the estimate is missing in the above figure.
code
my_labels<-list(agedec ~ "Age (in decades)", race ~ "Race", poverty ~ "Poverty",
rurality ~ "Rurality", married ~ "Married", yeardx_cat ~ "Year of diagnosis",
stage ~ "Stage", surgcat ~ "Surgery", chemo ~ "Chemotherapy",
radiation ~ "Radiation", state~ "State", her2therapy ~ "HER2 treatment",
grade ~ "Grade", pvgroup ~ "Gene group")
gtsummary::tbl_regression(m_fixed,exponentiate=TRUE,label=my_labels)| Characteristic | HR1 | 95% CI1 | p-value |
|---|---|---|---|
| Age (in decades) | 1.10 | 1.01, 1.19 | 0.028 |
| Race | |||
| NH White | — | — | |
| Asian | 1.09 | 0.80, 1.50 | 0.6 |
| Black | 1.42 | 1.08, 1.88 | 0.013 |
| Hispanic | 1.29 | 0.99, 1.69 | 0.057 |
| Other | 0.86 | 0.21, 3.47 | 0.8 |
| Poverty | |||
| <10% poverty | — | — | |
| 10%-19% poverty | 0.99 | 0.75, 1.30 | >0.9 |
| 20%+ poverty | 1.75 | 1.27, 2.42 | <0.001 |
| Rurality | |||
| rural | — | — | |
| urban | 1.30 | 0.84, 2.01 | 0.2 |
| Married | |||
| No | — | — | |
| Yes | 0.78 | 0.64, 0.94 | 0.011 |
| Year of diagnosis | |||
| 2013 | — | — | |
| 2014 | 1.02 | 0.76, 1.37 | >0.9 |
| 2015 | 0.83 | 0.61, 1.13 | 0.2 |
| 2016 | 0.89 | 0.64, 1.23 | 0.5 |
| 2017 | 0.81 | 0.56, 1.17 | 0.3 |
| 2018 | 0.65 | 0.41, 1.02 | 0.060 |
| 2019 | 0.67 | 0.38, 1.17 | 0.2 |
| Stage | |||
| 2 | — | — | |
| 0 | 0.08 | 0.02, 0.32 | <0.001 |
| 1 | 0.34 | 0.24, 0.49 | <0.001 |
| 3 | 3.01 | 2.36, 3.86 | <0.001 |
| 4 | 8.84 | 6.82, 11.5 | <0.001 |
| Surgery | |||
| No | — | — | |
| Yes | 1.48 | 1.12, 1.94 | 0.005 |
| Chemotherapy | |||
| No | — | — | |
| Yes | 0.56 | 0.41, 0.77 | <0.001 |
| Radiation | |||
| No | — | — | |
| Yes | 1.42 | 1.06, 1.92 | 0.020 |
| State | |||
| CA | — | — | |
| GA | 1.54 | 1.19, 1.99 | <0.001 |
| HER2 treatment | |||
| No | — | — | |
| Yes | 0.66 | 0.52, 0.83 | <0.001 |
| Grade | |||
| 2 | — | — | |
| 1 | 1.07 | 0.58, 1.95 | 0.8 |
| 3 | 1.41 | 1.13, 1.75 | 0.002 |
| 4 | 0.83 | 0.11, 6.02 | 0.9 |
| Gene group | |||
| Negative | — | — | |
| ATM | 0.43 | 0.11, 1.72 | 0.2 |
| BRCA 1/2 | 1.03 | 0.64, 1.67 | 0.9 |
| CHEK2 | 0.14 | 0.02, 1.02 | 0.052 |
| Other PV | 0.53 | 0.24, 1.20 | 0.13 |
| PALB2 | 0.93 | 0.23, 3.80 | >0.9 |
| VUS only | 0.79 | 0.60, 1.04 | 0.090 |
| 1 HR = Hazard Ratio, CI = Confidence Interval | |||
Note the decreasing hazard ratio with year of diagnosis that is a product of the spread of genetic testing to lower clinical risk groups as discussed as part of the variable selection rationale in (model-strat?) above.
Tests of heterogeneity
code
fvar<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$fr_var),scientific=FALSE)
ca<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$ca_test),scientific=FALSE)
lrt<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$loglik),scientific=FALSE)
p11<-table_fig(fvar,tbl_title="Variance",.9)
p12<-table_fig(ca,tbl_title="Commenges-Andersen test",1)
p13<-table_fig(lrt,tbl_title="Likelihood ratio test",.8)
cowplot::plot_grid(p11,p12,p13,nrow=1)Breast ER/PR+, HER2-
code
covar3<-covar1
can_type=3 # use this code for cancer_types 2,3,5
covar<-eval(as.name(paste0("covar",can_type,sep="")))
mort<-get_data(cancer_type=can_type)
# mylabels as defined above for triple negative dz
gtsummary::tbl_summary(mort,label=my_labels, missing_text="missing",include=covar) |>
gtsummary::modify_caption("Table 1. Covariates - ER/PR+,HER2-") |>
gtsummary::modify_footnote(label = "Omitted category listed first for categorical variables" )| Characteristic1 | N = 48,4732 |
|---|---|
| Age (in decades) | -0.19 (-0.99, 0.91) |
| Race | |
| NH White | 29,727 (61%) |
| Asian | 5,827 (12%) |
| Black | 4,697 (9.7%) |
| Hispanic | 7,751 (16%) |
| Other | 471 (1.0%) |
| Poverty | |
| <10% poverty | 34,965 (72%) |
| 10%-19% poverty | 9,672 (20%) |
| 20%+ poverty | 3,836 (7.9%) |
| Rurality | |
| rural | 2,733 (5.6%) |
| urban | 45,740 (94%) |
| Married | 30,684 (63%) |
| Year of diagnosis | |
| 2013 | 4,811 (9.9%) |
| 2014 | 5,328 (11%) |
| 2015 | 6,039 (12%) |
| 2016 | 6,325 (13%) |
| 2017 | 6,827 (14%) |
| 2018 | 8,377 (17%) |
| 2019 | 10,766 (22%) |
| Stage | |
| 2 | 10,033 (21%) |
| 0 | 8,001 (17%) |
| 1 | 24,382 (52%) |
| 3 | 3,350 (7.1%) |
| 4 | 1,172 (2.5%) |
| missing | 1,535 |
| Surgery | 25,282 (52%) |
| Chemotherapy | 15,340 (32%) |
| Radiation | 16,196 (33%) |
| State | |
| CA | 36,878 (76%) |
| GA | 11,595 (24%) |
| HER2 treatment | 747 (1.5%) |
| Grade | |
| 2 | 23,404 (50%) |
| 1 | 12,285 (26%) |
| 3 | 11,262 (24%) |
| 4 | 209 (0.4%) |
| missing | 1,313 |
| Gene group | |
| Negative | 33,618 (69%) |
| ATM | 399 (0.8%) |
| BRCA 1/2 | 1,975 (4.1%) |
| CHEK2 | 723 (1.5%) |
| Lynch Syndrome | 206 (0.4%) |
| Other PV | 1,398 (2.9%) |
| PALB2 | 343 (0.7%) |
| VUS only | 9,811 (20%) |
| 1 Omitted category listed first for categorical variables | |
| 2 Median (Q1, Q3); n (%) | |
Model output
Random effects for gene group (as shown in figure 1)
code
# random effects
if(params$frailty=="on") {
m_emfrail<-get_random(mort,can_type,covar)
}
# fixed effects
m_fixed<-get_fixed(mort,covar)
# graph
## process fixed effects
fe<-process_fixed(m_fixed)
## process random effects
file_name=paste("data/","m_emfrail",can_type,".Rdata",sep="")
load(file_name)
logz<-process_random(m_emfrail)
## combine and graph
datt<-combine_effects(fe,logz)
#plot(graph_effects_vert(datt))
p3<-graph_effects_vert(datt)
p3 <- forestploter::edit_plot(p3,
col = 1,
row = c(1:3,7),
gp = gpar(fontface = "italic"))
p3 <- forestploter::insert_text(p3, text = expression(bold(paste("ER/PR-Positive",bolditalic(" HER2"),"-Negative Breast Cancer"))),
part = "header",
col = 1:3)
my_pval<-1/2*pchisq(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],1, lower.tail=FALSE)
my_chi<-paste0("Likelihood ratio test for heterogeneity \U{03A7}\U{00B2} = ",round(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],2),", p",pvalr(my_pval))
p3 <- forestploter::insert_text(p3, text = my_chi,
part = "header", just = "right",
col = 3, gp = gpar(fontsize=10) )
ggsave(p3,filename=paste("img/","m_emfrail",can_type,".","svg",sep=""))
ggsave(p3,filename=paste("img/","m_emfrail",can_type,".","pdf",sep=""),device=cairo_pdf)code
my_labels<-list(agedec ~ "Age (in decades)", race ~ "Race", poverty ~ "Poverty",
rurality ~ "Rurality", married ~ "Married", yeardx_cat ~ "Year of diagnosis",
stage ~ "Stage", surgcat ~ "Surgery", chemo ~ "Chemotherapy",
radiation ~ "Radiation", state~ "State", her2therapy ~ "HER2 treatment",
grade ~ "Grade", pvgroup ~ "Gene group")
gtsummary::tbl_regression(m_fixed,exponentiate=TRUE,label=my_labels)| Characteristic | HR1 | 95% CI1 | p-value |
|---|---|---|---|
| Age (in decades) | 1.24 | 1.19, 1.30 | <0.001 |
| Race | |||
| NH White | — | — | |
| Asian | 0.91 | 0.76, 1.10 | 0.4 |
| Black | 1.40 | 1.20, 1.63 | <0.001 |
| Hispanic | 1.14 | 0.98, 1.32 | 0.082 |
| Other | 0.90 | 0.45, 1.80 | 0.8 |
| Poverty | |||
| <10% poverty | — | — | |
| 10%-19% poverty | 1.20 | 1.04, 1.38 | 0.012 |
| 20%+ poverty | 1.51 | 1.26, 1.81 | <0.001 |
| Rurality | |||
| rural | — | — | |
| urban | 1.0 | 0.81, 1.22 | >0.9 |
| Married | |||
| No | — | — | |
| Yes | 0.79 | 0.71, 0.87 | <0.001 |
| Year of diagnosis | |||
| 2013 | — | — | |
| 2014 | 0.82 | 0.70, 0.96 | 0.016 |
| 2015 | 0.82 | 0.70, 0.97 | 0.017 |
| 2016 | 0.76 | 0.64, 0.90 | 0.002 |
| 2017 | 0.60 | 0.49, 0.73 | <0.001 |
| 2018 | 0.60 | 0.48, 0.75 | <0.001 |
| 2019 | 0.53 | 0.41, 0.68 | <0.001 |
| Stage | |||
| 2 | — | — | |
| 0 | 0.14 | 0.10, 0.20 | <0.001 |
| 1 | 0.52 | 0.44, 0.61 | <0.001 |
| 3 | 2.95 | 2.58, 3.38 | <0.001 |
| 4 | 12.2 | 10.6, 14.2 | <0.001 |
| Surgery | |||
| No | — | — | |
| Yes | 1.46 | 1.28, 1.67 | <0.001 |
| Chemotherapy | |||
| No | — | — | |
| Yes | 1.30 | 1.14, 1.49 | <0.001 |
| Radiation | |||
| No | — | — | |
| Yes | 0.95 | 0.82, 1.10 | 0.5 |
| State | |||
| CA | — | — | |
| GA | 1.54 | 1.35, 1.76 | <0.001 |
| HER2 treatment | |||
| No | — | — | |
| Yes | 1.01 | 0.79, 1.29 | >0.9 |
| Grade | |||
| 2 | — | — | |
| 1 | 0.64 | 0.53, 0.77 | <0.001 |
| 3 | 2.17 | 1.95, 2.42 | <0.001 |
| 4 | 1.02 | 0.32, 3.18 | >0.9 |
| Gene group | |||
| Negative | — | — | |
| ATM | 0.75 | 0.39, 1.44 | 0.4 |
| BRCA 1/2 | 1.23 | 1.03, 1.47 | 0.025 |
| CHEK2 | 0.91 | 0.58, 1.44 | 0.7 |
| Lynch Syndrome | 1.97 | 1.05, 3.68 | 0.034 |
| Other PV | 1.07 | 0.77, 1.50 | 0.7 |
| PALB2 | 1.20 | 0.73, 1.98 | 0.5 |
| VUS only | 0.94 | 0.81, 1.09 | 0.4 |
| 1 HR = Hazard Ratio, CI = Confidence Interval | |||
Note the decreasing hazard ratio with year of diagnosis that is a product of the spread of genetic testing to lower clinical risk groups as discussed as part of the variable selection rationale in (model-strat?) above.
Tests of heterogeneity
code
fvar<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$fr_var),scientific=FALSE)
ca<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$ca_test),scientific=FALSE)
lrt<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$loglik),scientific=FALSE)
p11<-table_fig(fvar,tbl_title="Variance",.9)
p12<-table_fig(ca,tbl_title="Commenges-Andersen test",1)
p13<-table_fig(lrt,tbl_title="Likelihood ratio test",.8)
cowplot::plot_grid(p11,p12,p13,nrow=1)Colorectal
code
covar4<-c("agedec","race","sex","poverty","rurality","married","yeardx_cat","stage",
"surgcat","chemo","radiation","state","colon_rectal","grade","pvgroup") #colorectal
can_type=4
covar<-eval(as.name(paste0("covar",can_type,sep="")))
mort<-get_data(cancer_type=can_type)
my_labels<-list(agedec ~ "Age (in decades from mean age)", race ~ "Race", sex ~ "Sex",
poverty ~ "Poverty", rurality ~ "Rurality", married ~ "Married",
yeardx_cat ~ "Year of diagnosis", stage ~ "Stage",
surgcat ~ "Surgery (Yes=1)", chemo ~ "Chemotherapy (Yes=1)",
radiation ~ "Radiation (Yes=1)", state~ "State",
colon_rectal ~ "Colorectal (Rectal=1)", grade ~ "Grade",
pvgroup ~ "Gene group")
gtsummary::tbl_summary(mort,label=my_labels, missing_text="missing",include=covar) |>
gtsummary::modify_caption("Table 1. Covariates - Colorectal") |>
gtsummary::modify_footnote(label = "Omitted category listed first for categorical variables" )| Characteristic1 | N = 5,8222 |
|---|---|
| Age (in decades from mean age) | -0.20 (-1.00, 1.00) |
| Race | |
| NH White | 3,396 (58%) |
| Asian | 589 (10%) |
| Black | 534 (9.2%) |
| Hispanic | 1,244 (21%) |
| Other | 59 (1.0%) |
| Sex | |
| female | 3,148 (54%) |
| male | 2,674 (46%) |
| Poverty | |
| <10% poverty | 3,915 (67%) |
| 10%-19% poverty | 1,315 (23%) |
| 20%+ poverty | 592 (10%) |
| Rurality | |
| rural | 399 (6.9%) |
| urban | 5,423 (93%) |
| Married | 3,531 (61%) |
| Year of diagnosis | |
| 2013 | 366 (6.3%) |
| 2014 | 509 (8.7%) |
| 2015 | 638 (11%) |
| 2016 | 787 (14%) |
| 2017 | 941 (16%) |
| 2018 | 1,103 (19%) |
| 2019 | 1,478 (25%) |
| Stage | |
| 2 | 1,377 (25%) |
| 0 | 146 (2.6%) |
| 1 | 1,100 (20%) |
| 3 | 1,770 (32%) |
| 4 | 1,143 (21%) |
| missing | 286 |
| Surgery (Yes=1) | 4,798 (82%) |
| Chemotherapy (Yes=1) | 3,344 (57%) |
| Radiation (Yes=1) | 834 (14%) |
| State | |
| CA | 4,483 (77%) |
| GA | 1,339 (23%) |
| Colorectal (Rectal=1) | |
| colon | 4,148 (71%) |
| rectal | 1,674 (29%) |
| Grade | |
| 2 | 3,609 (68%) |
| 1 | 722 (14%) |
| 3 | 881 (17%) |
| 4 | 103 (1.9%) |
| missing | 507 |
| Gene group | |
| Negative | 3,468 (60%) |
| ATM | 46 (0.8%) |
| BRCA 1/2 | 88 (1.5%) |
| CHEK2 | 48 (0.8%) |
| Lynch Syndrome | 661 (11%) |
| Other PV | 308 (5.3%) |
| PALB2 | 9 (0.2%) |
| VUS only | 1,194 (21%) |
| 1 Omitted category listed first for categorical variables | |
| 2 Median (Q1, Q3); n (%) | |
Model output
Random effects for gene group (as shown in figure 1)
code
# random effects
if(params$frailty=="on") {
m_emfrail<-get_random(mort,can_type,covar)
}
# fixed effects
m_fixed<-get_fixed(mort,covar)
# graph
## process fixed effects
fe<-process_fixed(m_fixed)
## process random effects
file_name=paste("data/","m_emfrail",can_type,".Rdata",sep="")
load(file_name)
logz<-process_random(m_emfrail)
## combine and graph
datt<-combine_effects(fe,logz)
p4<-graph_effects_vert(datt)
p4 <- forestploter::edit_plot(p4,
col = 1,
row = c(1:3,7),
gp = gpar(fontface = "italic"))
p4 <- forestploter::insert_text(p4, text = "Colorectal Cancer",
part = "header",
col = 1:3,
gp = gpar(fontface = "bold"))
my_pval<-1/2*pchisq(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],1, lower.tail=FALSE)
my_chi<-paste0("Likelihood ratio test for heterogeneity \U{03A7}\U{00B2} = ",round(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],2),", p",pvalr(my_pval))
p4 <- forestploter::insert_text(p4, text = my_chi,
part = "header", just = "right",
col = 3, gp = gpar(fontsize=10) )
ggsave(p4,filename=paste("img/","m_emfrail",can_type,".","svg",sep=""))
ggsave(p4,filename=paste("img/","m_emfrail",can_type,".","pdf",sep=""),device=cairo_pdf)code
my_labels<-list(agedec ~ "Age (in decades)", race ~ "Race", poverty ~ "Poverty",
rurality ~ "Rurality", married ~ "Married", yeardx_cat ~ "Year of diagnosis",
stage ~ "Stage", surgcat ~ "Surgery", chemo ~ "Chemotherapy",
radiation ~ "Radiation", state~ "State",
grade ~ "Grade", pvgroup ~ "Gene group")
gtsummary::tbl_regression(m_fixed,exponentiate=TRUE,label=my_labels)| Characteristic | HR1 | 95% CI1 | p-value |
|---|---|---|---|
| Age (in decades) | 1.16 | 1.10, 1.23 | <0.001 |
| Race | |||
| NH White | — | — | |
| Asian | 1.13 | 0.87, 1.46 | 0.4 |
| Black | 1.09 | 0.86, 1.40 | 0.5 |
| Hispanic | 1.26 | 1.03, 1.53 | 0.022 |
| Other | 1.41 | 0.52, 3.81 | 0.5 |
| Sex | |||
| female | — | — | |
| male | 1.10 | 0.95, 1.27 | 0.2 |
| Poverty | |||
| <10% poverty | — | — | |
| 10%-19% poverty | 0.97 | 0.80, 1.19 | 0.8 |
| 20%+ poverty | 1.28 | 0.99, 1.67 | 0.061 |
| Rurality | |||
| rural | — | — | |
| urban | 0.84 | 0.64, 1.11 | 0.2 |
| Married | |||
| No | — | — | |
| Yes | 0.91 | 0.78, 1.05 | 0.2 |
| Year of diagnosis | |||
| 2013 | — | — | |
| 2014 | 0.89 | 0.67, 1.20 | 0.5 |
| 2015 | 0.84 | 0.62, 1.12 | 0.2 |
| 2016 | 0.62 | 0.47, 0.83 | 0.001 |
| 2017 | 0.63 | 0.47, 0.84 | 0.002 |
| 2018 | 0.60 | 0.44, 0.81 | <0.001 |
| 2019 | 0.33 | 0.23, 0.46 | <0.001 |
| Stage | |||
| 2 | — | — | |
| 0 | 1.58 | 0.63, 3.94 | 0.3 |
| 1 | 0.40 | 0.25, 0.65 | <0.001 |
| 3 | 1.96 | 1.45, 2.64 | <0.001 |
| 4 | 11.1 | 8.28, 15.0 | <0.001 |
| Surgery | |||
| No | — | — | |
| Yes | 0.53 | 0.45, 0.64 | <0.001 |
| Chemotherapy | |||
| No | — | — | |
| Yes | 1.16 | 0.88, 1.53 | 0.3 |
| Radiation | |||
| No | — | — | |
| Yes | 1.06 | 0.82, 1.37 | 0.7 |
| State | |||
| CA | — | — | |
| GA | 1.43 | 1.18, 1.73 | <0.001 |
| Colorectal type | |||
| colon | — | — | |
| rectal | 0.84 | 0.68, 1.03 | 0.10 |
| Grade | |||
| 2 | — | — | |
| 1 | 0.85 | 0.64, 1.15 | 0.3 |
| 3 | 1.88 | 1.59, 2.21 | <0.001 |
| 4 | 1.39 | 0.92, 2.08 | 0.12 |
| Gene group | |||
| Negative | — | — | |
| ATM | 1.33 | 0.63, 2.81 | 0.5 |
| BRCA 1/2 | 1.56 | 1.00, 2.44 | 0.048 |
| CHEK2 | 0.99 | 0.49, 2.01 | >0.9 |
| Lynch Syndrome | 0.58 | 0.43, 0.80 | <0.001 |
| Other PV | 0.79 | 0.55, 1.13 | 0.2 |
| PALB2 | 1.47 | 0.47, 4.62 | 0.5 |
| VUS only | 0.85 | 0.71, 1.03 | 0.10 |
| 1 HR = Hazard Ratio, CI = Confidence Interval | |||
Note the decreasing hazard ratio with year of diagnosis that is a product of the spread of genetic testing to lower clinical risk groups as discussed as part of the variable selection rationale in (model-strat?) above.
Tests of heterogeneity
code
fvar<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$fr_var),scientific=FALSE)
ca<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$ca_test),scientific=FALSE)
lrt<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$loglik),scientific=FALSE)
p11<-table_fig(fvar,tbl_title="Variance",.9)
p12<-table_fig(ca,tbl_title="Commenges-Andersen test",1)
p13<-table_fig(lrt,tbl_title="Likelihood ratio test",.8)
cowplot::plot_grid(p11,p12,p13,nrow=1)Pancreas
code
covar5<-c("agedec","race","sex","poverty","rurality","married","yeardx_cat","stage","surgcat","chemo","radiation","state","pvgroup") #pancreas
can_type=5 # use this code for cancer_types 2,3,5
covar<-eval(as.name(paste0("covar",can_type,sep="")))
mort<-get_data(cancer_type=can_type)
my_labels<-list(agedec ~ "Age (in decades from mean age)", race ~ "Race", sex ~ "Sex",
poverty ~ "Poverty", rurality ~ "Rurality", married ~ "Married",
yeardx_cat ~ "Year of diagnosis", stage ~ "Stage",
surgcat ~ "Surgery (Yes=1)", chemo ~ "Chemotherapy (Yes=1)",
radiation ~ "Radiation (Yes=1)", state~ "State",
pvgroup ~ "Gene group")
gtsummary::tbl_summary(mort,label=my_labels, missing_text="missing",include=covar) |>
gtsummary::modify_caption("Table 1. Covariates - Pancreas") |>
gtsummary::modify_footnote(label = "Omitted category listed first for categorical variables" )| Characteristic1 | N = 1,8612 |
|---|---|
| Age (in decades from mean age) | 0.19 (-0.71, 0.89) |
| Race | |
| NH White | 1,224 (66%) |
| Asian | 227 (12%) |
| Black | 139 (7.5%) |
| Hispanic | 261 (14%) |
| Other | 10 (0.5%) |
| Sex | |
| female | 1,004 (54%) |
| male | 857 (46%) |
| Poverty | |
| <10% poverty | 1,371 (74%) |
| 10%-19% poverty | 343 (18%) |
| 20%+ poverty | 147 (7.9%) |
| Rurality | |
| rural | 130 (7.0%) |
| urban | 1,731 (93%) |
| Married | 1,268 (68%) |
| Year of diagnosis | |
| 2013 | 50 (2.7%) |
| 2014 | 65 (3.5%) |
| 2015 | 98 (5.3%) |
| 2016 | 172 (9.2%) |
| 2017 | 200 (11%) |
| 2018 | 395 (21%) |
| 2019 | 881 (47%) |
| Stage | |
| 2 | 382 (22%) |
| 0 | 3 (0.2%) |
| 1 | 288 (16%) |
| 3 | 320 (18%) |
| 4 | 771 (44%) |
| missing | 97 |
| Surgery (Yes=1) | 507 (27%) |
| Chemotherapy (Yes=1) | 1,625 (87%) |
| Radiation (Yes=1) | 177 (9.5%) |
| State | |
| CA | 1,518 (82%) |
| GA | 343 (18%) |
| Gene group | |
| Negative | 1,032 (55%) |
| ATM | 49 (2.6%) |
| BRCA 1/2 | 103 (5.5%) |
| CHEK2 | 23 (1.2%) |
| Lynch Syndrome | 15 (0.8%) |
| Other PV | 152 (8.2%) |
| PALB2 | 15 (0.8%) |
| VUS only | 472 (25%) |
| 1 Omitted category listed first for categorical variables | |
| 2 Median (Q1, Q3); n (%) | |
For pancreatic cancer the models stage 0 and 1 were collapsed given small sample sizes in those with Stage 0.
Model output
Random effects for gene group (as shown in figure 1)
code
mort$stage<-forcats::fct_collapse(mort$stage, "0-1" = c("0","1"))
# random effects
if(params$frailty=="on") {
m_emfrail<-get_random(mort,can_type,covar)
}
# fixed effects
m_fixed<-get_fixed(mort,covar)
# graph
## process fixed effects
fe<-process_fixed(m_fixed)
## process random effects
file_name=paste("data/","m_emfrail",can_type,".Rdata",sep="")
load(file_name)
logz<-process_random(m_emfrail)
## combine and graph
datt<-combine_effects(fe,logz)
p5<-graph_effects_vert(datt)
p5 <- forestploter::edit_plot(p5,
col = 1,
row = c(1:3,7),
gp = gpar(fontface = "italic"))
p5 <- forestploter::insert_text(p5, text = "Pancreatic Cancer",
part = "header",
col = 1:3,
gp = gpar(fontface = "bold"))
my_pval<-1/2*pchisq(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],1, lower.tail=FALSE)
my_chi<-paste0("Likelihood ratio test for heterogeneity \U{03A7}\U{00B2} = ",round(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],2),", p",pvalr(my_pval))
p5 <- forestploter::insert_text(p5, text = my_chi,
part = "header", just = "right",
col = 3, gp = gpar(fontsize=10) )
ggsave(p5,filename=paste("img/","m_emfrail",can_type,".","svg",sep=""))
ggsave(p5,filename=paste("img/","m_emfrail",can_type,".","pdf",sep=""),device=cairo_pdf)code
my_labels<-list(agedec ~ "Age (in decades)", race ~ "Race", poverty ~ "Poverty",
rurality ~ "Rurality", married ~ "Married", yeardx_cat ~ "Year of diagnosis",
stage ~ "Stage", surgcat ~ "Surgery", chemo ~ "Chemotherapy",
radiation ~ "Radiation", state~ "State",
pvgroup ~ "Gene group")
gtsummary::tbl_regression(m_fixed,exponentiate=TRUE,label=my_labels)| Characteristic | HR1 | 95% CI1 | p-value |
|---|---|---|---|
| Age (in decades) | 1.13 | 1.07, 1.20 | <0.001 |
| Race | |||
| NH White | — | — | |
| Asian | 1.08 | 0.88, 1.33 | 0.5 |
| Black | 0.95 | 0.75, 1.21 | 0.7 |
| Hispanic | 1.19 | 0.98, 1.45 | 0.076 |
| Other | 1.55 | 0.73, 3.31 | 0.3 |
| Sex | |||
| female | — | — | |
| male | 1.06 | 0.93, 1.20 | 0.4 |
| Poverty | |||
| <10% poverty | — | — | |
| 10%-19% poverty | 1.16 | 0.98, 1.38 | 0.091 |
| 20%+ poverty | 1.38 | 1.08, 1.76 | 0.009 |
| Rurality | |||
| rural | — | — | |
| urban | 1.14 | 0.89, 1.46 | 0.3 |
| Married | |||
| No | — | — | |
| Yes | 0.89 | 0.78, 1.03 | 0.11 |
| Year of diagnosis | |||
| 2013 | — | — | |
| 2014 | 0.82 | 0.54, 1.26 | 0.4 |
| 2015 | 0.69 | 0.46, 1.03 | 0.066 |
| 2016 | 0.67 | 0.46, 0.97 | 0.035 |
| 2017 | 0.61 | 0.42, 0.89 | 0.010 |
| 2018 | 0.58 | 0.41, 0.84 | 0.004 |
| 2019 | 0.44 | 0.31, 0.63 | <0.001 |
| Stage | |||
| 2 | — | — | |
| 0-1 | 0.86 | 0.67, 1.10 | 0.2 |
| 3 | 1.05 | 0.85, 1.31 | 0.6 |
| 4 | 1.52 | 1.25, 1.84 | <0.001 |
| Surgery | |||
| No | — | — | |
| Yes | 0.43 | 0.35, 0.52 | <0.001 |
| Chemotherapy | |||
| No | — | — | |
| Yes | 2.04 | 1.57, 2.64 | <0.001 |
| Radiation | |||
| No | — | — | |
| Yes | 0.87 | 0.70, 1.09 | 0.2 |
| State | |||
| CA | — | — | |
| GA | 1.67 | 1.41, 1.96 | <0.001 |
| Gene group | |||
| Negative | — | — | |
| ATM | 0.87 | 0.58, 1.29 | 0.5 |
| BRCA 1/2 | 0.68 | 0.52, 0.90 | 0.006 |
| CHEK2 | 0.61 | 0.33, 1.15 | 0.13 |
| Lynch Syndrome | 0.32 | 0.12, 0.85 | 0.023 |
| Other PV | 1.12 | 0.89, 1.41 | 0.3 |
| PALB2 | 1.03 | 0.51, 2.09 | >0.9 |
| VUS only | 1.00 | 0.85, 1.16 | >0.9 |
| 1 HR = Hazard Ratio, CI = Confidence Interval | |||
Note the decreasing hazard ratio with year of diagnosis that is a product of the spread of genetic testing to lower clinical risk groups as discussed as part of the variable selection rationale in (model-strat?) above.
Tests of heterogeneity
code
fvar<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$fr_var),scientific=FALSE)
ca<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$ca_test),scientific=FALSE)
lrt<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$loglik),scientific=FALSE)
p11<-table_fig(fvar,tbl_title="Variance",.9)
p12<-table_fig(ca,tbl_title="Commenges-Andersen test",1)
p13<-table_fig(lrt,tbl_title="Likelihood ratio test",.8)
cowplot::plot_grid(p11,p12,p13,nrow=1)Arrange plots
Extract legend from dummy plot
code
p_leg<-ggplot(
datt, aes(x = ord, y = estimate, group = as.factor(type),
color = as.factor(type)) ) +
geom_point() + geom_line() +
scale_color_manual(values = c("#377eb8", "#4daf4a"), name="Estimates",
labels=c("Fixed", "Random"),
guide = guide_legend(override.aes = list(size = 3,shape=22,
fill=c("#377eb8", "#4daf4a")) )
) +
theme(legend.key = element_rect(fill = "white"),
legend.text=element_text(size=12),legend.title=element_text(size=12))
extract_legend<-function(myggplot){
tmp <- ggplot_gtable(ggplot_build(myggplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
return(legend)
}
my_legend<-extract_legend(p_leg)
#cowplot::plot_grid(p1,my_legend)Plot 5 figures and legend together for Figure 1.
code
p_final<-cowplot::plot_grid(p1,p2,p3,p4,p5,my_legend, ncol=2,
align="AUTO")
save(p_final,file="data/p_final.Rdata")
ggplot2::ggsave(p_final,
filename = "final_plot.pdf",
path = "img/",
units = "in", width =11.6, height = 15,
dpi = 300, bg = "white",device=cairo_pdf)