Main Analyses

Re-running frailty models currently set to off

Last run October 25, 2024 at 2:54PM-3:00PM (check modified date saved results)

Functions used in the analytic code below

# functions used for each cancer type
get_data<-function(cancer_type) {
  mort<-  haven::read_dta("data/mort.dta") |>    
    dplyr::filter(cancer_site1==cancer_type) |>  
    haven::as_factor() |>
    dplyr::mutate(stage = relevel(as.factor(stage), 3)) |>      # set base stage II
    dplyr::mutate(agedec = agedec-mean(agedec)) |>
    dplyr::mutate(pvgroup = relevel(as.factor(pvgroup), 5)) |>    # set base none
    dplyr::mutate(grade = relevel(as.factor(grade),2))  |>
    dplyr::mutate(race = relevel(race,4)) |>
    dplyr::mutate(rurality = relevel(rurality,1)) |>
    dplyr::mutate(poverty = relevel(poverty,3)) 
  return(mort)
}

get_fixed<-function(data,covar) {
  outcome<-"survival::Surv(survtime, event)"
  f<-as.formula(paste(outcome,paste(covar,collapse="+"),sep="~"))
  m_fixed<-survival::coxph(f, data, ties = "breslow")
  return(m_fixed)
}

get_random<-function(data,can_type,covar) {
  library(coxme)
  outcome<-"survival::Surv(survtime, event)"
  f<-as.formula(paste(outcome,paste(paste(covar[-length(covar)],collapse="+"),
                                    "cluster(pvgroup)", sep="+"),sep="~"))
  m_emfrail<-frailtyEM::emfrail(f, data)
  save_file<-paste0("data/","m_emfrail",can_type,".Rdata")
  save(m_emfrail,file=save_file)
  return(m_emfrail)
}

process_fixed<-function(data=m_fixed) {
  library(tidyverse)
  # Obtain center estimates and calculate standard error
  pvgroup <- grep("pvgroup", names(m_fixed$coefficients))
  nc <- length(pvgroup)
  var_pvgroup <- m_fixed$var[pvgroup, pvgroup]
  varmean <- sum(var_pvgroup)/ nc^2
  
  newse <- sqrt(c(varmean, diag(var_pvgroup) + varmean - 2/nc * apply(var_pvgroup, 1, sum)))
  # change contrast for hr's to average
  fe<-data.frame(beta = c(pvgroup5 = 0, m_fixed$coefficients[pvgroup]), sd = newse) |> 
    rownames_to_column("pvgroup") |> 
    mutate(pvgroup = substr(pvgroup,8,25)) |>
    mutate( pvgroup = base::replace(pvgroup, pvgroup=="5", "Negative"))  |>
    mutate(sbeta = sum(beta) / nc) |> 
    mutate(beta = beta - sbeta) |> 
    arrange(beta) |>
    mutate(ord = 1:n()) |> 
    mutate(pvgroup = as.character(pvgroup)) |> 
    mutate(ymin = beta - 1.96 * sd, ymax = beta + 1.96 * sd) |> 
    rename(id = pvgroup, estimate = beta) |> 
    select(-sd, -sbeta, -ord) |>
    mutate(type = "fixed effects") |> 
    mutate(ord = 1:n()) 
  return(fe)
}


process_random <- function(data=m_emfrail) {
  library(data.table)
  frailties<-as.data.table(frailtyEM:::summary.emfrail(data)[[12]])
  logz <- frailties[order(z),.(id,z,lower_q, upper_q)] |>
              dplyr::mutate_if(is.numeric, log) |> 
              dplyr::rename(estimate = z, ymin = lower_q, ymax = upper_q) |> 
              dplyr::mutate(type = "frailty") |>
              dplyr::mutate(ord = 1:dplyr::n())
return(logz)
}

combine_effects<-function(fe,logz) {
  ord_re <- data.frame(id = logz$id, ord = logz$ord)
  datt <- fe |>  
      right_join(ord_re,join_by(id==id)) |> 
      select(-ord.y) |> rename(ord=ord.x) |> 
      bind_rows(logz)
  return(datt)
}

table_fig <- function(tblname,size=1, tbl_title,type=can_type) {
  cancer_type<-c("breast cancer triple-negative","breast cancer her2+","breast cancer her2-,ER/PR+","colorectal","pancreas")
  gt::gt(dplyr::as_tibble(tblname,rownames="type")) |> 
    gt::tab_header(title = tbl_title,subtitle=cancer_type[type]) |>
    gt::gtsave(paste0("stats_",deparse(substitute(tblname)),
                      "_table",type,".png"), path = "img/")
  cowplot::ggdraw() + cowplot::draw_image(paste0("img/",
                                                 "stats_",deparse(substitute(tblname)),"_table",can_type,".png"), scale = size)
}

graph_effects_vert<-function(data=datt) {
  library(grid)
  library(data.table)
  datt_DT<-as.data.table(datt)
  datt_DT[,type :=(ifelse(type=="fixed effects","fe","re"))]
  
  dt<-dcast(datt_DT,
#            id + ord ~ type,
            id ~ type,
            value.var = c("estimate","ymin","ymax")) 
  dt$` ` <- paste(rep(" ", 35), collapse = " ")
  ests<-c("estimate_fe","estimate_re","ymin_fe","ymin_re","ymax_fe","ymax_re")
  dt[,(ests):=lapply(.SD, function(x) exp(x)),.SDcols = ests]
  
  #  form for estimates 1.2 (0.2, 2.9)\n1.6 (0.4, 3.5)
  dt$CI <- paste(sprintf("%.1f (%.1f, %.1f)", dt$estimate_fe, dt$ymin_fe, dt$ymax_fe),
                 sprintf("%.1f (%.1f, %.1f)", dt$estimate_re, dt$ymin_re, dt$ymax_re),
                 sep = "\n")
  setnames(dt, old = "id", new = "Gene")
  tm <- forestploter::forest_theme(base_size = 10,
                     refline_gp = gpar(lty="solid"),
                     ci_pch = c(15, 15),
                     ci_col = c("#377eb8", "#4daf4a"),
                     footnote_gp = gpar(col = "blue"),
                     legend_name = "Estimates",
                     legend_value = c("Fixed effects", "Random effects"),
                     legend_position = "none",
                     vertline_lty = c("dashed"),
                     vertline_col = c("#d6604d"),
                     # Table cell padding, width 4 and heights 3
                     core = list(padding = unit(c(4, 3), "mm")),
                     gp = gpar(fontface = "italic")
                     )
p<-forestploter::forest(dt[,c(1,8, 9)],
                          est = list(dt$estimate_fe,dt$estimate_re),
                          lower = list(dt$ymin_fe,dt$ymin_re), 
                          upper = list(dt$ymax_fe,dt$ymax_re),
                          ci_column = 2,
                          ref_line = 1, 
                          x_trans = "log",
                          ticks_at = c(0.25,0.5, 2,4),
                          arrow_lab = c("Lower risk", "Higher risk"),
                          nudge_y = 0.4,
                          xlim=c(.2,5),
                          theme=tm)
  return(p)
}
  
pvalr <- function(pvals, sig.limit = .001, digits = 3, html = FALSE) {
# https://stackoverflow.com/questions/23018256/printing-p-values-with-0-001
  roundr <- function(x, digits = 1) {
    res <- sprintf(paste0('%.', digits, 'f'), x)
    zzz <- paste0('0.', paste(rep('0', digits), collapse = ''))
    res[res == paste0('-', zzz)] <- zzz
    res
  }
  
  sapply(pvals, function(x, sig.limit) {
    if (x < sig.limit)
      if (html)
        return(sprintf('&lt; %s', format(sig.limit))) else
          return(sprintf('< %s', format(sig.limit)))
    if (x > .1)
      return(paste("=",roundr(x, digits = 2))) else
        return(paste("=",roundr(x, digits = digits)))
  }, sig.limit = sig.limit)
}

(The algorithms for these functions used in the analysis and some of the manipulation of the results were based on examples from (Theodor A. Balan and Putter 2020). Any errors identified in adapting them or in their application to our example are ours alone.)

Dataset

Stata code used to create the analytic data set of all 5 cancers from file of SEER variables distributed by project data steward

code

use "data/mort imputed 1013.dta"
mi extract 0
* reorder to match Paul's outputs
recode cancer_sitegroup  (3=1 "breast, triple negative") ///
  (2=2 "breast, HER2+") (1=3 "breast, ER/PR+ HER2-") (4=4 "colorectal") ///
  (5=5 pancreas),gen(cancer_site1)
* surgcat has 5 categories should have y/n by Paul's outputs
gen surgcat=surg>0
recode sexrecode (1 9 =1 male) (2=0 female),gen(sex) label(male_sex)
label values sex male_sex
label var sex "Sex"
tostring yeardx,gen(year_str)
encode year_str,gen(yeardx_cat)
label define yn 1 "Yes" 0 "No"
label values married surgcat chemo radiation her2therapy yn
recode colon_rectum (1=0) (2=.) (3=1),gen(colon_rectal)
label define colon_rectal 1 rectal 0 colon 
label values colon_rectal colon_rectal
label var colon_rectal "Colorectal type"
compress
qui dtable i.(pvgroup race age_group poverty rurality married yeardx_cat stage ///
                surgcat chemo radiation state grade her2therapy colon_rectum) ///
              ,by(cancer_site1,nototals) export(table1.html,replace)
* switch to study window end 12/31/2021
stset, clear
stset time22, failure(death22)
gen survtime=_t
gen event=_d
save "data/mort",replace
de pvgroup

Overview of modeling strategy

We used directed acyclic graphs to justify the selection of variables for the analysis.(Pearl 1995; Lipsky and Greenland 2022)

The primary predictor of interest is the gene group describing the presence or absence of specific pathogenic variants. The outcome is mortality. In addition the following variable groups were included:

Covariate list

Clinical	Treatment	Shared ancestry	Access and quality	Selection into cohort
Age at diagnosis	Chemotherapy	Race	Poverty ( county level)	Genetic testing
Stage	Surgery		Rurality (county level)
Grade (cancer-specific)	Radiation		State
Rectal vs colon (cancer-specific)	Receipt of HER2-therapy (cancer-specific)

As only people from the SEER registry who had genetic testing within 6 months of diagnosis were included, all of the results are conditioned on receipt of genetic testing even though this variable is not formally included in our analysis.

Variable selection - rationale (click to view)

The goal of the analysis is to describe whether people who have one of several pathogenic variants (PVs) or gene groups would be predicted to have a different mortality hazard in comparison to the average hazard of people with the cancer. More specifically, for a person who has been diagnosed and is planning or undergoing treatment with a cancer of known stage and grade and then finds out that they have a PV, the pertinent question becomes whether this finding will have any implications for their mortality risk beyond what they already know about their diagnosis.

We could expect that the presence of a PV can potentially modify the age at which one gets cancer, the stage or grade at which it occurs and the treatment that is received. These clinical and treatment variables are then mediators between the presence of a PV and mortality as represented in the DAG below. However, given that a patient already knows that information, to answer the question described requires trying to estimate any direct effect of PV on mortality independent of any paths through the described mediators and any possible confounders of the relationship.

code

library(dagitty)
library(ggdag)
risk<-dagify(
  Y ~ PV + C + M + C2, PV ~ C, M ~ PV + C2, 
  coords =list(
    x=c(Y=7,C=3, PV=0, M =3 ,C2 = 7 ), 
    y=c(Y=0,C=-3,PV=0, M = 3, C2 = 3  )
    ),
  exposure="PV",
  outcome="Y",
  labels=c(PV="PV", C= "PV-outcome \n Confounder" , Y=" Mortality", M = "Clinical/treatment Mediator",
           C2="Mediator-outcome \n confounder")
) 
myplot(risk)

We wish to block all the clinical/treatment mediators as well as any confounders of the PV - mortality relationship in order to test for whether there is any direct effect of PV on mortality. Unfortunately, once we adjust for the mediators we need to also account for any mediator - outcome and any PV - mediator confounders as well. If we do not then the correlations in that pathway will be included in our estimate of the PV - mortality relationship.(Elwert and Winship 2014)

In the more detailed DAG below we show the mediators whose paths we wish to block (age at diagnosis, stage/grade and the treatment received which includes chemotherapy, surgery and radiation). We also include the receipt of HER2-therapy as a disease specific treatment variable for breast cancer and Rectal vs Colon location for colorectal cancer.

code

risk<-dagify(
  Y ~  PV + M1 + M2 + M3 + C2 + R + YD, PV ~ R,
    M1 ~PV + C2, M2 ~ PV + C2, M3 ~ PV + C2,
    M3 ~ M1 + M2 + YD, R ~ C, 
    GT ~ YD + M1 + M2 + C2 + R,  
  coords =list(
    x=c(Y=7,C=1.5, R = 1.5, PV=0, M1 = 1.5 , M2 = 2.5 ,M3 = 3,C2 = 7,YD= 5 ,GT = 7), 
    y=c(Y=0,C=-3.5,R = -1.5, PV=0, M1 = 1, M2=3 , M3= 6 , C2 = 3, YD=6, GT = 5  )
    ),
  exposure="PV",
  outcome="Y",
  labels=c(PV="PV", C= "Shared Ancestry" , Y=" Mortality", M1 = "Age at diagnosis",
           M2 = "Stage/grade", M3="Treatment", C2="Mediator-outcome \n confounders",
           R = "race/ethnicity", YD = "year of diagnosis", GT = "gene testing")
) 

set.seed(7)
set.seed(9670)
myplot(risk)

There are not many confounders of the direct PV - mortality or PV - mediator relationships as in order to be a confounder it has to cause (requiring it to come before) the PV as well as the outcome. However, the most commonly raised potential confounder of gene outcome relationships is shared ancestry. The primary measure of shared ancestry that we have is the SEER race/ethnicity variable. Of the gene groups we have BRCA1/2 is the only one with an obvious shared ancestry group of Ashkenazi Jews who share a founder mutation. However, apart from this one group it seems that BRCA1/2 prevalence is not uncommon and quite similar across race and ethnic groups(Hall et al. 2009) and the latter seems true for the other gene groups. But we decided to still include this race variable as as a proxy for shared ancestry.

Again, the biggest problem with estimating the direct effect of PV on mortality is that once we have included the mediators we have to now block all the potential mediator-outcome confounders labeled collectively in the green box in the DAG. For example, given that we hypothesize that PVs may cause treatment changes, then any confounder of the treatment -> mortality relationship, such as access or quality of care poverty which limits access to treatment, opens up a collider path with respect to the PV mortality relationship. Inclusion of treatment as a collider, while it blocks the mediating path from PV -> Treatment -> Mortality, it will open a path from PV -> treatment <- Poverty -> Mortality. Thus we include the county level variables poverty, rurality as well as state as a proxy for quality and access related potential mediator-outcome confounders.

A second potential mediator-outcome confounder is the year of diagnosis which can cause treatment changes if cancer therapy changes and also might have an effect on outcome due to other changes in supportive care occur over time. As such it also has to be blocked to avoid a backdoor path from the treatment mediator to the outcome which would bias the PV - mortality relationship. An interesting feature of year of diagnosis is that selection into our cohort changed quite a bit from 2013-2019 as genetic testing rates increased significantly.(Kurian et al. 2023) Thus year of diagnosis is causally related to genetic testing. Given the additional causal pathways from the clinical characteristics of the cancer and race to genetic testing, a pathway opens up from year of diagnosis to mortality as the implied genetic testing covariate is a collider on this pathway.(Elwert and Winship 2014) This explains an odd finding seen in the regression results which is a large decrease in the hazard ratio for later years of diagnosis as genetic testing spreads from the highest risk groups to lower risk groups and the cohorts include an increasing number of people with lower clinical risk.

People do not get into the cohort unless they have a genetic test. This automatically includes genetic testing as a covariate whether we want it or not. Genetic testing is more common in higher risk cancers (early age of diagnosis or high stage and grade) and also potentially causally related to race/ethnicity as well as some of the access and quality variables and the relationships are changing over time. The concern of the selection mechanism would be if it introduced any backdoor paths between PV and mortality. However the DAG is getting a bit complicated to for us to easily detect this at this point.

Fortunately, once we have encoded the causal relationships we propose exist we can use functions that apply an axiomatic system for replacing probability formulas containing the causal relationships with ordinary conditional probabilities. That allows us to identify whether the adjustment set is able to evaluate a specific relationship without bias. This axiomatic system has the name ‘do-calculus’ invented by Judea Pearl.(Pearl 2010) Below we can show that if we condition on the variables in the DAG above the relationship between PV and mortality are d-separated which would allow us to estimate the presence or absence of a direct effect.

code

risk1<-dagify(
  Y ~   M1 + M2 + M3 + C2 + R + YD, PV ~ R,
    M1 ~PV + C2, M2 ~ PV + C2, M3 ~ PV + C2,
    M3 ~ M1 + M2 + YD, R ~ C, 
    GT ~ YD + M1 + M2 + C2 + R,  
  coords =list(
    x=c(Y=7,C=1.5, R = 1.5, PV=0, M1 = 1.5 , M2 = 2.5 ,M3 = 3,C2 = 7,YD= 5 ,GT = 7), 
    y=c(Y=0,C=-3.5,R = -1.5, PV=0, M1 = 1, M2=3 , M3= 6 , C2 = 3, YD=6, GT = 5  )
    ),
  exposure="PV",
  outcome="Y",
  labels=c(PV="PV", C= "Shared Ancestry" , Y=" Mortality", M1 = "Age at diagnosis",
           M2 = "Stage/grade", M3="Treatment", C2="Mediator-outcome \n confounders",
           R = "race/ethnicity", YD = "year of diagnosis", GT = "gene testing")
) 
set.seed(3920)
ggdag::ggdag_dseparated(
  risk1,
  from = "PV",
  to = "Y",
  controlling_for = c("M1", "M2", "M3","R","C2","GT","YD"),
  text = FALSE,
  use_labels  = "label") +
  theme_dag() +
  labs(title = "If we add the confounders of all the mediators then PV and Y \n are independent as they should be if there is no direct path")

Based on this we can argue that we may be able to identify the direct effect of PV on mortality. However, in case it is not obvious the two biggest assumptions in the selection of covariates with respect to identifying the PV - mortality relationship are as follows:

There are no unmeasured confounders beyond race that capture important shared ancestry features that would cause both the PV and higher mortality.
The covariate set of county level variables poverty, rurality as well as state are an adequate proxy for the quality and access related potential mediator-outcome confounders.

Although not always clear, both of these assumptions are required for all gene outcome studies if they condition on any treatment or clinical severity variables, not just ours. However, we believe proposing a specific DAG provides a principled way of justifying the selection of covariates and allows us to start to try to identify the key variable sets for making these estimates. Perhaps most importantly it offers a transparent way for others to critique our choices and to propose and defend through additional analyses the need for other covariates.

Fixed effects model

The fixed effects modeled are estimated using the coxph procedure from the Survival package.(Therneau et al. 2024)

Use of multilevel models to estimate gene group random effects

The implementation of multilevel models in survival data uses the term frailty models for a survival regression model that includes random effects. The term is often specifically used for those models that only include random effects for the intercept.(Austin 2017) When the random effects are shared by a cluster the models are known as shared frailty models. Given the mathematical form of the survival model, the random intercept in the linear predictor has a multiplicative effect on the hazard function.

For our purposes we could propose a data-generating hypothesis for the use of multilevel models that involves viewing the pathogenic variants that have been found in our population as a random sample of all the possible genetic variations that can have an effect on the mortality of people who develop cancer. However, in this case we are primarily using a multilevel model as a strategy to avoid over-fitting. Given that we are fitting 8 gene groups over 5 cancers there are a total of 40 estimates of interest. There is heterogeneity in the size of the gene groups with some quite small. Multilevel modeling can be thought of “as adaptive regularization where the model itself tries to learn how skeptical it should be.”(McElreath 2020) In this sense it has some similarities to other strategies to avoid over-fitting such as ridge regression. The gamma distribution acts as a regularizing prior for the gene effects although in this case the parameters for that prior are learned from the data.

For estimation we used the profile EM (expectation-maximization) algorithm as implemented in the FrailtyEM package.(Theodor Adrian Balan and Putter 2019)

Breast Cancer Models

Triple negative

Sample description and missing data (click to view)

code

can_type=1
covar<-eval(as.name(paste0("covar",can_type,sep="")))
mort <-get_data(cancer_type=can_type) 
my_labels<-list(agedec ~ "Age (in decades from mean age)", race ~ "Race", 
                poverty ~ "Poverty", rurality ~ "Rurality", married ~ "Married", 
                yeardx_cat ~ "Year of diagnosis", stage ~ "Stage", 
                surgcat ~ "Surgery (Yes=1)", chemo ~ "Chemotherapy (Yes=1)", 
                radiation ~ "Radiation (Yes=1)", state~ "State", 
                her2therapy ~ "HER2 treatment (Yes=1)", grade ~ "Grade", 
                pvgroup ~ "Gene group")
gtsummary::tbl_summary(mort,label=my_labels, missing_text="missing",include=covar) |>
    gtsummary::modify_caption("Table 1. Covariates - Triple negative") |>
    gtsummary::modify_footnote(label = "Omitted category listed first for categorical variables" )

Table 1. Covariates - Triple negative
Characteristic¹	N = 11,842²
Age (in decades from mean age)	-0.05 (-0.85, 0.75)
Race
NH White	6,061 (51%)
Asian	1,105 (9.3%)
Black	2,078 (18%)
Hispanic	2,417 (20%)
Other	181 (1.5%)
Poverty
<10% poverty	8,100 (68%)
10%-19% poverty	2,540 (21%)
20%+ poverty	1,202 (10%)
Rurality
rural	788 (6.7%)
urban	11,054 (93%)
Married	6,974 (59%)
Year of diagnosis
2013	1,348 (11%)
2014	1,425 (12%)
2015	1,606 (14%)
2016	1,529 (13%)
2017	1,668 (14%)
2018	1,974 (17%)
2019	2,292 (19%)
Stage
2	4,022 (37%)
0	1,671 (16%)
1	2,986 (28%)
3	1,660 (15%)
4	414 (3.9%)
missing	1,089
Surgery (Yes=1)	7,751 (65%)
Chemotherapy (Yes=1)	8,295 (70%)
Radiation (Yes=1)	1,488 (13%)
State
CA	8,640 (73%)
GA	3,202 (27%)
HER2 treatment (Yes=1)	318 (2.7%)
Grade
2	1,890 (18%)
1	345 (3.2%)
3	8,287 (78%)
4	105 (1.0%)
missing	1,215
Gene group
Negative	7,748 (65%)
ATM	36 (0.3%)
BRCA 1/2	1,322 (11%)
CHEK2	69 (0.6%)
Lynch Syndrome	48 (0.4%)
Other PV	379 (3.2%)
PALB2	128 (1.1%)
VUS only	2,112 (18%)
¹ Omitted category listed first for categorical variables
² Median (Q1, Q3); n (%)

Model output

Random effects for gene group (as shown in figure 1)

code

# random effects
if(params$frailty=="on") {
  m_emfrail<-get_random(mort,can_type,covar)
}

# fixed effects
m_fixed<-get_fixed(mort,covar)

# graph
## process fixed effects
fe<-process_fixed(m_fixed)
## process random effects
file_name=paste("data/","m_emfrail",can_type,".Rdata",sep="")
load(file_name)
logz<-process_random(m_emfrail)
## combine and graph
datt<-combine_effects(fe,logz)
p1<-graph_effects_vert(datt)
p1 <- forestploter::edit_plot(p1,
               col = 1,
               row = c(1:3,7),
               gp = gpar(fontface = "italic"))
p1 <- forestploter::insert_text(p1, text = "Triple-Negative Breast Cancer",
              part = "header",
              col = 1:3,
              gp = gpar(fontface = "bold"))
my_pval<-1/2*pchisq(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],1, lower.tail=FALSE)
my_chi<-paste0("Likelihood ratio test for heterogeneity \U{03A7}\U{00B2} = ",
               round(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],2),", p",pvalr(my_pval))
p1 <- forestploter::insert_text(p1, text = my_chi,
   part = "header", just = "right",
   col = 3, gp = gpar(fontsize=10) )

ggsave(p1,filename=paste("img/","m_emfrail",can_type,".","svg",sep=""))
ggsave(p1,filename=paste("img/","m_emfrail",can_type,".","pdf",sep=""),device=cairo_pdf)

Fixed effects model (click to view)

code

gtsummary::tbl_regression(m_fixed,exponentiate=TRUE,label=my_labels)

Characteristic	HR¹	95% CI¹	p-value
Age (in decades from mean age)	1.00	0.95, 1.05	>0.9
Race
NH White	—	—
Asian	0.71	0.55, 0.92	0.010
Black	1.11	0.95, 1.29	0.2
Hispanic	1.09	0.93, 1.27	0.3
Other	0.38	0.14, 1.03	0.057
Poverty
<10% poverty	—	—
10%-19% poverty	0.91	0.77, 1.07	0.2
20%+ poverty	1.09	0.90, 1.33	0.4
Rurality
rural	—	—
urban	0.87	0.71, 1.07	0.2
Married
No	—	—
Yes	0.97	0.86, 1.09	0.6
Year of diagnosis
2013	—	—
2014	1.16	0.95, 1.41	0.2
2015	1.14	0.94, 1.38	0.2
2016	0.89	0.72, 1.09	0.3
2017	0.87	0.70, 1.07	0.2
2018	0.57	0.46, 0.72	<0.001
2019	0.44	0.34, 0.57	<0.001
Stage
2	—	—
0	0.07	0.04, 0.13	<0.001
1	0.37	0.30, 0.45	<0.001
3	3.67	3.20, 4.20	<0.001
4	11.3	9.49, 13.4	<0.001
Surgery (Yes=1)
No	—	—
Yes	1.60	1.37, 1.86	<0.001
Chemotherapy (Yes=1)
No	—	—
Yes	0.81	0.67, 0.99	0.035
Radiation (Yes=1)
No	—	—
Yes	1.60	1.33, 1.94	<0.001
State
CA	—	—
GA	1.25	1.07, 1.45	0.004
HER2 treatment (Yes=1)
No	—	—
Yes	1.14	0.86, 1.51	0.4
Grade
2	—	—
1	0.79	0.47, 1.32	0.4
3	0.95	0.81, 1.11	0.5
4	0.81	0.38, 1.73	0.6
Gene group
Negative	—	—
ATM	3.66	1.63, 8.21	0.002
BRCA 1/2	0.62	0.51, 0.75	<0.001
CHEK2	0.34	0.08, 1.38	0.13
Lynch Syndrome	1.97	0.81, 4.75	0.13
Other PV	0.89	0.62, 1.27	0.5
PALB2	0.73	0.40, 1.33	0.3
VUS only	0.99	0.84, 1.15	0.9
¹ HR = Hazard Ratio, CI = Confidence Interval

Note the decreasing hazard ratio with year of diagnosis that is a product of the spread of genetic testing to lower clinical risk groups as discussed as part of the variable selection rationale in (model-strat?) above.

Tests of heterogeneity

code

fvar<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$fr_var),scientific=FALSE)
ca<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$ca_test),scientific=FALSE)
lrt<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$loglik),scientific=FALSE)

p11<-table_fig(fvar,tbl_title="Variance",.9)
p12<-table_fig(ca,tbl_title="Commenges-Andersen test",1)
p13<-table_fig(lrt,tbl_title="Likelihood ratio test",.8)
cowplot::plot_grid(p11,p12,p13,nrow=1)

Breast HER2+

Sample description and missing data (click to view)

code

covar2<-covar1
can_type=2
covar<-eval(as.name(paste0("covar",can_type,sep="")))
mort <-get_data(cancer_type=can_type)
# mylabels as defined above for triple negative dz
gtsummary::tbl_summary(mort,label=my_labels, missing_text="missing",include=covar) |>
    gtsummary::modify_caption("Table 1. Covariates - HER2+") |>
    gtsummary::modify_footnote(label = "Omitted category listed first for categorical variables" )

Table 1. Covariates - HER2+
Characteristic¹	N = 9,957²
Age (in decades from mean age)	-0.24 (-0.84, 0.76)
Race
NH White	5,453 (55%)
Asian	1,346 (14%)
Black	1,149 (12%)
Hispanic	1,928 (19%)
Other	81 (0.8%)
Poverty
<10% poverty	7,011 (70%)
10%-19% poverty	2,054 (21%)
20%+ poverty	892 (9.0%)
Rurality
rural	612 (6.1%)
urban	9,345 (94%)
Married	6,244 (63%)
Year of diagnosis
2013	1,066 (11%)
2014	1,182 (12%)
2015	1,435 (14%)
2016	1,442 (14%)
2017	1,477 (15%)
2018	1,556 (16%)
2019	1,799 (18%)
Stage
2	3,508 (37%)
0	276 (2.9%)
1	3,992 (42%)
3	1,173 (12%)
4	547 (5.8%)
missing	461
Surgery (Yes=1)	6,812 (68%)
Chemotherapy (Yes=1)	8,317 (84%)
Radiation (Yes=1)	1,018 (10%)
State
CA	7,400 (74%)
GA	2,557 (26%)
HER2 treatment (Yes=1)	7,087 (71%)
Grade
2	3,639 (38%)
1	455 (4.8%)
3	5,349 (56%)
4	36 (0.4%)
missing	478
Gene group
Negative	6,829 (69%)
ATM	108 (1.1%)
BRCA 1/2	351 (3.5%)
CHEK2	209 (2.1%)
Lynch Syndrome	25 (0.3%)
Other PV	300 (3.0%)
PALB2	45 (0.5%)
VUS only	2,090 (21%)
¹ Omitted category listed first for categorical variables
² Median (Q1, Q3); n (%)

Model output

Random effects for gene group (as shown in figure 1)

code

# random effects
if(params$frailty=="on") {
    m_emfrail<-get_random(mort,can_type,covar)  #random w. Lynch, fixed w.out
}

# fixed effects
levels(mort$pvgroup)<-c("Negative","ATM","BRCA 1/2","CHEK2","Other PV","Other PV","PALB2","VUS only") # Lynch(4) combined w other(6)
m_fixed<-get_fixed(mort,covar)

# graph
## process fixed effects
fe<-process_fixed(m_fixed)
## process random effects
file_name=paste("data/","m_emfrail",can_type,".Rdata",sep="")
load(file_name)
logz<-process_random(m_emfrail)
## combine and graph
datt<-combine_effects(fe,logz)  |>
      mutate(type= replace_na(type,"fixed effects"))
p2<-graph_effects_vert(datt)
p2 <- forestploter::edit_plot(p2,
               col = 1,
               row = c(1:3,7),
               gp = gpar(fontface = "italic"))
p2 <- forestploter::insert_text(p2, text = expression(bold(paste(bolditalic("HER2"),"-Positive Breast Cancer"))),
              part = "header",
              col = 1:3)
my_pval<-1/2*pchisq(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],1, lower.tail=FALSE)
my_chi<-paste0("Likelihood ratio test for heterogeneity \U{03A7}\U{00B2} = ",round(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],2),", p",pvalr(my_pval))
p2 <- forestploter::insert_text(p2, text = my_chi,
   part = "header", just = "right",
   col = 3, gp = gpar(fontsize=10) )

ggsave(p2,filename=paste("img/","m_emfrail",can_type,".","svg",sep=""))
ggsave(p2,filename=paste("img/","m_emfrail",can_type,".","pdf",sep=""),device=cairo_pdf)

The fixed effects cox model does not converge to stable estimates for the pvgroup when the Lynch genes are included. There are relatively few observations (25) in those with HER2+ breast cancer. Thus Lynch was collapsed into the “Other PV” category for the fixed effects analysis. That is why the estimate is missing in the above figure.

Fixed effects model (click to view)

code

my_labels<-list(agedec ~ "Age (in decades)", race ~ "Race", poverty ~ "Poverty",
                rurality ~ "Rurality", married ~ "Married", yeardx_cat ~ "Year of diagnosis",
                stage ~ "Stage", surgcat ~ "Surgery", chemo ~ "Chemotherapy",
                radiation ~ "Radiation", state~ "State", her2therapy ~ "HER2 treatment",
                grade ~ "Grade", pvgroup ~ "Gene group")
gtsummary::tbl_regression(m_fixed,exponentiate=TRUE,label=my_labels)

Characteristic	HR¹	95% CI¹	p-value
Age (in decades)	1.10	1.01, 1.19	0.028
Race
NH White	—	—
Asian	1.09	0.80, 1.50	0.6
Black	1.42	1.08, 1.88	0.013
Hispanic	1.29	0.99, 1.69	0.057
Other	0.86	0.21, 3.47	0.8
Poverty
<10% poverty	—	—
10%-19% poverty	0.99	0.75, 1.30	>0.9
20%+ poverty	1.75	1.27, 2.42	<0.001
Rurality
rural	—	—
urban	1.30	0.84, 2.01	0.2
Married
No	—	—
Yes	0.78	0.64, 0.94	0.011
Year of diagnosis
2013	—	—
2014	1.02	0.76, 1.37	>0.9
2015	0.83	0.61, 1.13	0.2
2016	0.89	0.64, 1.23	0.5
2017	0.81	0.56, 1.17	0.3
2018	0.65	0.41, 1.02	0.060
2019	0.67	0.38, 1.17	0.2
Stage
2	—	—
0	0.08	0.02, 0.32	<0.001
1	0.34	0.24, 0.49	<0.001
3	3.01	2.36, 3.86	<0.001
4	8.84	6.82, 11.5	<0.001
Surgery
No	—	—
Yes	1.48	1.12, 1.94	0.005
Chemotherapy
No	—	—
Yes	0.56	0.41, 0.77	<0.001
Radiation
No	—	—
Yes	1.42	1.06, 1.92	0.020
State
CA	—	—
GA	1.54	1.19, 1.99	<0.001
HER2 treatment
No	—	—
Yes	0.66	0.52, 0.83	<0.001
Grade
2	—	—
1	1.07	0.58, 1.95	0.8
3	1.41	1.13, 1.75	0.002
4	0.83	0.11, 6.02	0.9
Gene group
Negative	—	—
ATM	0.43	0.11, 1.72	0.2
BRCA 1/2	1.03	0.64, 1.67	0.9
CHEK2	0.14	0.02, 1.02	0.052
Other PV	0.53	0.24, 1.20	0.13
PALB2	0.93	0.23, 3.80	>0.9
VUS only	0.79	0.60, 1.04	0.090
¹ HR = Hazard Ratio, CI = Confidence Interval

Tests of heterogeneity

code

fvar<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$fr_var),scientific=FALSE)
ca<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$ca_test),scientific=FALSE)
lrt<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$loglik),scientific=FALSE)

p11<-table_fig(fvar,tbl_title="Variance",.9)
p12<-table_fig(ca,tbl_title="Commenges-Andersen test",1)
p13<-table_fig(lrt,tbl_title="Likelihood ratio test",.8)
cowplot::plot_grid(p11,p12,p13,nrow=1)

Breast ER/PR+, HER2-

Sample description and missing data (click to view)

code

covar3<-covar1
can_type=3  # use this code for cancer_types 2,3,5
covar<-eval(as.name(paste0("covar",can_type,sep="")))
mort<-get_data(cancer_type=can_type)
# mylabels as defined above for triple negative dz
gtsummary::tbl_summary(mort,label=my_labels, missing_text="missing",include=covar) |>
    gtsummary::modify_caption("Table 1. Covariates - ER/PR+,HER2-") |>
    gtsummary::modify_footnote(label = "Omitted category listed first for categorical variables" )

Table 1. Covariates - ER/PR+,HER2-
Characteristic¹	N = 48,473²
Age (in decades)	-0.19 (-0.99, 0.91)
Race
NH White	29,727 (61%)
Asian	5,827 (12%)
Black	4,697 (9.7%)
Hispanic	7,751 (16%)
Other	471 (1.0%)
Poverty
<10% poverty	34,965 (72%)
10%-19% poverty	9,672 (20%)
20%+ poverty	3,836 (7.9%)
Rurality
rural	2,733 (5.6%)
urban	45,740 (94%)
Married	30,684 (63%)
Year of diagnosis
2013	4,811 (9.9%)
2014	5,328 (11%)
2015	6,039 (12%)
2016	6,325 (13%)
2017	6,827 (14%)
2018	8,377 (17%)
2019	10,766 (22%)
Stage
2	10,033 (21%)
0	8,001 (17%)
1	24,382 (52%)
3	3,350 (7.1%)
4	1,172 (2.5%)
missing	1,535
Surgery	25,282 (52%)
Chemotherapy	15,340 (32%)
Radiation	16,196 (33%)
State
CA	36,878 (76%)
GA	11,595 (24%)
HER2 treatment	747 (1.5%)
Grade
2	23,404 (50%)
1	12,285 (26%)
3	11,262 (24%)
4	209 (0.4%)
missing	1,313
Gene group
Negative	33,618 (69%)
ATM	399 (0.8%)
BRCA 1/2	1,975 (4.1%)
CHEK2	723 (1.5%)
Lynch Syndrome	206 (0.4%)
Other PV	1,398 (2.9%)
PALB2	343 (0.7%)
VUS only	9,811 (20%)
¹ Omitted category listed first for categorical variables
² Median (Q1, Q3); n (%)

Model output

Random effects for gene group (as shown in figure 1)

code

# random effects
if(params$frailty=="on") {
  m_emfrail<-get_random(mort,can_type,covar)
}

# fixed effects
m_fixed<-get_fixed(mort,covar)

# graph
## process fixed effects
fe<-process_fixed(m_fixed)
## process random effects
file_name=paste("data/","m_emfrail",can_type,".Rdata",sep="")
load(file_name)
logz<-process_random(m_emfrail)
## combine and graph
datt<-combine_effects(fe,logz)
#plot(graph_effects_vert(datt)) 
p3<-graph_effects_vert(datt)
p3 <- forestploter::edit_plot(p3,
               col = 1,
               row = c(1:3,7),
               gp = gpar(fontface = "italic"))
p3 <- forestploter::insert_text(p3, text = expression(bold(paste("ER/PR-Positive",bolditalic(" HER2"),"-Negative Breast Cancer"))),
              part = "header",
              col = 1:3)
my_pval<-1/2*pchisq(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],1, lower.tail=FALSE)
my_chi<-paste0("Likelihood ratio test for heterogeneity \U{03A7}\U{00B2} = ",round(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],2),", p",pvalr(my_pval))
p3 <- forestploter::insert_text(p3, text = my_chi,
   part = "header", just = "right",
   col = 3, gp = gpar(fontsize=10) )

ggsave(p3,filename=paste("img/","m_emfrail",can_type,".","svg",sep=""))
ggsave(p3,filename=paste("img/","m_emfrail",can_type,".","pdf",sep=""),device=cairo_pdf)

Fixed effects model (click to view)

code

my_labels<-list(agedec ~ "Age (in decades)", race ~ "Race", poverty ~ "Poverty",
                rurality ~ "Rurality", married ~ "Married", yeardx_cat ~ "Year of diagnosis",
                stage ~ "Stage", surgcat ~ "Surgery", chemo ~ "Chemotherapy",
                radiation ~ "Radiation", state~ "State", her2therapy ~ "HER2 treatment",
                grade ~ "Grade", pvgroup ~ "Gene group")
gtsummary::tbl_regression(m_fixed,exponentiate=TRUE,label=my_labels)

Characteristic	HR¹	95% CI¹	p-value
Age (in decades)	1.24	1.19, 1.30	<0.001
Race
NH White	—	—
Asian	0.91	0.76, 1.10	0.4
Black	1.40	1.20, 1.63	<0.001
Hispanic	1.14	0.98, 1.32	0.082
Other	0.90	0.45, 1.80	0.8
Poverty
<10% poverty	—	—
10%-19% poverty	1.20	1.04, 1.38	0.012
20%+ poverty	1.51	1.26, 1.81	<0.001
Rurality
rural	—	—
urban	1.0	0.81, 1.22	>0.9
Married
No	—	—
Yes	0.79	0.71, 0.87	<0.001
Year of diagnosis
2013	—	—
2014	0.82	0.70, 0.96	0.016
2015	0.82	0.70, 0.97	0.017
2016	0.76	0.64, 0.90	0.002
2017	0.60	0.49, 0.73	<0.001
2018	0.60	0.48, 0.75	<0.001
2019	0.53	0.41, 0.68	<0.001
Stage
2	—	—
0	0.14	0.10, 0.20	<0.001
1	0.52	0.44, 0.61	<0.001
3	2.95	2.58, 3.38	<0.001
4	12.2	10.6, 14.2	<0.001
Surgery
No	—	—
Yes	1.46	1.28, 1.67	<0.001
Chemotherapy
No	—	—
Yes	1.30	1.14, 1.49	<0.001
Radiation
No	—	—
Yes	0.95	0.82, 1.10	0.5
State
CA	—	—
GA	1.54	1.35, 1.76	<0.001
HER2 treatment
No	—	—
Yes	1.01	0.79, 1.29	>0.9
Grade
2	—	—
1	0.64	0.53, 0.77	<0.001
3	2.17	1.95, 2.42	<0.001
4	1.02	0.32, 3.18	>0.9
Gene group
Negative	—	—
ATM	0.75	0.39, 1.44	0.4
BRCA 1/2	1.23	1.03, 1.47	0.025
CHEK2	0.91	0.58, 1.44	0.7
Lynch Syndrome	1.97	1.05, 3.68	0.034
Other PV	1.07	0.77, 1.50	0.7
PALB2	1.20	0.73, 1.98	0.5
VUS only	0.94	0.81, 1.09	0.4
¹ HR = Hazard Ratio, CI = Confidence Interval

Tests of heterogeneity

code

fvar<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$fr_var),scientific=FALSE)
ca<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$ca_test),scientific=FALSE)
lrt<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$loglik),scientific=FALSE)

p11<-table_fig(fvar,tbl_title="Variance",.9)
p12<-table_fig(ca,tbl_title="Commenges-Andersen test",1)
p13<-table_fig(lrt,tbl_title="Likelihood ratio test",.8)
cowplot::plot_grid(p11,p12,p13,nrow=1)

Colorectal

Sample description and missing data (click to view)

code

covar4<-c("agedec","race","sex","poverty","rurality","married","yeardx_cat","stage",
          "surgcat","chemo","radiation","state","colon_rectal","grade","pvgroup")  #colorectal
can_type=4   
covar<-eval(as.name(paste0("covar",can_type,sep="")))
mort<-get_data(cancer_type=can_type) 

my_labels<-list(agedec ~ "Age (in decades from mean age)", race ~ "Race", sex ~ "Sex",
                poverty ~ "Poverty", rurality ~ "Rurality", married ~ "Married", 
                yeardx_cat ~ "Year of diagnosis", stage ~ "Stage", 
                surgcat ~ "Surgery (Yes=1)", chemo ~ "Chemotherapy (Yes=1)", 
                radiation ~ "Radiation (Yes=1)", state~ "State", 
                colon_rectal ~ "Colorectal (Rectal=1)", grade ~ "Grade", 
                pvgroup ~ "Gene group")
gtsummary::tbl_summary(mort,label=my_labels, missing_text="missing",include=covar) |>
    gtsummary::modify_caption("Table 1. Covariates - Colorectal")  |>
    gtsummary::modify_footnote(label = "Omitted category listed first for categorical variables" )

Table 1. Covariates - Colorectal
Characteristic¹	N = 5,822²
Age (in decades from mean age)	-0.20 (-1.00, 1.00)
Race
NH White	3,396 (58%)
Asian	589 (10%)
Black	534 (9.2%)
Hispanic	1,244 (21%)
Other	59 (1.0%)
Sex
female	3,148 (54%)
male	2,674 (46%)
Poverty
<10% poverty	3,915 (67%)
10%-19% poverty	1,315 (23%)
20%+ poverty	592 (10%)
Rurality
rural	399 (6.9%)
urban	5,423 (93%)
Married	3,531 (61%)
Year of diagnosis
2013	366 (6.3%)
2014	509 (8.7%)
2015	638 (11%)
2016	787 (14%)
2017	941 (16%)
2018	1,103 (19%)
2019	1,478 (25%)
Stage
2	1,377 (25%)
0	146 (2.6%)
1	1,100 (20%)
3	1,770 (32%)
4	1,143 (21%)
missing	286
Surgery (Yes=1)	4,798 (82%)
Chemotherapy (Yes=1)	3,344 (57%)
Radiation (Yes=1)	834 (14%)
State
CA	4,483 (77%)
GA	1,339 (23%)
Colorectal (Rectal=1)
colon	4,148 (71%)
rectal	1,674 (29%)
Grade
2	3,609 (68%)
1	722 (14%)
3	881 (17%)
4	103 (1.9%)
missing	507
Gene group
Negative	3,468 (60%)
ATM	46 (0.8%)
BRCA 1/2	88 (1.5%)
CHEK2	48 (0.8%)
Lynch Syndrome	661 (11%)
Other PV	308 (5.3%)
PALB2	9 (0.2%)
VUS only	1,194 (21%)
¹ Omitted category listed first for categorical variables
² Median (Q1, Q3); n (%)

Model output

Random effects for gene group (as shown in figure 1)

code

# random effects
if(params$frailty=="on") {
  m_emfrail<-get_random(mort,can_type,covar)
}

# fixed effects
m_fixed<-get_fixed(mort,covar)

# graph
## process fixed effects
fe<-process_fixed(m_fixed)
## process random effects
file_name=paste("data/","m_emfrail",can_type,".Rdata",sep="")
load(file_name)
logz<-process_random(m_emfrail)
## combine and graph
datt<-combine_effects(fe,logz)
p4<-graph_effects_vert(datt)
p4 <- forestploter::edit_plot(p4,
               col = 1,
               row = c(1:3,7),
               gp = gpar(fontface = "italic"))
p4 <- forestploter::insert_text(p4, text = "Colorectal Cancer",
              part = "header",
              col = 1:3,
              gp = gpar(fontface = "bold"))
my_pval<-1/2*pchisq(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],1, lower.tail=FALSE)
my_chi<-paste0("Likelihood ratio test for heterogeneity \U{03A7}\U{00B2} = ",round(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],2),", p",pvalr(my_pval))
p4 <- forestploter::insert_text(p4, text = my_chi,
   part = "header", just = "right",
   col = 3, gp = gpar(fontsize=10) )

ggsave(p4,filename=paste("img/","m_emfrail",can_type,".","svg",sep=""))
ggsave(p4,filename=paste("img/","m_emfrail",can_type,".","pdf",sep=""),device=cairo_pdf)

Fixed effects model (click to view)

code

my_labels<-list(agedec ~ "Age (in decades)", race ~ "Race",   poverty ~ "Poverty",
                rurality ~ "Rurality", married ~ "Married", yeardx_cat ~ "Year of diagnosis",
                stage ~ "Stage", surgcat ~ "Surgery", chemo ~ "Chemotherapy",
                radiation ~ "Radiation", state~ "State", 
                grade ~ "Grade", pvgroup ~ "Gene group")
gtsummary::tbl_regression(m_fixed,exponentiate=TRUE,label=my_labels)

Characteristic	HR¹	95% CI¹	p-value
Age (in decades)	1.16	1.10, 1.23	<0.001
Race
NH White	—	—
Asian	1.13	0.87, 1.46	0.4
Black	1.09	0.86, 1.40	0.5
Hispanic	1.26	1.03, 1.53	0.022
Other	1.41	0.52, 3.81	0.5
Sex
female	—	—
male	1.10	0.95, 1.27	0.2
Poverty
<10% poverty	—	—
10%-19% poverty	0.97	0.80, 1.19	0.8
20%+ poverty	1.28	0.99, 1.67	0.061
Rurality
rural	—	—
urban	0.84	0.64, 1.11	0.2
Married
No	—	—
Yes	0.91	0.78, 1.05	0.2
Year of diagnosis
2013	—	—
2014	0.89	0.67, 1.20	0.5
2015	0.84	0.62, 1.12	0.2
2016	0.62	0.47, 0.83	0.001
2017	0.63	0.47, 0.84	0.002
2018	0.60	0.44, 0.81	<0.001
2019	0.33	0.23, 0.46	<0.001
Stage
2	—	—
0	1.58	0.63, 3.94	0.3
1	0.40	0.25, 0.65	<0.001
3	1.96	1.45, 2.64	<0.001
4	11.1	8.28, 15.0	<0.001
Surgery
No	—	—
Yes	0.53	0.45, 0.64	<0.001
Chemotherapy
No	—	—
Yes	1.16	0.88, 1.53	0.3
Radiation
No	—	—
Yes	1.06	0.82, 1.37	0.7
State
CA	—	—
GA	1.43	1.18, 1.73	<0.001
Colorectal type
colon	—	—
rectal	0.84	0.68, 1.03	0.10
Grade
2	—	—
1	0.85	0.64, 1.15	0.3
3	1.88	1.59, 2.21	<0.001
4	1.39	0.92, 2.08	0.12
Gene group
Negative	—	—
ATM	1.33	0.63, 2.81	0.5
BRCA 1/2	1.56	1.00, 2.44	0.048
CHEK2	0.99	0.49, 2.01	>0.9
Lynch Syndrome	0.58	0.43, 0.80	<0.001
Other PV	0.79	0.55, 1.13	0.2
PALB2	1.47	0.47, 4.62	0.5
VUS only	0.85	0.71, 1.03	0.10
¹ HR = Hazard Ratio, CI = Confidence Interval

Tests of heterogeneity

code

fvar<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$fr_var),scientific=FALSE)
ca<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$ca_test),scientific=FALSE)
lrt<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$loglik),scientific=FALSE)

p11<-table_fig(fvar,tbl_title="Variance",.9)
p12<-table_fig(ca,tbl_title="Commenges-Andersen test",1)
p13<-table_fig(lrt,tbl_title="Likelihood ratio test",.8)
cowplot::plot_grid(p11,p12,p13,nrow=1)

Pancreas

Sample description and missing data (click to view)

code

covar5<-c("agedec","race","sex","poverty","rurality","married","yeardx_cat","stage","surgcat","chemo","radiation","state","pvgroup")  #pancreas
can_type=5  # use this code for cancer_types 2,3,5
covar<-eval(as.name(paste0("covar",can_type,sep="")))
mort<-get_data(cancer_type=can_type)

my_labels<-list(agedec ~ "Age (in decades from mean age)", race ~ "Race", sex ~ "Sex",
                poverty ~ "Poverty", rurality ~ "Rurality", married ~ "Married", 
                yeardx_cat ~ "Year of diagnosis", stage ~ "Stage", 
                surgcat ~ "Surgery (Yes=1)", chemo ~ "Chemotherapy (Yes=1)", 
                radiation ~ "Radiation (Yes=1)", state~ "State", 
                pvgroup ~ "Gene group")
gtsummary::tbl_summary(mort,label=my_labels, missing_text="missing",include=covar) |>
    gtsummary::modify_caption("Table 1. Covariates - Pancreas")  |>
    gtsummary::modify_footnote(label = "Omitted category listed first for categorical variables" )

Table 1. Covariates - Pancreas
Characteristic¹	N = 1,861²
Age (in decades from mean age)	0.19 (-0.71, 0.89)
Race
NH White	1,224 (66%)
Asian	227 (12%)
Black	139 (7.5%)
Hispanic	261 (14%)
Other	10 (0.5%)
Sex
female	1,004 (54%)
male	857 (46%)
Poverty
<10% poverty	1,371 (74%)
10%-19% poverty	343 (18%)
20%+ poverty	147 (7.9%)
Rurality
rural	130 (7.0%)
urban	1,731 (93%)
Married	1,268 (68%)
Year of diagnosis
2013	50 (2.7%)
2014	65 (3.5%)
2015	98 (5.3%)
2016	172 (9.2%)
2017	200 (11%)
2018	395 (21%)
2019	881 (47%)
Stage
2	382 (22%)
0	3 (0.2%)
1	288 (16%)
3	320 (18%)
4	771 (44%)
missing	97
Surgery (Yes=1)	507 (27%)
Chemotherapy (Yes=1)	1,625 (87%)
Radiation (Yes=1)	177 (9.5%)
State
CA	1,518 (82%)
GA	343 (18%)
Gene group
Negative	1,032 (55%)
ATM	49 (2.6%)
BRCA 1/2	103 (5.5%)
CHEK2	23 (1.2%)
Lynch Syndrome	15 (0.8%)
Other PV	152 (8.2%)
PALB2	15 (0.8%)
VUS only	472 (25%)
¹ Omitted category listed first for categorical variables
² Median (Q1, Q3); n (%)

For pancreatic cancer the models stage 0 and 1 were collapsed given small sample sizes in those with Stage 0.

Model output

Random effects for gene group (as shown in figure 1)

code

mort$stage<-forcats::fct_collapse(mort$stage, "0-1" = c("0","1"))
# random effects
if(params$frailty=="on") {
    m_emfrail<-get_random(mort,can_type,covar)
}

# fixed effects
m_fixed<-get_fixed(mort,covar)

# graph
## process fixed effects
fe<-process_fixed(m_fixed)
## process random effects
file_name=paste("data/","m_emfrail",can_type,".Rdata",sep="")
load(file_name)
logz<-process_random(m_emfrail)
## combine and graph
datt<-combine_effects(fe,logz)
p5<-graph_effects_vert(datt)
p5 <- forestploter::edit_plot(p5,
               col = 1,
               row = c(1:3,7),
               gp = gpar(fontface = "italic"))
p5 <- forestploter::insert_text(p5, text = "Pancreatic Cancer",
              part = "header",
              col = 1:3,
              gp = gpar(fontface = "bold"))
my_pval<-1/2*pchisq(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],1, lower.tail=FALSE)
my_chi<-paste0("Likelihood ratio test for heterogeneity \U{03A7}\U{00B2} = ",round(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],2),", p",pvalr(my_pval))
p5 <- forestploter::insert_text(p5, text = my_chi,
   part = "header", just = "right",
   col = 3, gp = gpar(fontsize=10) )

ggsave(p5,filename=paste("img/","m_emfrail",can_type,".","svg",sep=""))
ggsave(p5,filename=paste("img/","m_emfrail",can_type,".","pdf",sep=""),device=cairo_pdf)

Fixed effects model (click to view)

code

my_labels<-list(agedec ~ "Age (in decades)", race ~ "Race",   poverty ~ "Poverty",
                rurality ~ "Rurality", married ~ "Married", yeardx_cat ~ "Year of diagnosis",
                stage ~ "Stage", surgcat ~ "Surgery", chemo ~ "Chemotherapy",
                radiation ~ "Radiation", state~ "State", 
                pvgroup ~ "Gene group")
gtsummary::tbl_regression(m_fixed,exponentiate=TRUE,label=my_labels)

Characteristic	HR¹	95% CI¹	p-value
Age (in decades)	1.13	1.07, 1.20	<0.001
Race
NH White	—	—
Asian	1.08	0.88, 1.33	0.5
Black	0.95	0.75, 1.21	0.7
Hispanic	1.19	0.98, 1.45	0.076
Other	1.55	0.73, 3.31	0.3
Sex
female	—	—
male	1.06	0.93, 1.20	0.4
Poverty
<10% poverty	—	—
10%-19% poverty	1.16	0.98, 1.38	0.091
20%+ poverty	1.38	1.08, 1.76	0.009
Rurality
rural	—	—
urban	1.14	0.89, 1.46	0.3
Married
No	—	—
Yes	0.89	0.78, 1.03	0.11
Year of diagnosis
2013	—	—
2014	0.82	0.54, 1.26	0.4
2015	0.69	0.46, 1.03	0.066
2016	0.67	0.46, 0.97	0.035
2017	0.61	0.42, 0.89	0.010
2018	0.58	0.41, 0.84	0.004
2019	0.44	0.31, 0.63	<0.001
Stage
2	—	—
0-1	0.86	0.67, 1.10	0.2
3	1.05	0.85, 1.31	0.6
4	1.52	1.25, 1.84	<0.001
Surgery
No	—	—
Yes	0.43	0.35, 0.52	<0.001
Chemotherapy
No	—	—
Yes	2.04	1.57, 2.64	<0.001
Radiation
No	—	—
Yes	0.87	0.70, 1.09	0.2
State
CA	—	—
GA	1.67	1.41, 1.96	<0.001
Gene group
Negative	—	—
ATM	0.87	0.58, 1.29	0.5
BRCA 1/2	0.68	0.52, 0.90	0.006
CHEK2	0.61	0.33, 1.15	0.13
Lynch Syndrome	0.32	0.12, 0.85	0.023
Other PV	1.12	0.89, 1.41	0.3
PALB2	1.03	0.51, 2.09	>0.9
VUS only	1.00	0.85, 1.16	>0.9
¹ HR = Hazard Ratio, CI = Confidence Interval

Tests of heterogeneity

code

fvar<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$fr_var),scientific=FALSE)
ca<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$ca_test),scientific=FALSE)
lrt<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$loglik),scientific=FALSE)

p11<-table_fig(fvar,tbl_title="Variance",.9)
p12<-table_fig(ca,tbl_title="Commenges-Andersen test",1)
p13<-table_fig(lrt,tbl_title="Likelihood ratio test",.8)
cowplot::plot_grid(p11,p12,p13,nrow=1)

Arrange plots

Extract legend from dummy plot

code

p_leg<-ggplot(
  datt, aes(x = ord, y = estimate, group = as.factor(type), 
            color = as.factor(type)) ) + 
  geom_point() + geom_line()  +
  scale_color_manual(values = c("#377eb8", "#4daf4a"), name="Estimates", 
                     labels=c("Fixed", "Random"),
                     guide = guide_legend(override.aes = list(size = 3,shape=22,
                                                              fill=c("#377eb8", "#4daf4a")) )
                     )  +
  theme(legend.key = element_rect(fill = "white"), 
        legend.text=element_text(size=12),legend.title=element_text(size=12))

extract_legend<-function(myggplot){
    tmp <- ggplot_gtable(ggplot_build(myggplot))
    leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
    legend <- tmp$grobs[[leg]]
    return(legend)
}
my_legend<-extract_legend(p_leg)
#cowplot::plot_grid(p1,my_legend)

Plot 5 figures and legend together for Figure 1.

code

p_final<-cowplot::plot_grid(p1,p2,p3,p4,p5,my_legend, ncol=2,
                            align="AUTO")
save(p_final,file="data/p_final.Rdata")
ggplot2::ggsave(p_final, 
                filename = "final_plot.pdf", 
                path = "img/", 
                units = "in", width =11.6, height = 15, 
                dpi = 300, bg = "white",device=cairo_pdf)

References

Austin, Peter C. 2017. “A Tutorial on Multilevel Survival Analysis: Methods, Models and Applications.” International Statistical Review = Revue Internationale de Statistique 85 (2): 185. https://doi.org/10.1111/insr.12214.

Balan, Theodor Adrian, and Hein Putter. 2019. “frailtyEM : An R Package for Estimating Semiparametric Shared Frailty Models.” Journal of Statistical Software 90 (7). https://doi.org/10.18637/jss.v090.i07.

Balan, Theodor A., and Hein Putter. 2020. “A tutorial on frailty models.” Statistical Methods in Medical Research 29 (11): 3424–54. https://doi.org/10.1177/0962280220921889.

Elwert, Felix, and Christopher Winship. 2014. “Endogenous Selection Bias: The Problem of Conditioning on a Collider Variable.” Annual Review of Sociology 40 (1): 31–53. https://doi.org/10.1146/annurev-soc-071913-043455.

Hall, Michael J., Julia E. Reid, Lynn A. Burbidge, Dmitry Pruss, Amie M. Deffenbaugh, Cynthia Frye, Richard J. Wenstrup, Brian E. Ward, Thomas A. Scholl, and Walter W. Noll. 2009. “BRCA1 and BRCA2 Mutations in Women of Different Ethnicities Undergoing Testing for Hereditary Breast-Ovarian Cancer.” Cancer 115 (10): 2222. https://doi.org/10.1002/cncr.24200.

Kurian, Allison W., Paul Abrahamse, Allison Furgal, Kevin C. Ward, Ann S. Hamilton, Rachel Hodan, Rachel Tocco, et al. 2023. “Germline Genetic Testing After Cancer Diagnosis.” JAMA 330 (1): 43–51. https://doi.org/10.1001/jama.2023.9526.

Lipsky, Ari M., and Sander Greenland. 2022. “Causal Directed Acyclic Graphs.” JAMA, February. https://doi.org/10.1001/jama.2022.1816.

McElreath, Richard. 2020. Statistical Rethinking: A Bayesian Course with Examples in r and Stan. 2nd ed. CRC Texts in Statistical Science. Boca Raton: Taylor; Francis, CRC Press.

Pearl, Judea. 1995. “Causal Diagrams for Empirical Research.” Biometrika 82 (4): 669–88. https://doi.org/10.2307/2337329.

———. 2010. “An Introduction to Causal Inference.” The International Journal of Biostatistics 6 (2): 7. https://doi.org/10.2202/1557-4679.1203.

Therneau, Terry M., Thomas Lumley, Elizabeth Atkinson, and Crowson Cynthia. 2024. Survival: Survival Analysis. https://cran.r-project.org/web/packages/survival/index.html.