Main Analyses

Re-running frailty models currently set to off

Last run October 25, 2024 at 2:54PM-3:00PM (check modified date saved results)

# functions used for each cancer type
get_data<-function(cancer_type) {
  mort<-  haven::read_dta("data/mort.dta") |>    
    dplyr::filter(cancer_site1==cancer_type) |>  
    haven::as_factor() |>
    dplyr::mutate(stage = relevel(as.factor(stage), 3)) |>      # set base stage II
    dplyr::mutate(agedec = agedec-mean(agedec)) |>
    dplyr::mutate(pvgroup = relevel(as.factor(pvgroup), 5)) |>    # set base none
    dplyr::mutate(grade = relevel(as.factor(grade),2))  |>
    dplyr::mutate(race = relevel(race,4)) |>
    dplyr::mutate(rurality = relevel(rurality,1)) |>
    dplyr::mutate(poverty = relevel(poverty,3)) 
  return(mort)
}

get_fixed<-function(data,covar) {
  outcome<-"survival::Surv(survtime, event)"
  f<-as.formula(paste(outcome,paste(covar,collapse="+"),sep="~"))
  m_fixed<-survival::coxph(f, data, ties = "breslow")
  return(m_fixed)
}

get_random<-function(data,can_type,covar) {
  library(coxme)
  outcome<-"survival::Surv(survtime, event)"
  f<-as.formula(paste(outcome,paste(paste(covar[-length(covar)],collapse="+"),
                                    "cluster(pvgroup)", sep="+"),sep="~"))
  m_emfrail<-frailtyEM::emfrail(f, data)
  save_file<-paste0("data/","m_emfrail",can_type,".Rdata")
  save(m_emfrail,file=save_file)
  return(m_emfrail)
}

process_fixed<-function(data=m_fixed) {
  library(tidyverse)
  # Obtain center estimates and calculate standard error
  pvgroup <- grep("pvgroup", names(m_fixed$coefficients))
  nc <- length(pvgroup)
  var_pvgroup <- m_fixed$var[pvgroup, pvgroup]
  varmean <- sum(var_pvgroup)/ nc^2
  
  newse <- sqrt(c(varmean, diag(var_pvgroup) + varmean - 2/nc * apply(var_pvgroup, 1, sum)))
  # change contrast for hr's to average
  fe<-data.frame(beta = c(pvgroup5 = 0, m_fixed$coefficients[pvgroup]), sd = newse) |> 
    rownames_to_column("pvgroup") |> 
    mutate(pvgroup = substr(pvgroup,8,25)) |>
    mutate( pvgroup = base::replace(pvgroup, pvgroup=="5", "Negative"))  |>
    mutate(sbeta = sum(beta) / nc) |> 
    mutate(beta = beta - sbeta) |> 
    arrange(beta) |>
    mutate(ord = 1:n()) |> 
    mutate(pvgroup = as.character(pvgroup)) |> 
    mutate(ymin = beta - 1.96 * sd, ymax = beta + 1.96 * sd) |> 
    rename(id = pvgroup, estimate = beta) |> 
    select(-sd, -sbeta, -ord) |>
    mutate(type = "fixed effects") |> 
    mutate(ord = 1:n()) 
  return(fe)
}


process_random <- function(data=m_emfrail) {
  library(data.table)
  frailties<-as.data.table(frailtyEM:::summary.emfrail(data)[[12]])
  logz <- frailties[order(z),.(id,z,lower_q, upper_q)] |>
              dplyr::mutate_if(is.numeric, log) |> 
              dplyr::rename(estimate = z, ymin = lower_q, ymax = upper_q) |> 
              dplyr::mutate(type = "frailty") |>
              dplyr::mutate(ord = 1:dplyr::n())
return(logz)
}

combine_effects<-function(fe,logz) {
  ord_re <- data.frame(id = logz$id, ord = logz$ord)
  datt <- fe |>  
      right_join(ord_re,join_by(id==id)) |> 
      select(-ord.y) |> rename(ord=ord.x) |> 
      bind_rows(logz)
  return(datt)
}

table_fig <- function(tblname,size=1, tbl_title,type=can_type) {
  cancer_type<-c("breast cancer triple-negative","breast cancer her2+","breast cancer her2-,ER/PR+","colorectal","pancreas")
  gt::gt(dplyr::as_tibble(tblname,rownames="type")) |> 
    gt::tab_header(title = tbl_title,subtitle=cancer_type[type]) |>
    gt::gtsave(paste0("stats_",deparse(substitute(tblname)),
                      "_table",type,".png"), path = "img/")
  cowplot::ggdraw() + cowplot::draw_image(paste0("img/",
                                                 "stats_",deparse(substitute(tblname)),"_table",can_type,".png"), scale = size)
}

graph_effects_vert<-function(data=datt) {
  library(grid)
  library(data.table)
  datt_DT<-as.data.table(datt)
  datt_DT[,type :=(ifelse(type=="fixed effects","fe","re"))]
  
  dt<-dcast(datt_DT,
#            id + ord ~ type,
            id ~ type,
            value.var = c("estimate","ymin","ymax")) 
  dt$` ` <- paste(rep(" ", 35), collapse = " ")
  ests<-c("estimate_fe","estimate_re","ymin_fe","ymin_re","ymax_fe","ymax_re")
  dt[,(ests):=lapply(.SD, function(x) exp(x)),.SDcols = ests]
  
  #  form for estimates 1.2 (0.2, 2.9)\n1.6 (0.4, 3.5)
  dt$CI <- paste(sprintf("%.1f (%.1f, %.1f)", dt$estimate_fe, dt$ymin_fe, dt$ymax_fe),
                 sprintf("%.1f (%.1f, %.1f)", dt$estimate_re, dt$ymin_re, dt$ymax_re),
                 sep = "\n")
  setnames(dt, old = "id", new = "Gene")
  tm <- forestploter::forest_theme(base_size = 10,
                     refline_gp = gpar(lty="solid"),
                     ci_pch = c(15, 15),
                     ci_col = c("#377eb8", "#4daf4a"),
                     footnote_gp = gpar(col = "blue"),
                     legend_name = "Estimates",
                     legend_value = c("Fixed effects", "Random effects"),
                     legend_position = "none",
                     vertline_lty = c("dashed"),
                     vertline_col = c("#d6604d"),
                     # Table cell padding, width 4 and heights 3
                     core = list(padding = unit(c(4, 3), "mm")),
                     gp = gpar(fontface = "italic")
                     )
p<-forestploter::forest(dt[,c(1,8, 9)],
                          est = list(dt$estimate_fe,dt$estimate_re),
                          lower = list(dt$ymin_fe,dt$ymin_re), 
                          upper = list(dt$ymax_fe,dt$ymax_re),
                          ci_column = 2,
                          ref_line = 1, 
                          x_trans = "log",
                          ticks_at = c(0.25,0.5, 2,4),
                          arrow_lab = c("Lower risk", "Higher risk"),
                          nudge_y = 0.4,
                          xlim=c(.2,5),
                          theme=tm)
  return(p)
}
  
pvalr <- function(pvals, sig.limit = .001, digits = 3, html = FALSE) {
# https://stackoverflow.com/questions/23018256/printing-p-values-with-0-001
  roundr <- function(x, digits = 1) {
    res <- sprintf(paste0('%.', digits, 'f'), x)
    zzz <- paste0('0.', paste(rep('0', digits), collapse = ''))
    res[res == paste0('-', zzz)] <- zzz
    res
  }
  
  sapply(pvals, function(x, sig.limit) {
    if (x < sig.limit)
      if (html)
        return(sprintf('&lt; %s', format(sig.limit))) else
          return(sprintf('< %s', format(sig.limit)))
    if (x > .1)
      return(paste("=",roundr(x, digits = 2))) else
        return(paste("=",roundr(x, digits = digits)))
  }, sig.limit = sig.limit)
}

(The algorithms for these functions used in the analysis and some of the manipulation of the results were based on examples from (Theodor A. Balan and Putter 2020). Any errors identified in adapting them or in their application to our example are ours alone.)

Stata code used to create the analytic data set of all 5 cancers from file of SEER variables distributed by project data steward

code
use "data/mort imputed 1013.dta"
mi extract 0
* reorder to match Paul's outputs
recode cancer_sitegroup  (3=1 "breast, triple negative") ///
  (2=2 "breast, HER2+") (1=3 "breast, ER/PR+ HER2-") (4=4 "colorectal") ///
  (5=5 pancreas),gen(cancer_site1)
* surgcat has 5 categories should have y/n by Paul's outputs
gen surgcat=surg>0
recode sexrecode (1 9 =1 male) (2=0 female),gen(sex) label(male_sex)
label values sex male_sex
label var sex "Sex"
tostring yeardx,gen(year_str)
encode year_str,gen(yeardx_cat)
label define yn 1 "Yes" 0 "No"
label values married surgcat chemo radiation her2therapy yn
recode colon_rectum (1=0) (2=.) (3=1),gen(colon_rectal)
label define colon_rectal 1 rectal 0 colon 
label values colon_rectal colon_rectal
label var colon_rectal "Colorectal type"
compress
qui dtable i.(pvgroup race age_group poverty rurality married yeardx_cat stage ///
                surgcat chemo radiation state grade her2therapy colon_rectum) ///
              ,by(cancer_site1,nototals) export(table1.html,replace)
* switch to study window end 12/31/2021
stset, clear
stset time22, failure(death22)
gen survtime=_t
gen event=_d
save "data/mort",replace
de pvgroup

Overview of modeling strategy

We used directed acyclic graphs to justify the selection of variables for the analysis.(Pearl 1995; Lipsky and Greenland 2022)

The primary predictor of interest is the gene group describing the presence or absence of specific pathogenic variants. The outcome is mortality. In addition the following variable groups were included:

Covariate list

Clinical Treatment Shared ancestry Access and quality Selection into cohort
Age at diagnosis Chemotherapy Race Poverty ( county level) Genetic testing
Stage Surgery Rurality (county level)
Grade (cancer-specific) Radiation State
Rectal vs colon (cancer-specific) Receipt of HER2-therapy (cancer-specific)

As only people from the SEER registry who had genetic testing within 6 months of diagnosis were included, all of the results are conditioned on receipt of genetic testing even though this variable is not formally included in our analysis.

The goal of the analysis is to describe whether people who have one of several pathogenic variants (PVs) or gene groups would be predicted to have a different mortality hazard in comparison to the average hazard of people with the cancer. More specifically, for a person who has been diagnosed and is planning or undergoing treatment with a cancer of known stage and grade and then finds out that they have a PV, the pertinent question becomes whether this finding will have any implications for their mortality risk beyond what they already know about their diagnosis.

We could expect that the presence of a PV can potentially modify the age at which one gets cancer, the stage or grade at which it occurs and the treatment that is received. These clinical and treatment variables are then mediators between the presence of a PV and mortality as represented in the DAG below. However, given that a patient already knows that information, to answer the question described requires trying to estimate any direct effect of PV on mortality independent of any paths through the described mediators and any possible confounders of the relationship.

code
library(dagitty)
library(ggdag)
risk<-dagify(
  Y ~ PV + C + M + C2, PV ~ C, M ~ PV + C2, 
  coords =list(
    x=c(Y=7,C=3, PV=0, M =3 ,C2 = 7 ), 
    y=c(Y=0,C=-3,PV=0, M = 3, C2 = 3  )
    ),
  exposure="PV",
  outcome="Y",
  labels=c(PV="PV", C= "PV-outcome \n Confounder" , Y=" Mortality", M = "Clinical/treatment Mediator",
           C2="Mediator-outcome \n confounder")
) 
myplot(risk)

We wish to block all the clinical/treatment mediators as well as any confounders of the PV - mortality relationship in order to test for whether there is any direct effect of PV on mortality. Unfortunately, once we adjust for the mediators we need to also account for any mediator - outcome and any PV - mediator confounders as well. If we do not then the correlations in that pathway will be included in our estimate of the PV - mortality relationship.(Elwert and Winship 2014)

In the more detailed DAG below we show the mediators whose paths we wish to block (age at diagnosis, stage/grade and the treatment received which includes chemotherapy, surgery and radiation). We also include the receipt of HER2-therapy as a disease specific treatment variable for breast cancer and Rectal vs Colon location for colorectal cancer.

code
risk<-dagify(
  Y ~  PV + M1 + M2 + M3 + C2 + R + YD, PV ~ R,
    M1 ~PV + C2, M2 ~ PV + C2, M3 ~ PV + C2,
    M3 ~ M1 + M2 + YD, R ~ C, 
    GT ~ YD + M1 + M2 + C2 + R,  
  coords =list(
    x=c(Y=7,C=1.5, R = 1.5, PV=0, M1 = 1.5 , M2 = 2.5 ,M3 = 3,C2 = 7,YD= 5 ,GT = 7), 
    y=c(Y=0,C=-3.5,R = -1.5, PV=0, M1 = 1, M2=3 , M3= 6 , C2 = 3, YD=6, GT = 5  )
    ),
  exposure="PV",
  outcome="Y",
  labels=c(PV="PV", C= "Shared Ancestry" , Y=" Mortality", M1 = "Age at diagnosis",
           M2 = "Stage/grade", M3="Treatment", C2="Mediator-outcome \n confounders",
           R = "race/ethnicity", YD = "year of diagnosis", GT = "gene testing")
) 

set.seed(7)
set.seed(9670)
myplot(risk)

There are not many confounders of the direct PV - mortality or PV - mediator relationships as in order to be a confounder it has to cause (requiring it to come before) the PV as well as the outcome. However, the most commonly raised potential confounder of gene outcome relationships is shared ancestry. The primary measure of shared ancestry that we have is the SEER race/ethnicity variable. Of the gene groups we have BRCA1/2 is the only one with an obvious shared ancestry group of Ashkenazi Jews who share a founder mutation. However, apart from this one group it seems that BRCA1/2 prevalence is not uncommon and quite similar across race and ethnic groups(Hall et al. 2009) and the latter seems true for the other gene groups. But we decided to still include this race variable as as a proxy for shared ancestry.

Again, the biggest problem with estimating the direct effect of PV on mortality is that once we have included the mediators we have to now block all the potential mediator-outcome confounders labeled collectively in the green box in the DAG. For example, given that we hypothesize that PVs may cause treatment changes, then any confounder of the treatment -> mortality relationship, such as access or quality of care poverty which limits access to treatment, opens up a collider path with respect to the PV mortality relationship. Inclusion of treatment as a collider, while it blocks the mediating path from PV -> Treatment -> Mortality, it will open a path from PV -> treatment <- Poverty -> Mortality. Thus we include the county level variables poverty, rurality as well as state as a proxy for quality and access related potential mediator-outcome confounders.

A second potential mediator-outcome confounder is the year of diagnosis which can cause treatment changes if cancer therapy changes and also might have an effect on outcome due to other changes in supportive care occur over time. As such it also has to be blocked to avoid a backdoor path from the treatment mediator to the outcome which would bias the PV - mortality relationship. An interesting feature of year of diagnosis is that selection into our cohort changed quite a bit from 2013-2019 as genetic testing rates increased significantly.(Kurian et al. 2023) Thus year of diagnosis is causally related to genetic testing. Given the additional causal pathways from the clinical characteristics of the cancer and race to genetic testing, a pathway opens up from year of diagnosis to mortality as the implied genetic testing covariate is a collider on this pathway.(Elwert and Winship 2014) This explains an odd finding seen in the regression results which is a large decrease in the hazard ratio for later years of diagnosis as genetic testing spreads from the highest risk groups to lower risk groups and the cohorts include an increasing number of people with lower clinical risk.

People do not get into the cohort unless they have a genetic test. This automatically includes genetic testing as a covariate whether we want it or not. Genetic testing is more common in higher risk cancers (early age of diagnosis or high stage and grade) and also potentially causally related to race/ethnicity as well as some of the access and quality variables and the relationships are changing over time. The concern of the selection mechanism would be if it introduced any backdoor paths between PV and mortality. However the DAG is getting a bit complicated to for us to easily detect this at this point.

Fortunately, once we have encoded the causal relationships we propose exist we can use functions that apply an axiomatic system for replacing probability formulas containing the causal relationships with ordinary conditional probabilities. That allows us to identify whether the adjustment set is able to evaluate a specific relationship without bias. This axiomatic system has the name ‘do-calculus’ invented by Judea Pearl.(Pearl 2010) Below we can show that if we condition on the variables in the DAG above the relationship between PV and mortality are d-separated which would allow us to estimate the presence or absence of a direct effect.

code
risk1<-dagify(
  Y ~   M1 + M2 + M3 + C2 + R + YD, PV ~ R,
    M1 ~PV + C2, M2 ~ PV + C2, M3 ~ PV + C2,
    M3 ~ M1 + M2 + YD, R ~ C, 
    GT ~ YD + M1 + M2 + C2 + R,  
  coords =list(
    x=c(Y=7,C=1.5, R = 1.5, PV=0, M1 = 1.5 , M2 = 2.5 ,M3 = 3,C2 = 7,YD= 5 ,GT = 7), 
    y=c(Y=0,C=-3.5,R = -1.5, PV=0, M1 = 1, M2=3 , M3= 6 , C2 = 3, YD=6, GT = 5  )
    ),
  exposure="PV",
  outcome="Y",
  labels=c(PV="PV", C= "Shared Ancestry" , Y=" Mortality", M1 = "Age at diagnosis",
           M2 = "Stage/grade", M3="Treatment", C2="Mediator-outcome \n confounders",
           R = "race/ethnicity", YD = "year of diagnosis", GT = "gene testing")
) 
set.seed(3920)
ggdag::ggdag_dseparated(
  risk1,
  from = "PV",
  to = "Y",
  controlling_for = c("M1", "M2", "M3","R","C2","GT","YD"),
  text = FALSE,
  use_labels  = "label") +
  theme_dag() +
  labs(title = "If we add the confounders of all the mediators then PV and Y \n are independent as they should be if there is no direct path")

Based on this we can argue that we may be able to identify the direct effect of PV on mortality. However, in case it is not obvious the two biggest assumptions in the selection of covariates with respect to identifying the PV - mortality relationship are as follows:

  1. There are no unmeasured confounders beyond race that capture important shared ancestry features that would cause both the PV and higher mortality.
  2. The covariate set of county level variables poverty, rurality as well as state are an adequate proxy for the quality and access related potential mediator-outcome confounders.

Although not always clear, both of these assumptions are required for all gene outcome studies if they condition on any treatment or clinical severity variables, not just ours. However, we believe proposing a specific DAG provides a principled way of justifying the selection of covariates and allows us to start to try to identify the key variable sets for making these estimates. Perhaps most importantly it offers a transparent way for others to critique our choices and to propose and defend through additional analyses the need for other covariates.

Fixed effects model

The fixed effects modeled are estimated using the coxph procedure from the Survival package.(Therneau et al. 2024)

Use of multilevel models to estimate gene group random effects

The implementation of multilevel models in survival data uses the term frailty models for a survival regression model that includes random effects. The term is often specifically used for those models that only include random effects for the intercept.(Austin 2017) When the random effects are shared by a cluster the models are known as shared frailty models. Given the mathematical form of the survival model, the random intercept in the linear predictor has a multiplicative effect on the hazard function.

For our purposes we could propose a data-generating hypothesis for the use of multilevel models that involves viewing the pathogenic variants that have been found in our population as a random sample of all the possible genetic variations that can have an effect on the mortality of people who develop cancer. However, in this case we are primarily using a multilevel model as a strategy to avoid over-fitting. Given that we are fitting 8 gene groups over 5 cancers there are a total of 40 estimates of interest. There is heterogeneity in the size of the gene groups with some quite small. Multilevel modeling can be thought of “as adaptive regularization where the model itself tries to learn how skeptical it should be.”(McElreath 2020) In this sense it has some similarities to other strategies to avoid over-fitting such as ridge regression. The gamma distribution acts as a regularizing prior for the gene effects although in this case the parameters for that prior are learned from the data.

For estimation we used the profile EM (expectation-maximization) algorithm as implemented in the FrailtyEM package.(Theodor Adrian Balan and Putter 2019)

Breast Cancer Models

Triple negative

code
can_type=1
covar<-eval(as.name(paste0("covar",can_type,sep="")))
mort <-get_data(cancer_type=can_type) 
my_labels<-list(agedec ~ "Age (in decades from mean age)", race ~ "Race", 
                poverty ~ "Poverty", rurality ~ "Rurality", married ~ "Married", 
                yeardx_cat ~ "Year of diagnosis", stage ~ "Stage", 
                surgcat ~ "Surgery (Yes=1)", chemo ~ "Chemotherapy (Yes=1)", 
                radiation ~ "Radiation (Yes=1)", state~ "State", 
                her2therapy ~ "HER2 treatment (Yes=1)", grade ~ "Grade", 
                pvgroup ~ "Gene group")
gtsummary::tbl_summary(mort,label=my_labels, missing_text="missing",include=covar) |>
    gtsummary::modify_caption("Table 1. Covariates - Triple negative") |>
    gtsummary::modify_footnote(label = "Omitted category listed first for categorical variables" )
Table 1. Covariates - Triple negative
Characteristic1 N = 11,8422
Age (in decades from mean age) -0.05 (-0.85, 0.75)
Race
    NH White 6,061 (51%)
    Asian 1,105 (9.3%)
    Black 2,078 (18%)
    Hispanic 2,417 (20%)
    Other 181 (1.5%)
Poverty
    <10% poverty 8,100 (68%)
    10%-19% poverty 2,540 (21%)
    20%+ poverty 1,202 (10%)
Rurality
    rural 788 (6.7%)
    urban 11,054 (93%)
Married 6,974 (59%)
Year of diagnosis
    2013 1,348 (11%)
    2014 1,425 (12%)
    2015 1,606 (14%)
    2016 1,529 (13%)
    2017 1,668 (14%)
    2018 1,974 (17%)
    2019 2,292 (19%)
Stage
    2 4,022 (37%)
    0 1,671 (16%)
    1 2,986 (28%)
    3 1,660 (15%)
    4 414 (3.9%)
    missing 1,089
Surgery (Yes=1) 7,751 (65%)
Chemotherapy (Yes=1) 8,295 (70%)
Radiation (Yes=1) 1,488 (13%)
State
    CA 8,640 (73%)
    GA 3,202 (27%)
HER2 treatment (Yes=1) 318 (2.7%)
Grade
    2 1,890 (18%)
    1 345 (3.2%)
    3 8,287 (78%)
    4 105 (1.0%)
    missing 1,215
Gene group
    Negative 7,748 (65%)
    ATM 36 (0.3%)
    BRCA 1/2 1,322 (11%)
    CHEK2 69 (0.6%)
    Lynch Syndrome 48 (0.4%)
    Other PV 379 (3.2%)
    PALB2 128 (1.1%)
    VUS only 2,112 (18%)
1 Omitted category listed first for categorical variables
2 Median (Q1, Q3); n (%)

Model output

Random effects for gene group (as shown in figure 1)
code
# random effects
if(params$frailty=="on") {
  m_emfrail<-get_random(mort,can_type,covar)
}

# fixed effects
m_fixed<-get_fixed(mort,covar)

# graph
## process fixed effects
fe<-process_fixed(m_fixed)
## process random effects
file_name=paste("data/","m_emfrail",can_type,".Rdata",sep="")
load(file_name)
logz<-process_random(m_emfrail)
## combine and graph
datt<-combine_effects(fe,logz)
p1<-graph_effects_vert(datt)
p1 <- forestploter::edit_plot(p1,
               col = 1,
               row = c(1:3,7),
               gp = gpar(fontface = "italic"))
p1 <- forestploter::insert_text(p1, text = "Triple-Negative Breast Cancer",
              part = "header",
              col = 1:3,
              gp = gpar(fontface = "bold"))
my_pval<-1/2*pchisq(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],1, lower.tail=FALSE)
my_chi<-paste0("Likelihood ratio test for heterogeneity \U{03A7}\U{00B2} = ",
               round(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],2),", p",pvalr(my_pval))
p1 <- forestploter::insert_text(p1, text = my_chi,
   part = "header", just = "right",
   col = 3, gp = gpar(fontsize=10) )

ggsave(p1,filename=paste("img/","m_emfrail",can_type,".","svg",sep=""))
ggsave(p1,filename=paste("img/","m_emfrail",can_type,".","pdf",sep=""),device=cairo_pdf)

Gene ATM BRCA 1/2 CI CHEK2 Lynch Syndrome Negative Other PV PALB2 VUS only 3.7 (1.8, 7.7) 1.2 (0.8, 1.6) 0.6 (0.4, 0.9) 0.7 (0.6, 0.8) 0.3 (0.1, 1.2) 0.9 (0.6, 1.3) 2.0 (0.9, 4.4) 1.1 (0.7, 1.5) 1.0 (0.8, 1.3) 1.1 (1.0, 1.2) 0.9 (0.6, 1.3) 1.0 (0.7, 1.3) 0.7 (0.4, 1.3) 0.9 (0.7, 1.3) 1.0 (0.7, 1.3) 1.1 (0.9, 1.2) 0.25 0.5 2 4 Lower risk Higher risk Triple-Negative Breast Cancer Likelihood ratio test for heterogeneity Χ² = 16.19, p< 0.001

code
gtsummary::tbl_regression(m_fixed,exponentiate=TRUE,label=my_labels)
Characteristic HR1 95% CI1 p-value
Age (in decades from mean age) 1.00 0.95, 1.05 >0.9
Race


    NH White
    Asian 0.71 0.55, 0.92 0.010
    Black 1.11 0.95, 1.29 0.2
    Hispanic 1.09 0.93, 1.27 0.3
    Other 0.38 0.14, 1.03 0.057
Poverty


    <10% poverty
    10%-19% poverty 0.91 0.77, 1.07 0.2
    20%+ poverty 1.09 0.90, 1.33 0.4
Rurality


    rural
    urban 0.87 0.71, 1.07 0.2
Married


    No
    Yes 0.97 0.86, 1.09 0.6
Year of diagnosis


    2013
    2014 1.16 0.95, 1.41 0.2
    2015 1.14 0.94, 1.38 0.2
    2016 0.89 0.72, 1.09 0.3
    2017 0.87 0.70, 1.07 0.2
    2018 0.57 0.46, 0.72 <0.001
    2019 0.44 0.34, 0.57 <0.001
Stage


    2
    0 0.07 0.04, 0.13 <0.001
    1 0.37 0.30, 0.45 <0.001
    3 3.67 3.20, 4.20 <0.001
    4 11.3 9.49, 13.4 <0.001
Surgery (Yes=1)


    No
    Yes 1.60 1.37, 1.86 <0.001
Chemotherapy (Yes=1)


    No
    Yes 0.81 0.67, 0.99 0.035
Radiation (Yes=1)


    No
    Yes 1.60 1.33, 1.94 <0.001
State


    CA
    GA 1.25 1.07, 1.45 0.004
HER2 treatment (Yes=1)


    No
    Yes 1.14 0.86, 1.51 0.4
Grade


    2
    1 0.79 0.47, 1.32 0.4
    3 0.95 0.81, 1.11 0.5
    4 0.81 0.38, 1.73 0.6
Gene group


    Negative
    ATM 3.66 1.63, 8.21 0.002
    BRCA 1/2 0.62 0.51, 0.75 <0.001
    CHEK2 0.34 0.08, 1.38 0.13
    Lynch Syndrome 1.97 0.81, 4.75 0.13
    Other PV 0.89 0.62, 1.27 0.5
    PALB2 0.73 0.40, 1.33 0.3
    VUS only 0.99 0.84, 1.15 0.9
1 HR = Hazard Ratio, CI = Confidence Interval

Note the decreasing hazard ratio with year of diagnosis that is a product of the spread of genetic testing to lower clinical risk groups as discussed as part of the variable selection rationale in (model-strat?) above.

Tests of heterogeneity
code
fvar<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$fr_var),scientific=FALSE)
ca<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$ca_test),scientific=FALSE)
lrt<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$loglik),scientific=FALSE)

p11<-table_fig(fvar,tbl_title="Variance",.9)
p12<-table_fig(ca,tbl_title="Commenges-Andersen test",1)
p13<-table_fig(lrt,tbl_title="Likelihood ratio test",.8)
cowplot::plot_grid(p11,p12,p13,nrow=1)

Breast HER2+

code
covar2<-covar1
can_type=2
covar<-eval(as.name(paste0("covar",can_type,sep="")))
mort <-get_data(cancer_type=can_type)
# mylabels as defined above for triple negative dz
gtsummary::tbl_summary(mort,label=my_labels, missing_text="missing",include=covar) |>
    gtsummary::modify_caption("Table 1. Covariates - HER2+") |>
    gtsummary::modify_footnote(label = "Omitted category listed first for categorical variables" )
Table 1. Covariates - HER2+
Characteristic1 N = 9,9572
Age (in decades from mean age) -0.24 (-0.84, 0.76)
Race
    NH White 5,453 (55%)
    Asian 1,346 (14%)
    Black 1,149 (12%)
    Hispanic 1,928 (19%)
    Other 81 (0.8%)
Poverty
    <10% poverty 7,011 (70%)
    10%-19% poverty 2,054 (21%)
    20%+ poverty 892 (9.0%)
Rurality
    rural 612 (6.1%)
    urban 9,345 (94%)
Married 6,244 (63%)
Year of diagnosis
    2013 1,066 (11%)
    2014 1,182 (12%)
    2015 1,435 (14%)
    2016 1,442 (14%)
    2017 1,477 (15%)
    2018 1,556 (16%)
    2019 1,799 (18%)
Stage
    2 3,508 (37%)
    0 276 (2.9%)
    1 3,992 (42%)
    3 1,173 (12%)
    4 547 (5.8%)
    missing 461
Surgery (Yes=1) 6,812 (68%)
Chemotherapy (Yes=1) 8,317 (84%)
Radiation (Yes=1) 1,018 (10%)
State
    CA 7,400 (74%)
    GA 2,557 (26%)
HER2 treatment (Yes=1) 7,087 (71%)
Grade
    2 3,639 (38%)
    1 455 (4.8%)
    3 5,349 (56%)
    4 36 (0.4%)
    missing 478
Gene group
    Negative 6,829 (69%)
    ATM 108 (1.1%)
    BRCA 1/2 351 (3.5%)
    CHEK2 209 (2.1%)
    Lynch Syndrome 25 (0.3%)
    Other PV 300 (3.0%)
    PALB2 45 (0.5%)
    VUS only 2,090 (21%)
1 Omitted category listed first for categorical variables
2 Median (Q1, Q3); n (%)

Model output

Random effects for gene group (as shown in figure 1)
code
# random effects
if(params$frailty=="on") {
    m_emfrail<-get_random(mort,can_type,covar)  #random w. Lynch, fixed w.out
}

# fixed effects
levels(mort$pvgroup)<-c("Negative","ATM","BRCA 1/2","CHEK2","Other PV","Other PV","PALB2","VUS only") # Lynch(4) combined w other(6)
m_fixed<-get_fixed(mort,covar)

# graph
## process fixed effects
fe<-process_fixed(m_fixed)
## process random effects
file_name=paste("data/","m_emfrail",can_type,".Rdata",sep="")
load(file_name)
logz<-process_random(m_emfrail)
## combine and graph
datt<-combine_effects(fe,logz)  |>
      mutate(type= replace_na(type,"fixed effects"))
p2<-graph_effects_vert(datt)
p2 <- forestploter::edit_plot(p2,
               col = 1,
               row = c(1:3,7),
               gp = gpar(fontface = "italic"))
p2 <- forestploter::insert_text(p2, text = expression(bold(paste(bolditalic("HER2"),"-Positive Breast Cancer"))),
              part = "header",
              col = 1:3)
my_pval<-1/2*pchisq(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],1, lower.tail=FALSE)
my_chi<-paste0("Likelihood ratio test for heterogeneity \U{03A7}\U{00B2} = ",round(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],2),", p",pvalr(my_pval))
p2 <- forestploter::insert_text(p2, text = my_chi,
   part = "header", just = "right",
   col = 3, gp = gpar(fontsize=10) )

ggsave(p2,filename=paste("img/","m_emfrail",can_type,".","svg",sep=""))
ggsave(p2,filename=paste("img/","m_emfrail",can_type,".","pdf",sep=""),device=cairo_pdf)

Gene ATM BRCA 1/2 CI CHEK2 Lynch Syndrome Negative Other PV PALB2 VUS only 0.8 (0.2, 2.7) 1.0 (0.7, 1.2) 1.9 (1.0, 3.6) 1.0 (0.8, 1.3) 0.3 (0.0, 1.4) 0.9 (0.7, 1.2) NA (NA, NA) 1.0 (0.7, 1.3) 1.9 (1.1, 3.1) 1.2 (1.0, 1.3) 1.0 (0.4, 2.2) 1.0 (0.7, 1.2) 1.7 (0.5, 6.0) 1.0 (0.8, 1.3) 1.5 (0.9, 2.5) 1.0 (0.8, 1.1) 0.25 0.5 2 4 Lower risk Higher risk HER2 -Positive Breast Cancer Likelihood ratio test for heterogeneity Χ² = 0.89, p= 0.17

The fixed effects cox model does not converge to stable estimates for the pvgroup when the Lynch genes are included. There are relatively few observations (25) in those with HER2+ breast cancer. Thus Lynch was collapsed into the “Other PV” category for the fixed effects analysis. That is why the estimate is missing in the above figure.

code
my_labels<-list(agedec ~ "Age (in decades)", race ~ "Race", poverty ~ "Poverty",
                rurality ~ "Rurality", married ~ "Married", yeardx_cat ~ "Year of diagnosis",
                stage ~ "Stage", surgcat ~ "Surgery", chemo ~ "Chemotherapy",
                radiation ~ "Radiation", state~ "State", her2therapy ~ "HER2 treatment",
                grade ~ "Grade", pvgroup ~ "Gene group")
gtsummary::tbl_regression(m_fixed,exponentiate=TRUE,label=my_labels)
Characteristic HR1 95% CI1 p-value
Age (in decades) 1.10 1.01, 1.19 0.028
Race


    NH White
    Asian 1.09 0.80, 1.50 0.6
    Black 1.42 1.08, 1.88 0.013
    Hispanic 1.29 0.99, 1.69 0.057
    Other 0.86 0.21, 3.47 0.8
Poverty


    <10% poverty
    10%-19% poverty 0.99 0.75, 1.30 >0.9
    20%+ poverty 1.75 1.27, 2.42 <0.001
Rurality


    rural
    urban 1.30 0.84, 2.01 0.2
Married


    No
    Yes 0.78 0.64, 0.94 0.011
Year of diagnosis


    2013
    2014 1.02 0.76, 1.37 >0.9
    2015 0.83 0.61, 1.13 0.2
    2016 0.89 0.64, 1.23 0.5
    2017 0.81 0.56, 1.17 0.3
    2018 0.65 0.41, 1.02 0.060
    2019 0.67 0.38, 1.17 0.2
Stage


    2
    0 0.08 0.02, 0.32 <0.001
    1 0.34 0.24, 0.49 <0.001
    3 3.01 2.36, 3.86 <0.001
    4 8.84 6.82, 11.5 <0.001
Surgery


    No
    Yes 1.48 1.12, 1.94 0.005
Chemotherapy


    No
    Yes 0.56 0.41, 0.77 <0.001
Radiation


    No
    Yes 1.42 1.06, 1.92 0.020
State


    CA
    GA 1.54 1.19, 1.99 <0.001
HER2 treatment


    No
    Yes 0.66 0.52, 0.83 <0.001
Grade


    2
    1 1.07 0.58, 1.95 0.8
    3 1.41 1.13, 1.75 0.002
    4 0.83 0.11, 6.02 0.9
Gene group


    Negative
    ATM 0.43 0.11, 1.72 0.2
    BRCA 1/2 1.03 0.64, 1.67 0.9
    CHEK2 0.14 0.02, 1.02 0.052
    Other PV 0.53 0.24, 1.20 0.13
    PALB2 0.93 0.23, 3.80 >0.9
    VUS only 0.79 0.60, 1.04 0.090
1 HR = Hazard Ratio, CI = Confidence Interval

Note the decreasing hazard ratio with year of diagnosis that is a product of the spread of genetic testing to lower clinical risk groups as discussed as part of the variable selection rationale in (model-strat?) above.

Tests of heterogeneity
code
fvar<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$fr_var),scientific=FALSE)
ca<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$ca_test),scientific=FALSE)
lrt<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$loglik),scientific=FALSE)

p11<-table_fig(fvar,tbl_title="Variance",.9)
p12<-table_fig(ca,tbl_title="Commenges-Andersen test",1)
p13<-table_fig(lrt,tbl_title="Likelihood ratio test",.8)
cowplot::plot_grid(p11,p12,p13,nrow=1)

Breast ER/PR+, HER2-

code
covar3<-covar1
can_type=3  # use this code for cancer_types 2,3,5
covar<-eval(as.name(paste0("covar",can_type,sep="")))
mort<-get_data(cancer_type=can_type)
# mylabels as defined above for triple negative dz
gtsummary::tbl_summary(mort,label=my_labels, missing_text="missing",include=covar) |>
    gtsummary::modify_caption("Table 1. Covariates - ER/PR+,HER2-") |>
    gtsummary::modify_footnote(label = "Omitted category listed first for categorical variables" )
Table 1. Covariates - ER/PR+,HER2-
Characteristic1 N = 48,4732
Age (in decades) -0.19 (-0.99, 0.91)
Race
    NH White 29,727 (61%)
    Asian 5,827 (12%)
    Black 4,697 (9.7%)
    Hispanic 7,751 (16%)
    Other 471 (1.0%)
Poverty
    <10% poverty 34,965 (72%)
    10%-19% poverty 9,672 (20%)
    20%+ poverty 3,836 (7.9%)
Rurality
    rural 2,733 (5.6%)
    urban 45,740 (94%)
Married 30,684 (63%)
Year of diagnosis
    2013 4,811 (9.9%)
    2014 5,328 (11%)
    2015 6,039 (12%)
    2016 6,325 (13%)
    2017 6,827 (14%)
    2018 8,377 (17%)
    2019 10,766 (22%)
Stage
    2 10,033 (21%)
    0 8,001 (17%)
    1 24,382 (52%)
    3 3,350 (7.1%)
    4 1,172 (2.5%)
    missing 1,535
Surgery 25,282 (52%)
Chemotherapy 15,340 (32%)
Radiation 16,196 (33%)
State
    CA 36,878 (76%)
    GA 11,595 (24%)
HER2 treatment 747 (1.5%)
Grade
    2 23,404 (50%)
    1 12,285 (26%)
    3 11,262 (24%)
    4 209 (0.4%)
    missing 1,313
Gene group
    Negative 33,618 (69%)
    ATM 399 (0.8%)
    BRCA 1/2 1,975 (4.1%)
    CHEK2 723 (1.5%)
    Lynch Syndrome 206 (0.4%)
    Other PV 1,398 (2.9%)
    PALB2 343 (0.7%)
    VUS only 9,811 (20%)
1 Omitted category listed first for categorical variables
2 Median (Q1, Q3); n (%)

Model output

Random effects for gene group (as shown in figure 1)
code
# random effects
if(params$frailty=="on") {
  m_emfrail<-get_random(mort,can_type,covar)
}

# fixed effects
m_fixed<-get_fixed(mort,covar)

# graph
## process fixed effects
fe<-process_fixed(m_fixed)
## process random effects
file_name=paste("data/","m_emfrail",can_type,".Rdata",sep="")
load(file_name)
logz<-process_random(m_emfrail)
## combine and graph
datt<-combine_effects(fe,logz)
#plot(graph_effects_vert(datt)) 
p3<-graph_effects_vert(datt)
p3 <- forestploter::edit_plot(p3,
               col = 1,
               row = c(1:3,7),
               gp = gpar(fontface = "italic"))
p3 <- forestploter::insert_text(p3, text = expression(bold(paste("ER/PR-Positive",bolditalic(" HER2"),"-Negative Breast Cancer"))),
              part = "header",
              col = 1:3)
my_pval<-1/2*pchisq(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],1, lower.tail=FALSE)
my_chi<-paste0("Likelihood ratio test for heterogeneity \U{03A7}\U{00B2} = ",round(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],2),", p",pvalr(my_pval))
p3 <- forestploter::insert_text(p3, text = my_chi,
   part = "header", just = "right",
   col = 3, gp = gpar(fontsize=10) )

ggsave(p3,filename=paste("img/","m_emfrail",can_type,".","svg",sep=""))
ggsave(p3,filename=paste("img/","m_emfrail",can_type,".","pdf",sep=""),device=cairo_pdf)

Gene ATM BRCA 1/2 CI CHEK2 Lynch Syndrome Negative Other PV PALB2 VUS only 0.7 (0.4, 1.2) 1.0 (1.0, 1.0) 1.1 (0.9, 1.4) 1.0 (1.0, 1.0) 0.8 (0.5, 1.3) 1.0 (1.0, 1.0) 1.8 (1.0, 3.1) 1.0 (1.0, 1.0) 0.9 (0.8, 1.1) 1.0 (1.0, 1.0) 1.0 (0.7, 1.3) 1.0 (1.0, 1.0) 1.1 (0.7, 1.7) 1.0 (1.0, 1.0) 0.9 (0.7, 1.0) 1.0 (1.0, 1.0) 0.25 0.5 2 4 Lower risk Higher risk ER/PR-Positive HER2 -Negative Breast Cancer Likelihood ratio test for heterogeneity Χ² = 0, p= 0.50

code
my_labels<-list(agedec ~ "Age (in decades)", race ~ "Race", poverty ~ "Poverty",
                rurality ~ "Rurality", married ~ "Married", yeardx_cat ~ "Year of diagnosis",
                stage ~ "Stage", surgcat ~ "Surgery", chemo ~ "Chemotherapy",
                radiation ~ "Radiation", state~ "State", her2therapy ~ "HER2 treatment",
                grade ~ "Grade", pvgroup ~ "Gene group")
gtsummary::tbl_regression(m_fixed,exponentiate=TRUE,label=my_labels)
Characteristic HR1 95% CI1 p-value
Age (in decades) 1.24 1.19, 1.30 <0.001
Race


    NH White
    Asian 0.91 0.76, 1.10 0.4
    Black 1.40 1.20, 1.63 <0.001
    Hispanic 1.14 0.98, 1.32 0.082
    Other 0.90 0.45, 1.80 0.8
Poverty


    <10% poverty
    10%-19% poverty 1.20 1.04, 1.38 0.012
    20%+ poverty 1.51 1.26, 1.81 <0.001
Rurality


    rural
    urban 1.0 0.81, 1.22 >0.9
Married


    No
    Yes 0.79 0.71, 0.87 <0.001
Year of diagnosis


    2013
    2014 0.82 0.70, 0.96 0.016
    2015 0.82 0.70, 0.97 0.017
    2016 0.76 0.64, 0.90 0.002
    2017 0.60 0.49, 0.73 <0.001
    2018 0.60 0.48, 0.75 <0.001
    2019 0.53 0.41, 0.68 <0.001
Stage


    2
    0 0.14 0.10, 0.20 <0.001
    1 0.52 0.44, 0.61 <0.001
    3 2.95 2.58, 3.38 <0.001
    4 12.2 10.6, 14.2 <0.001
Surgery


    No
    Yes 1.46 1.28, 1.67 <0.001
Chemotherapy


    No
    Yes 1.30 1.14, 1.49 <0.001
Radiation


    No
    Yes 0.95 0.82, 1.10 0.5
State


    CA
    GA 1.54 1.35, 1.76 <0.001
HER2 treatment


    No
    Yes 1.01 0.79, 1.29 >0.9
Grade


    2
    1 0.64 0.53, 0.77 <0.001
    3 2.17 1.95, 2.42 <0.001
    4 1.02 0.32, 3.18 >0.9
Gene group


    Negative
    ATM 0.75 0.39, 1.44 0.4
    BRCA 1/2 1.23 1.03, 1.47 0.025
    CHEK2 0.91 0.58, 1.44 0.7
    Lynch Syndrome 1.97 1.05, 3.68 0.034
    Other PV 1.07 0.77, 1.50 0.7
    PALB2 1.20 0.73, 1.98 0.5
    VUS only 0.94 0.81, 1.09 0.4
1 HR = Hazard Ratio, CI = Confidence Interval

Note the decreasing hazard ratio with year of diagnosis that is a product of the spread of genetic testing to lower clinical risk groups as discussed as part of the variable selection rationale in (model-strat?) above.

Tests of heterogeneity
code
fvar<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$fr_var),scientific=FALSE)
ca<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$ca_test),scientific=FALSE)
lrt<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$loglik),scientific=FALSE)

p11<-table_fig(fvar,tbl_title="Variance",.9)
p12<-table_fig(ca,tbl_title="Commenges-Andersen test",1)
p13<-table_fig(lrt,tbl_title="Likelihood ratio test",.8)
cowplot::plot_grid(p11,p12,p13,nrow=1)

Colorectal

code
covar4<-c("agedec","race","sex","poverty","rurality","married","yeardx_cat","stage",
          "surgcat","chemo","radiation","state","colon_rectal","grade","pvgroup")  #colorectal
can_type=4   
covar<-eval(as.name(paste0("covar",can_type,sep="")))
mort<-get_data(cancer_type=can_type) 

my_labels<-list(agedec ~ "Age (in decades from mean age)", race ~ "Race", sex ~ "Sex",
                poverty ~ "Poverty", rurality ~ "Rurality", married ~ "Married", 
                yeardx_cat ~ "Year of diagnosis", stage ~ "Stage", 
                surgcat ~ "Surgery (Yes=1)", chemo ~ "Chemotherapy (Yes=1)", 
                radiation ~ "Radiation (Yes=1)", state~ "State", 
                colon_rectal ~ "Colorectal (Rectal=1)", grade ~ "Grade", 
                pvgroup ~ "Gene group")
gtsummary::tbl_summary(mort,label=my_labels, missing_text="missing",include=covar) |>
    gtsummary::modify_caption("Table 1. Covariates - Colorectal")  |>
    gtsummary::modify_footnote(label = "Omitted category listed first for categorical variables" )
Table 1. Covariates - Colorectal
Characteristic1 N = 5,8222
Age (in decades from mean age) -0.20 (-1.00, 1.00)
Race
    NH White 3,396 (58%)
    Asian 589 (10%)
    Black 534 (9.2%)
    Hispanic 1,244 (21%)
    Other 59 (1.0%)
Sex
    female 3,148 (54%)
    male 2,674 (46%)
Poverty
    <10% poverty 3,915 (67%)
    10%-19% poverty 1,315 (23%)
    20%+ poverty 592 (10%)
Rurality
    rural 399 (6.9%)
    urban 5,423 (93%)
Married 3,531 (61%)
Year of diagnosis
    2013 366 (6.3%)
    2014 509 (8.7%)
    2015 638 (11%)
    2016 787 (14%)
    2017 941 (16%)
    2018 1,103 (19%)
    2019 1,478 (25%)
Stage
    2 1,377 (25%)
    0 146 (2.6%)
    1 1,100 (20%)
    3 1,770 (32%)
    4 1,143 (21%)
    missing 286
Surgery (Yes=1) 4,798 (82%)
Chemotherapy (Yes=1) 3,344 (57%)
Radiation (Yes=1) 834 (14%)
State
    CA 4,483 (77%)
    GA 1,339 (23%)
Colorectal (Rectal=1)
    colon 4,148 (71%)
    rectal 1,674 (29%)
Grade
    2 3,609 (68%)
    1 722 (14%)
    3 881 (17%)
    4 103 (1.9%)
    missing 507
Gene group
    Negative 3,468 (60%)
    ATM 46 (0.8%)
    BRCA 1/2 88 (1.5%)
    CHEK2 48 (0.8%)
    Lynch Syndrome 661 (11%)
    Other PV 308 (5.3%)
    PALB2 9 (0.2%)
    VUS only 1,194 (21%)
1 Omitted category listed first for categorical variables
2 Median (Q1, Q3); n (%)

Model output

Random effects for gene group (as shown in figure 1)

code
# random effects
if(params$frailty=="on") {
  m_emfrail<-get_random(mort,can_type,covar)
}

# fixed effects
m_fixed<-get_fixed(mort,covar)

# graph
## process fixed effects
fe<-process_fixed(m_fixed)
## process random effects
file_name=paste("data/","m_emfrail",can_type,".Rdata",sep="")
load(file_name)
logz<-process_random(m_emfrail)
## combine and graph
datt<-combine_effects(fe,logz)
p4<-graph_effects_vert(datt)
p4 <- forestploter::edit_plot(p4,
               col = 1,
               row = c(1:3,7),
               gp = gpar(fontface = "italic"))
p4 <- forestploter::insert_text(p4, text = "Colorectal Cancer",
              part = "header",
              col = 1:3,
              gp = gpar(fontface = "bold"))
my_pval<-1/2*pchisq(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],1, lower.tail=FALSE)
my_chi<-paste0("Likelihood ratio test for heterogeneity \U{03A7}\U{00B2} = ",round(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],2),", p",pvalr(my_pval))
p4 <- forestploter::insert_text(p4, text = my_chi,
   part = "header", just = "right",
   col = 3, gp = gpar(fontsize=10) )

ggsave(p4,filename=paste("img/","m_emfrail",can_type,".","svg",sep=""))
ggsave(p4,filename=paste("img/","m_emfrail",can_type,".","pdf",sep=""),device=cairo_pdf)

Gene ATM BRCA 1/2 CI CHEK2 Lynch Syndrome Negative Other PV PALB2 VUS only 1.3 (0.7, 2.5) 1.1 (0.7, 1.4) 1.5 (1.0, 2.4) 1.2 (0.9, 1.6) 1.0 (0.5, 1.8) 1.0 (0.7, 1.4) 0.6 (0.4, 0.8) 0.7 (0.6, 0.9) 1.0 (0.8, 1.3) 1.1 (1.0, 1.2) 0.8 (0.5, 1.1) 0.9 (0.7, 1.2) 1.4 (0.5, 3.9) 1.0 (0.7, 1.4) 0.8 (0.6, 1.1) 0.9 (0.8, 1.1) 0.25 0.5 2 4 Lower risk Higher risk Colorectal Cancer Likelihood ratio test for heterogeneity Χ² = 4.47, p= 0.017

code
my_labels<-list(agedec ~ "Age (in decades)", race ~ "Race",   poverty ~ "Poverty",
                rurality ~ "Rurality", married ~ "Married", yeardx_cat ~ "Year of diagnosis",
                stage ~ "Stage", surgcat ~ "Surgery", chemo ~ "Chemotherapy",
                radiation ~ "Radiation", state~ "State", 
                grade ~ "Grade", pvgroup ~ "Gene group")
gtsummary::tbl_regression(m_fixed,exponentiate=TRUE,label=my_labels)
Characteristic HR1 95% CI1 p-value
Age (in decades) 1.16 1.10, 1.23 <0.001
Race


    NH White
    Asian 1.13 0.87, 1.46 0.4
    Black 1.09 0.86, 1.40 0.5
    Hispanic 1.26 1.03, 1.53 0.022
    Other 1.41 0.52, 3.81 0.5
Sex


    female
    male 1.10 0.95, 1.27 0.2
Poverty


    <10% poverty
    10%-19% poverty 0.97 0.80, 1.19 0.8
    20%+ poverty 1.28 0.99, 1.67 0.061
Rurality


    rural
    urban 0.84 0.64, 1.11 0.2
Married


    No
    Yes 0.91 0.78, 1.05 0.2
Year of diagnosis


    2013
    2014 0.89 0.67, 1.20 0.5
    2015 0.84 0.62, 1.12 0.2
    2016 0.62 0.47, 0.83 0.001
    2017 0.63 0.47, 0.84 0.002
    2018 0.60 0.44, 0.81 <0.001
    2019 0.33 0.23, 0.46 <0.001
Stage


    2
    0 1.58 0.63, 3.94 0.3
    1 0.40 0.25, 0.65 <0.001
    3 1.96 1.45, 2.64 <0.001
    4 11.1 8.28, 15.0 <0.001
Surgery


    No
    Yes 0.53 0.45, 0.64 <0.001
Chemotherapy


    No
    Yes 1.16 0.88, 1.53 0.3
Radiation


    No
    Yes 1.06 0.82, 1.37 0.7
State


    CA
    GA 1.43 1.18, 1.73 <0.001
Colorectal type


    colon
    rectal 0.84 0.68, 1.03 0.10
Grade


    2
    1 0.85 0.64, 1.15 0.3
    3 1.88 1.59, 2.21 <0.001
    4 1.39 0.92, 2.08 0.12
Gene group


    Negative
    ATM 1.33 0.63, 2.81 0.5
    BRCA 1/2 1.56 1.00, 2.44 0.048
    CHEK2 0.99 0.49, 2.01 >0.9
    Lynch Syndrome 0.58 0.43, 0.80 <0.001
    Other PV 0.79 0.55, 1.13 0.2
    PALB2 1.47 0.47, 4.62 0.5
    VUS only 0.85 0.71, 1.03 0.10
1 HR = Hazard Ratio, CI = Confidence Interval

Note the decreasing hazard ratio with year of diagnosis that is a product of the spread of genetic testing to lower clinical risk groups as discussed as part of the variable selection rationale in (model-strat?) above.

Tests of heterogeneity

code
fvar<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$fr_var),scientific=FALSE)
ca<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$ca_test),scientific=FALSE)
lrt<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$loglik),scientific=FALSE)

p11<-table_fig(fvar,tbl_title="Variance",.9)
p12<-table_fig(ca,tbl_title="Commenges-Andersen test",1)
p13<-table_fig(lrt,tbl_title="Likelihood ratio test",.8)
cowplot::plot_grid(p11,p12,p13,nrow=1)

Pancreas

code
covar5<-c("agedec","race","sex","poverty","rurality","married","yeardx_cat","stage","surgcat","chemo","radiation","state","pvgroup")  #pancreas
can_type=5  # use this code for cancer_types 2,3,5
covar<-eval(as.name(paste0("covar",can_type,sep="")))
mort<-get_data(cancer_type=can_type)

my_labels<-list(agedec ~ "Age (in decades from mean age)", race ~ "Race", sex ~ "Sex",
                poverty ~ "Poverty", rurality ~ "Rurality", married ~ "Married", 
                yeardx_cat ~ "Year of diagnosis", stage ~ "Stage", 
                surgcat ~ "Surgery (Yes=1)", chemo ~ "Chemotherapy (Yes=1)", 
                radiation ~ "Radiation (Yes=1)", state~ "State", 
                pvgroup ~ "Gene group")
gtsummary::tbl_summary(mort,label=my_labels, missing_text="missing",include=covar) |>
    gtsummary::modify_caption("Table 1. Covariates - Pancreas")  |>
    gtsummary::modify_footnote(label = "Omitted category listed first for categorical variables" )
Table 1. Covariates - Pancreas
Characteristic1 N = 1,8612
Age (in decades from mean age) 0.19 (-0.71, 0.89)
Race
    NH White 1,224 (66%)
    Asian 227 (12%)
    Black 139 (7.5%)
    Hispanic 261 (14%)
    Other 10 (0.5%)
Sex
    female 1,004 (54%)
    male 857 (46%)
Poverty
    <10% poverty 1,371 (74%)
    10%-19% poverty 343 (18%)
    20%+ poverty 147 (7.9%)
Rurality
    rural 130 (7.0%)
    urban 1,731 (93%)
Married 1,268 (68%)
Year of diagnosis
    2013 50 (2.7%)
    2014 65 (3.5%)
    2015 98 (5.3%)
    2016 172 (9.2%)
    2017 200 (11%)
    2018 395 (21%)
    2019 881 (47%)
Stage
    2 382 (22%)
    0 3 (0.2%)
    1 288 (16%)
    3 320 (18%)
    4 771 (44%)
    missing 97
Surgery (Yes=1) 507 (27%)
Chemotherapy (Yes=1) 1,625 (87%)
Radiation (Yes=1) 177 (9.5%)
State
    CA 1,518 (82%)
    GA 343 (18%)
Gene group
    Negative 1,032 (55%)
    ATM 49 (2.6%)
    BRCA 1/2 103 (5.5%)
    CHEK2 23 (1.2%)
    Lynch Syndrome 15 (0.8%)
    Other PV 152 (8.2%)
    PALB2 15 (0.8%)
    VUS only 472 (25%)
1 Omitted category listed first for categorical variables
2 Median (Q1, Q3); n (%)

For pancreatic cancer the models stage 0 and 1 were collapsed given small sample sizes in those with Stage 0.

Model output

Random effects for gene group (as shown in figure 1)

code
mort$stage<-forcats::fct_collapse(mort$stage, "0-1" = c("0","1"))
# random effects
if(params$frailty=="on") {
    m_emfrail<-get_random(mort,can_type,covar)
}

# fixed effects
m_fixed<-get_fixed(mort,covar)

# graph
## process fixed effects
fe<-process_fixed(m_fixed)
## process random effects
file_name=paste("data/","m_emfrail",can_type,".Rdata",sep="")
load(file_name)
logz<-process_random(m_emfrail)
## combine and graph
datt<-combine_effects(fe,logz)
p5<-graph_effects_vert(datt)
p5 <- forestploter::edit_plot(p5,
               col = 1,
               row = c(1:3,7),
               gp = gpar(fontface = "italic"))
p5 <- forestploter::insert_text(p5, text = "Pancreatic Cancer",
              part = "header",
              col = 1:3,
              gp = gpar(fontface = "bold"))
my_pval<-1/2*pchisq(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],1, lower.tail=FALSE)
my_chi<-paste0("Likelihood ratio test for heterogeneity \U{03A7}\U{00B2} = ",round(frailtyEM:::summary.emfrail(m_emfrail)$loglik[[3]],2),", p",pvalr(my_pval))
p5 <- forestploter::insert_text(p5, text = my_chi,
   part = "header", just = "right",
   col = 3, gp = gpar(fontsize=10) )

ggsave(p5,filename=paste("img/","m_emfrail",can_type,".","svg",sep=""))
ggsave(p5,filename=paste("img/","m_emfrail",can_type,".","pdf",sep=""),device=cairo_pdf)

Gene ATM BRCA 1/2 CI CHEK2 Lynch Syndrome Negative Other PV PALB2 VUS only 1.2 (0.8, 1.7) 1.0 (0.8, 1.2) 0.9 (0.7, 1.2) 0.8 (0.7, 1.0) 0.8 (0.5, 1.4) 0.9 (0.7, 1.2) 0.4 (0.2, 1.0) 0.9 (0.6, 1.1) 1.3 (1.1, 1.7) 1.1 (1.0, 1.2) 1.5 (1.1, 2.0) 1.2 (1.0, 1.4) 1.4 (0.7, 2.6) 1.0 (0.8, 1.3) 1.3 (1.1, 1.7) 1.1 (1.0, 1.2) 0.25 0.5 2 4 Lower risk Higher risk Pancreatic Cancer Likelihood ratio test for heterogeneity Χ² = 2.05, p= 0.076

code
my_labels<-list(agedec ~ "Age (in decades)", race ~ "Race",   poverty ~ "Poverty",
                rurality ~ "Rurality", married ~ "Married", yeardx_cat ~ "Year of diagnosis",
                stage ~ "Stage", surgcat ~ "Surgery", chemo ~ "Chemotherapy",
                radiation ~ "Radiation", state~ "State", 
                pvgroup ~ "Gene group")
gtsummary::tbl_regression(m_fixed,exponentiate=TRUE,label=my_labels)
Characteristic HR1 95% CI1 p-value
Age (in decades) 1.13 1.07, 1.20 <0.001
Race


    NH White
    Asian 1.08 0.88, 1.33 0.5
    Black 0.95 0.75, 1.21 0.7
    Hispanic 1.19 0.98, 1.45 0.076
    Other 1.55 0.73, 3.31 0.3
Sex


    female
    male 1.06 0.93, 1.20 0.4
Poverty


    <10% poverty
    10%-19% poverty 1.16 0.98, 1.38 0.091
    20%+ poverty 1.38 1.08, 1.76 0.009
Rurality


    rural
    urban 1.14 0.89, 1.46 0.3
Married


    No
    Yes 0.89 0.78, 1.03 0.11
Year of diagnosis


    2013
    2014 0.82 0.54, 1.26 0.4
    2015 0.69 0.46, 1.03 0.066
    2016 0.67 0.46, 0.97 0.035
    2017 0.61 0.42, 0.89 0.010
    2018 0.58 0.41, 0.84 0.004
    2019 0.44 0.31, 0.63 <0.001
Stage


    2
    0-1 0.86 0.67, 1.10 0.2
    3 1.05 0.85, 1.31 0.6
    4 1.52 1.25, 1.84 <0.001
Surgery


    No
    Yes 0.43 0.35, 0.52 <0.001
Chemotherapy


    No
    Yes 2.04 1.57, 2.64 <0.001
Radiation


    No
    Yes 0.87 0.70, 1.09 0.2
State


    CA
    GA 1.67 1.41, 1.96 <0.001
Gene group


    Negative
    ATM 0.87 0.58, 1.29 0.5
    BRCA 1/2 0.68 0.52, 0.90 0.006
    CHEK2 0.61 0.33, 1.15 0.13
    Lynch Syndrome 0.32 0.12, 0.85 0.023
    Other PV 1.12 0.89, 1.41 0.3
    PALB2 1.03 0.51, 2.09 >0.9
    VUS only 1.00 0.85, 1.16 >0.9
1 HR = Hazard Ratio, CI = Confidence Interval

Note the decreasing hazard ratio with year of diagnosis that is a product of the spread of genetic testing to lower clinical risk groups as discussed as part of the variable selection rationale in (model-strat?) above.

Tests of heterogeneity

code
fvar<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$fr_var),scientific=FALSE)
ca<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$ca_test),scientific=FALSE)
lrt<-format(as.list(frailtyEM:::summary.emfrail(m_emfrail)$loglik),scientific=FALSE)

p11<-table_fig(fvar,tbl_title="Variance",.9)
p12<-table_fig(ca,tbl_title="Commenges-Andersen test",1)
p13<-table_fig(lrt,tbl_title="Likelihood ratio test",.8)
cowplot::plot_grid(p11,p12,p13,nrow=1)

Arrange plots

Extract legend from dummy plot

code
p_leg<-ggplot(
  datt, aes(x = ord, y = estimate, group = as.factor(type), 
            color = as.factor(type)) ) + 
  geom_point() + geom_line()  +
  scale_color_manual(values = c("#377eb8", "#4daf4a"), name="Estimates", 
                     labels=c("Fixed", "Random"),
                     guide = guide_legend(override.aes = list(size = 3,shape=22,
                                                              fill=c("#377eb8", "#4daf4a")) )
                     )  +
  theme(legend.key = element_rect(fill = "white"), 
        legend.text=element_text(size=12),legend.title=element_text(size=12))

extract_legend<-function(myggplot){
    tmp <- ggplot_gtable(ggplot_build(myggplot))
    leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
    legend <- tmp$grobs[[leg]]
    return(legend)
}
my_legend<-extract_legend(p_leg)
#cowplot::plot_grid(p1,my_legend)

Plot 5 figures and legend together for Figure 1.

code
p_final<-cowplot::plot_grid(p1,p2,p3,p4,p5,my_legend, ncol=2,
                            align="AUTO")
save(p_final,file="data/p_final.Rdata")
ggplot2::ggsave(p_final, 
                filename = "final_plot.pdf", 
                path = "img/", 
                units = "in", width =11.6, height = 15, 
                dpi = 300, bg = "white",device=cairo_pdf)

References

Austin, Peter C. 2017. “A Tutorial on Multilevel Survival Analysis: Methods, Models and Applications.” International Statistical Review = Revue Internationale de Statistique 85 (2): 185. https://doi.org/10.1111/insr.12214.
Balan, Theodor Adrian, and Hein Putter. 2019. frailtyEM : An R Package for Estimating Semiparametric Shared Frailty Models.” Journal of Statistical Software 90 (7). https://doi.org/10.18637/jss.v090.i07.
Balan, Theodor A., and Hein Putter. 2020. “A tutorial on frailty models.” Statistical Methods in Medical Research 29 (11): 3424–54. https://doi.org/10.1177/0962280220921889.
Elwert, Felix, and Christopher Winship. 2014. “Endogenous Selection Bias: The Problem of Conditioning on a Collider Variable.” Annual Review of Sociology 40 (1): 31–53. https://doi.org/10.1146/annurev-soc-071913-043455.
Hall, Michael J., Julia E. Reid, Lynn A. Burbidge, Dmitry Pruss, Amie M. Deffenbaugh, Cynthia Frye, Richard J. Wenstrup, Brian E. Ward, Thomas A. Scholl, and Walter W. Noll. 2009. “BRCA1 and BRCA2 Mutations in Women of Different Ethnicities Undergoing Testing for Hereditary Breast-Ovarian Cancer.” Cancer 115 (10): 2222. https://doi.org/10.1002/cncr.24200.
Kurian, Allison W., Paul Abrahamse, Allison Furgal, Kevin C. Ward, Ann S. Hamilton, Rachel Hodan, Rachel Tocco, et al. 2023. “Germline Genetic Testing After Cancer Diagnosis.” JAMA 330 (1): 43–51. https://doi.org/10.1001/jama.2023.9526.
Lipsky, Ari M., and Sander Greenland. 2022. “Causal Directed Acyclic Graphs.” JAMA, February. https://doi.org/10.1001/jama.2022.1816.
McElreath, Richard. 2020. Statistical Rethinking: A Bayesian Course with Examples in r and Stan. 2nd ed. CRC Texts in Statistical Science. Boca Raton: Taylor; Francis, CRC Press.
Pearl, Judea. 1995. “Causal Diagrams for Empirical Research.” Biometrika 82 (4): 669–88. https://doi.org/10.2307/2337329.
———. 2010. “An Introduction to Causal Inference.” The International Journal of Biostatistics 6 (2): 7. https://doi.org/10.2202/1557-4679.1203.
Therneau, Terry M., Thomas Lumley, Elizabeth Atkinson, and Crowson Cynthia. 2024. Survival: Survival Analysis. https://cran.r-project.org/web/packages/survival/index.html.