Last updated: 2019-10-30

File Version Author Date Message
Rmd 550176f Davis McCarthy 2019-10-30 Updating analysis to reflect accepted ms

Load libraries and data

Load the Canopy clone inference results and the cell assignment results from cardelino for 32 donor fibroblast cell lines.

reading euts 
reading fawm 
reading feec 
reading fikt 
reading garx 
reading gesg 
reading heja 
reading hipn 
reading ieki 
reading joxm 
reading kuco 
reading laey 
reading lexy 
reading naju 
reading nusw 
reading oaaz 
reading oilg 
reading pipw 
reading puie 
reading qayj 
reading qolg 
reading qonc 
reading rozh 
reading sehl 
reading ualf 
reading vass 
reading vils 
reading vuna 
reading wahn 
reading wetu 
reading xugn 
reading zoxy 
Warning: `data_frame()` is deprecated, use `tibble()`.
This warning is displayed once per session.
# Stack the per-line prevalence tables held in `prev_list` into a single
# data frame with one row per line/clone combination.
df_prev <- do.call("rbind", prev_list)

# Build a plotmath label reporting the R-squared of the weighted regression
# of cardelino prevalence on Canopy prevalence.
#
# df: data frame with columns `prev_cardelino`, `prev_canopy` and
#     `prop_assigned` (used as regression weights).
# Returns a length-one character vector holding the deparsed plotmath
# expression `~italic(r)^2 ~ "=" ~ <r2>`, with r2 formatted to 3 significant
# digits.
lm_eqn <- function(df) {
  m <- lm(prev_cardelino ~ prev_canopy, data = df, weights = prop_assigned)
  # Only r2 appears in the template, so the intercept/slope entries that were
  # previously passed to substitute() (and never used) are dropped.
  eq <- substitute(
    ~italic(r)^2~"="~r2,
    list(r2 = format(summary(m)$r.squared, digits = 3))
  )
  # NOTE(review): the tail of this function was cut off in this copy of the
  # source; returning the deparsed expression is the presumed original
  # ending -- confirm against the original Rmd.
  as.character(as.expression(eq))
}

## Fit weighted regressions
# Every model regresses cardelino prevalence on Canopy prevalence, weighting
# each observation by `prop_assigned`. do() stores the fitted model in a
# list-column named `fit`.

# One fit per clone.
fits <- do(
  group_by(df_prev, clone),
  fit = lm(prev_cardelino ~ prev_canopy, data = ., weights = prop_assigned)
)

# One fit across all rows.
fits_1grp <- do(
  df_prev,
  fit = lm(prev_cardelino ~ prev_canopy, data = ., weights = prop_assigned)
)

# One fit across rows with more than 37 assigned cells.
fits_1grp_filt_n_assigned <- do(
  filter(df_prev, n_assigned > 37),
  fit = lm(prev_cardelino ~ prev_canopy, data = ., weights = prop_assigned)
)

# One fit restricted to the lines listed in `lines_high_tree_agreement`.
lines_high_tree_agreement <- c(
  "euts", "hipn", "joxm", "kuco", "naju", "nusw", "pipw", "rozh", "vuna"
)
fits_1grp_filt_clonal_tree <- do(
  filter(df_prev, line %in% lines_high_tree_agreement),
  fit = lm(prev_cardelino ~ prev_canopy, data = ., weights = prop_assigned)
)

# Lines used for the variant-count filter (per the variable name and the
# accompanying text: lines with at least 100 somatic variants).
# NOTE(review): the end of this vector was truncated in this copy of the
# source. "puie" is restored because it appears in the 12-line summary table
# produced for this filter but is absent from the surviving entries; confirm
# the complete list against the original Rmd.
lines_gt100_vars <- c("joxm", "garx", "wahn", "vass", "ualf", "euts",
                      "laey", "pipw", "oilg", "heja", "sehl", "feec",
                      "gesg", "fikt", "vuna", "qonc", "xugn", "qolg",
                      "puie")

# One weighted fit across rows from those lines that also have more than 37
# assigned cells and more than 90% of cells assigned.
fits_1grp_filt_n_vars <- df_prev %>%
  filter(line %in% lines_gt100_vars, n_assigned > 37, prop_assigned > 0.9) %>%
  do(fit = lm(prev_cardelino ~ prev_canopy, weights = prop_assigned, data = .))

# Fit the weighted regression of cardelino prevalence on Canopy prevalence
# and return its coefficients as a one-row data frame.
#
# dat: data frame with columns `prev_cardelino`, `prev_canopy` and
#      `prop_assigned` (used as regression weights).
# Returns a data frame with one row and columns `x0` (intercept) and `x1`
# (slope on `prev_canopy`).
le_lin_fit <- function(dat) {
  the_fit <- lm(prev_cardelino ~ prev_canopy, data = dat,
                weights = prop_assigned)
  # coef() is a named vector; transposing gives a 1 x 2 matrix so that
  # data.frame() yields one row rather than two.
  setNames(data.frame(t(coef(the_fit))), c("x0", "x1"))
}

# Intercept (x0) and slope (x1) of the weighted regression, per clone and
# across all rows.
# NOTE(review): the final step of both pipelines was truncated in this copy
# of the source, leaving a dangling `%>%`. Applying le_lin_fit() through do()
# is the presumed original -- confirm against the original Rmd.
fits_me <- df_prev %>%
  group_by(clone) %>%
  do(le_lin_fit(.))

fits_me_1grp <- df_prev %>%
  do(le_lin_fit(.))


lm(formula = prev_cardelino ~ prev_canopy, data = ., weights = prop_assigned)

Weighted Residuals:
     Min       1Q   Median       3Q      Max 
-0.35739 -0.09291 -0.02120  0.08594  0.48897 

            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.13862    0.02470   5.612 2.01e-07 ***
prev_canopy  0.59119    0.05731  10.316  < 2e-16 ***
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1385 on 94 degrees of freedom
Multiple R-squared:  0.531, Adjusted R-squared:  0.526 
F-statistic: 106.4 on 1 and 94 DF,  p-value: < 2.2e-16

lm(formula = prev_cardelino ~ prev_canopy, data = .)

    Min      1Q  Median      3Q     Max 
-0.3623 -0.1063 -0.0144  0.0952  0.5147 

            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.12545    0.02652   4.730 7.88e-06 ***
prev_canopy  0.62364    0.06243   9.989  < 2e-16 ***
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1611 on 94 degrees of freedom
Multiple R-squared:  0.5149,    Adjusted R-squared:  0.5097 
F-statistic: 99.77 on 1 and 94 DF,  p-value: < 2.2e-16

lm(formula = prev_cardelino ~ prev_canopy, data = ., weights = prop_assigned)

Weighted Residuals:
     Min       1Q   Median       3Q      Max 
-0.21085 -0.09442 -0.02174  0.07887  0.25286 

            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.13316    0.02323   5.731 2.69e-07 ***
prev_canopy  0.60950    0.05265  11.576  < 2e-16 ***
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1156 on 66 degrees of freedom
Multiple R-squared:   0.67, Adjusted R-squared:  0.665 
F-statistic:   134 on 1 and 66 DF,  p-value: < 2.2e-16

lm(formula = prev_cardelino ~ prev_canopy, data = ., weights = prop_assigned)

Weighted Residuals:
     Min       1Q   Median       3Q      Max 
-0.20840 -0.06909 -0.01078  0.07378  0.22980 

            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.08087    0.03943   2.051   0.0519 .  
prev_canopy  0.78076    0.08538   9.145 4.02e-09 ***
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1104 on 23 degrees of freedom
Multiple R-squared:  0.7843,    Adjusted R-squared:  0.7749 
F-statistic: 83.63 on 1 and 23 DF,  p-value: 4.017e-09

lm(formula = prev_cardelino ~ prev_canopy, data = ., weights = prop_assigned)

Weighted Residuals:
     Min       1Q   Median       3Q      Max 
-0.16384 -0.08919 -0.02214  0.08082  0.25590 

            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.14736    0.03191   4.617 5.67e-05 ***
prev_canopy  0.57032    0.07292   7.821 5.16e-09 ***
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1167 on 33 degrees of freedom
Multiple R-squared:  0.6496,    Adjusted R-squared:  0.6389 
F-statistic: 61.17 on 1 and 33 DF,  p-value: 5.162e-09
# A tibble: 12 x 4
   line   corr n_assigned prop_assigned
   <chr> <dbl>      <dbl>         <dbl>
 1 oilg  0.465         89         0.989
 2 garx  0.522         70         1    
 3 pipw  0.765        107         1    
 4 heja  0.857         50         1    
 5 laey  0.944         55         1    
 6 joxm  0.956         78         0.987
 7 ualf  0.968         89         1    
 8 puie  0.972         41         1    
 9 wahn  0.979         78         0.951
10 euts  0.988         79         1    
11 gesg  0.992        100         0.952
12 vuna  1             71         1    

Plot clone prevalences

Plot the estimated clone fractions from the cells assigned to a clone by cardelino against the estimated clone fractions from Canopy.

Joining, by = c("prev_cardelino", "prev_canopy")

Add a label for the joxm line to this plot.

Joining, by = c("prev_cardelino", "prev_canopy")

We can create the same plot but just using the 75% of cell lines with at least 37 cells assigned.

Joining, by = c("prev_cardelino", "prev_canopy")

If we filter to look at 9 lines with <15% of variants rearranged in the cardelino clonal tree and at least 37 assigned cells, then we see better agreement again.

Joining, by = c("prev_cardelino", "prev_canopy")

If we look at the 12 lines with at least 100 somatic variants, more than 37 assigned cells and more than 90% of cells assigned by cardelino, we also see higher concordance between cardelino assignment fraction and Canopy prevalence.

Joining, by = c("prev_cardelino", "prev_canopy")

We can also look at the same first plot as above, but now faceted by the different clones.

Joining, by = c("clone", "prev_cardelino", "prev_canopy")

Since there are so few lines with four clones, we can also make a version of the figure above with just clone1, clone2 and clone3, and fit a weighted regression line, with points weighted by the fraction of cells assigned for the line.

Joining, by = c("clone", "prev_cardelino", "prev_canopy")

Let us also make a version of the plot above with the line joxm highlighted as this line is used as an example in the paper.

Joining, by = c("clone", "prev_cardelino", "prev_canopy")

Also look at what happens if we filter out lines that have fewer than 75% of cells assigned (25 lines).

